From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Tue, 4 Aug 2015 12:17:53 -0700 Subject: Add the rt linux 4.1.3-rt3 as base Import the rt linux 4.1.3-rt3 as OPNFV kvm base. It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and the base is: commit 0917f823c59692d751951bf5ea699a2d1e2f26a2 Author: Sebastian Andrzej Siewior Date: Sat Jul 25 12:13:34 2015 +0200 Prepare v4.1.3-rt3 Signed-off-by: Sebastian Andrzej Siewior We lose all the git history this way and it's not good. We should apply another opnfv project repo in future. Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423 Signed-off-by: Yunhong Jiang --- kernel/mm/Kconfig | 637 ++++ kernel/mm/Kconfig.debug | 30 + kernel/mm/Makefile | 80 + kernel/mm/backing-dev.c | 601 ++++ kernel/mm/balloon_compaction.c | 221 ++ kernel/mm/bootmem.c | 861 ++++++ kernel/mm/cleancache.c | 319 ++ kernel/mm/cma.c | 456 +++ kernel/mm/cma.h | 24 + kernel/mm/cma_debug.c | 205 ++ kernel/mm/compaction.c | 1719 +++++++++++ kernel/mm/debug-pagealloc.c | 137 + kernel/mm/debug.c | 240 ++ kernel/mm/dmapool.c | 529 ++++ kernel/mm/early_ioremap.c | 245 ++ kernel/mm/fadvise.c | 162 + kernel/mm/failslab.c | 60 + kernel/mm/filemap.c | 2670 ++++++++++++++++ kernel/mm/frontswap.c | 457 +++ kernel/mm/gup.c | 1379 +++++++++ kernel/mm/highmem.c | 490 +++ kernel/mm/huge_memory.c | 3011 ++++++++++++++++++ kernel/mm/hugetlb.c | 3957 ++++++++++++++++++++++++ kernel/mm/hugetlb_cgroup.c | 422 +++ kernel/mm/hwpoison-inject.c | 144 + kernel/mm/init-mm.c | 25 + kernel/mm/internal.h | 436 +++ kernel/mm/interval_tree.c | 112 + kernel/mm/kasan/Makefile | 8 + kernel/mm/kasan/kasan.c | 537 ++++ kernel/mm/kasan/kasan.h | 75 + kernel/mm/kasan/report.c | 269 ++ kernel/mm/kmemcheck.c | 123 + kernel/mm/kmemleak-test.c | 111 + kernel/mm/kmemleak.c | 1941 ++++++++++++ kernel/mm/ksm.c | 2341 ++++++++++++++ kernel/mm/list_lru.c | 561 ++++ kernel/mm/maccess.c | 62 + kernel/mm/madvise.c | 
549 ++++ kernel/mm/memblock.c | 1605 ++++++++++ kernel/mm/memcontrol.c | 5933 +++++++++++++++++++++++++++++++++++ kernel/mm/memory-failure.c | 1790 +++++++++++ kernel/mm/memory.c | 3857 +++++++++++++++++++++++ kernel/mm/memory_hotplug.c | 2015 ++++++++++++ kernel/mm/mempolicy.c | 2831 +++++++++++++++++ kernel/mm/mempool.c | 494 +++ kernel/mm/memtest.c | 118 + kernel/mm/migrate.c | 1855 +++++++++++ kernel/mm/mincore.c | 269 ++ kernel/mm/mlock.c | 758 +++++ kernel/mm/mm_init.c | 205 ++ kernel/mm/mmap.c | 3396 ++++++++++++++++++++ kernel/mm/mmu_context.c | 64 + kernel/mm/mmu_notifier.c | 396 +++ kernel/mm/mmzone.c | 114 + kernel/mm/mprotect.c | 433 +++ kernel/mm/mremap.c | 584 ++++ kernel/mm/msync.c | 107 + kernel/mm/nobootmem.c | 438 +++ kernel/mm/nommu.c | 2181 +++++++++++++ kernel/mm/oom_kill.c | 832 +++++ kernel/mm/page-writeback.c | 2436 +++++++++++++++ kernel/mm/page_alloc.c | 6695 ++++++++++++++++++++++++++++++++++++++++ kernel/mm/page_counter.c | 193 ++ kernel/mm/page_ext.c | 403 +++ kernel/mm/page_io.c | 381 +++ kernel/mm/page_isolation.c | 314 ++ kernel/mm/page_owner.c | 313 ++ kernel/mm/pagewalk.c | 304 ++ kernel/mm/percpu-km.c | 110 + kernel/mm/percpu-vm.c | 366 +++ kernel/mm/percpu.c | 2295 ++++++++++++++ kernel/mm/pgtable-generic.c | 200 ++ kernel/mm/process_vm_access.c | 365 +++ kernel/mm/quicklist.c | 102 + kernel/mm/readahead.c | 580 ++++ kernel/mm/rmap.c | 1599 ++++++++++ kernel/mm/shmem.c | 3458 +++++++++++++++++++++ kernel/mm/slab.c | 4240 +++++++++++++++++++++++++ kernel/mm/slab.h | 384 +++ kernel/mm/slab_common.c | 1171 +++++++ kernel/mm/slob.c | 641 ++++ kernel/mm/slub.c | 5400 ++++++++++++++++++++++++++++++++ kernel/mm/sparse-vmemmap.c | 235 ++ kernel/mm/sparse.c | 811 +++++ kernel/mm/swap.c | 1168 +++++++ kernel/mm/swap_cgroup.c | 208 ++ kernel/mm/swap_state.c | 486 +++ kernel/mm/swapfile.c | 2940 ++++++++++++++++++ kernel/mm/truncate.c | 800 +++++ kernel/mm/util.c | 465 +++ kernel/mm/vmacache.c | 134 + kernel/mm/vmalloc.c | 2742 
++++++++++++++++ kernel/mm/vmpressure.c | 382 +++ kernel/mm/vmscan.c | 3828 +++++++++++++++++++++++ kernel/mm/vmstat.c | 1705 ++++++++++ kernel/mm/workingset.c | 416 +++ kernel/mm/zbud.c | 624 ++++ kernel/mm/zpool.c | 366 +++ kernel/mm/zsmalloc.c | 1947 ++++++++++++ kernel/mm/zswap.c | 955 ++++++ 101 files changed, 108643 insertions(+) create mode 100644 kernel/mm/Kconfig create mode 100644 kernel/mm/Kconfig.debug create mode 100644 kernel/mm/Makefile create mode 100644 kernel/mm/backing-dev.c create mode 100644 kernel/mm/balloon_compaction.c create mode 100644 kernel/mm/bootmem.c create mode 100644 kernel/mm/cleancache.c create mode 100644 kernel/mm/cma.c create mode 100644 kernel/mm/cma.h create mode 100644 kernel/mm/cma_debug.c create mode 100644 kernel/mm/compaction.c create mode 100644 kernel/mm/debug-pagealloc.c create mode 100644 kernel/mm/debug.c create mode 100644 kernel/mm/dmapool.c create mode 100644 kernel/mm/early_ioremap.c create mode 100644 kernel/mm/fadvise.c create mode 100644 kernel/mm/failslab.c create mode 100644 kernel/mm/filemap.c create mode 100644 kernel/mm/frontswap.c create mode 100644 kernel/mm/gup.c create mode 100644 kernel/mm/highmem.c create mode 100644 kernel/mm/huge_memory.c create mode 100644 kernel/mm/hugetlb.c create mode 100644 kernel/mm/hugetlb_cgroup.c create mode 100644 kernel/mm/hwpoison-inject.c create mode 100644 kernel/mm/init-mm.c create mode 100644 kernel/mm/internal.h create mode 100644 kernel/mm/interval_tree.c create mode 100644 kernel/mm/kasan/Makefile create mode 100644 kernel/mm/kasan/kasan.c create mode 100644 kernel/mm/kasan/kasan.h create mode 100644 kernel/mm/kasan/report.c create mode 100644 kernel/mm/kmemcheck.c create mode 100644 kernel/mm/kmemleak-test.c create mode 100644 kernel/mm/kmemleak.c create mode 100644 kernel/mm/ksm.c create mode 100644 kernel/mm/list_lru.c create mode 100644 kernel/mm/maccess.c create mode 100644 kernel/mm/madvise.c create mode 100644 kernel/mm/memblock.c create mode 100644 
kernel/mm/memcontrol.c create mode 100644 kernel/mm/memory-failure.c create mode 100644 kernel/mm/memory.c create mode 100644 kernel/mm/memory_hotplug.c create mode 100644 kernel/mm/mempolicy.c create mode 100644 kernel/mm/mempool.c create mode 100644 kernel/mm/memtest.c create mode 100644 kernel/mm/migrate.c create mode 100644 kernel/mm/mincore.c create mode 100644 kernel/mm/mlock.c create mode 100644 kernel/mm/mm_init.c create mode 100644 kernel/mm/mmap.c create mode 100644 kernel/mm/mmu_context.c create mode 100644 kernel/mm/mmu_notifier.c create mode 100644 kernel/mm/mmzone.c create mode 100644 kernel/mm/mprotect.c create mode 100644 kernel/mm/mremap.c create mode 100644 kernel/mm/msync.c create mode 100644 kernel/mm/nobootmem.c create mode 100644 kernel/mm/nommu.c create mode 100644 kernel/mm/oom_kill.c create mode 100644 kernel/mm/page-writeback.c create mode 100644 kernel/mm/page_alloc.c create mode 100644 kernel/mm/page_counter.c create mode 100644 kernel/mm/page_ext.c create mode 100644 kernel/mm/page_io.c create mode 100644 kernel/mm/page_isolation.c create mode 100644 kernel/mm/page_owner.c create mode 100644 kernel/mm/pagewalk.c create mode 100644 kernel/mm/percpu-km.c create mode 100644 kernel/mm/percpu-vm.c create mode 100644 kernel/mm/percpu.c create mode 100644 kernel/mm/pgtable-generic.c create mode 100644 kernel/mm/process_vm_access.c create mode 100644 kernel/mm/quicklist.c create mode 100644 kernel/mm/readahead.c create mode 100644 kernel/mm/rmap.c create mode 100644 kernel/mm/shmem.c create mode 100644 kernel/mm/slab.c create mode 100644 kernel/mm/slab.h create mode 100644 kernel/mm/slab_common.c create mode 100644 kernel/mm/slob.c create mode 100644 kernel/mm/slub.c create mode 100644 kernel/mm/sparse-vmemmap.c create mode 100644 kernel/mm/sparse.c create mode 100644 kernel/mm/swap.c create mode 100644 kernel/mm/swap_cgroup.c create mode 100644 kernel/mm/swap_state.c create mode 100644 kernel/mm/swapfile.c create mode 100644 
kernel/mm/truncate.c create mode 100644 kernel/mm/util.c create mode 100644 kernel/mm/vmacache.c create mode 100644 kernel/mm/vmalloc.c create mode 100644 kernel/mm/vmpressure.c create mode 100644 kernel/mm/vmscan.c create mode 100644 kernel/mm/vmstat.c create mode 100644 kernel/mm/workingset.c create mode 100644 kernel/mm/zbud.c create mode 100644 kernel/mm/zpool.c create mode 100644 kernel/mm/zsmalloc.c create mode 100644 kernel/mm/zswap.c (limited to 'kernel/mm') diff --git a/kernel/mm/Kconfig b/kernel/mm/Kconfig new file mode 100644 index 000000000..0cc453705 --- /dev/null +++ b/kernel/mm/Kconfig @@ -0,0 +1,637 @@ +config SELECT_MEMORY_MODEL + def_bool y + depends on ARCH_SELECT_MEMORY_MODEL + +choice + prompt "Memory model" + depends on SELECT_MEMORY_MODEL + default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT + default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT + default FLATMEM_MANUAL + +config FLATMEM_MANUAL + bool "Flat Memory" + depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE + help + This option allows you to change some of the ways that + Linux manages its memory internally. Most users will + only have one option here: FLATMEM. This is normal + and a correct option. + + Some users of more advanced features like NUMA and + memory hotplug may have different options here. + DISCONTIGMEM is a more mature, better tested system, + but is incompatible with memory hotplug and may suffer + decreased performance over SPARSEMEM. If unsure between + "Sparse Memory" and "Discontiguous Memory", choose + "Discontiguous Memory". + + If unsure, choose this option (Flat Memory) over any other. + +config DISCONTIGMEM_MANUAL + bool "Discontiguous Memory" + depends on ARCH_DISCONTIGMEM_ENABLE + help + This option provides enhanced support for discontiguous + memory systems, over FLATMEM. These systems have holes + in their physical address spaces, and this option provides + more efficient handling of these holes. 
However, the vast + majority of hardware has quite flat address spaces, and + can have degraded performance from the extra overhead that + this option imposes. + + Many NUMA configurations will have this as the only option. + + If unsure, choose "Flat Memory" over this option. + +config SPARSEMEM_MANUAL + bool "Sparse Memory" + depends on ARCH_SPARSEMEM_ENABLE + help + This will be the only option for some systems, including + memory hotplug systems. This is normal. + + For many other systems, this will be an alternative to + "Discontiguous Memory". This option provides some potential + performance benefits, along with decreased code complexity, + but it is newer, and more experimental. + + If unsure, choose "Discontiguous Memory" or "Flat Memory" + over this option. + +endchoice + +config DISCONTIGMEM + def_bool y + depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL + +config SPARSEMEM + def_bool y + depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL + +config FLATMEM + def_bool y + depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL + +config FLAT_NODE_MEM_MAP + def_bool y + depends on !SPARSEMEM + +# +# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's +# to represent different areas of memory. This variable allows +# those dependencies to exist individually. +# +config NEED_MULTIPLE_NODES + def_bool y + depends on DISCONTIGMEM || NUMA + +config HAVE_MEMORY_PRESENT + def_bool y + depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM + +# +# SPARSEMEM_EXTREME (which is the default) does some bootmem +# allocations when memory_present() is called. If this cannot +# be done on your architecture, select this option. However, +# statically allocating the mem_section[] array can potentially +# consume vast quantities of .bss, so be careful. +# +# This option will also potentially produce smaller runtime code +# with gcc 3.4 and later. 
+# +config SPARSEMEM_STATIC + bool + +# +# Architecture platforms which require a two level mem_section in SPARSEMEM +# must select this option. This is usually for architecture platforms with +# an extremely sparse physical address space. +# +config SPARSEMEM_EXTREME + def_bool y + depends on SPARSEMEM && !SPARSEMEM_STATIC + +config SPARSEMEM_VMEMMAP_ENABLE + bool + +config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + def_bool y + depends on SPARSEMEM && X86_64 + +config SPARSEMEM_VMEMMAP + bool "Sparse Memory virtual memmap" + depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE + default y + help + SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise + pfn_to_page and page_to_pfn operations. This is the most + efficient option when sufficient kernel resources are available. + +config HAVE_MEMBLOCK + bool + +config HAVE_MEMBLOCK_NODE_MAP + bool + +config HAVE_MEMBLOCK_PHYS_MAP + bool + +config HAVE_GENERIC_RCU_GUP + bool + +config ARCH_DISCARD_MEMBLOCK + bool + +config NO_BOOTMEM + bool + +config MEMORY_ISOLATION + bool + +config MOVABLE_NODE + bool "Enable to assign a node which has only movable memory" + depends on HAVE_MEMBLOCK + depends on NO_BOOTMEM + depends on X86_64 + depends on NUMA + default n + help + Allow a node to have only movable memory. Pages used by the kernel, + such as direct mapping pages cannot be migrated. So the corresponding + memory device cannot be hotplugged. This option allows the following + two things: + - When the system is booting, node full of hotpluggable memory can + be arranged to have only movable memory so that the whole node can + be hot-removed. (need movable_node boot option specified). + - After the system is up, the option allows users to online all the + memory of a node as movable memory so that the whole node can be + hot-removed. + + Users who don't use the memory hotplug feature are fine with this + option on since they don't specify movable_node boot option or they + don't online memory as movable. 
+ + Say Y here if you want to hotplug a whole node. + Say N here if you want kernel to use memory on all nodes evenly. + +# +# Only be set on architectures that have completely implemented memory hotplug +# feature. If you are not sure, don't touch it. +# +config HAVE_BOOTMEM_INFO_NODE + def_bool n + +# eventually, we can have this option just 'select SPARSEMEM' +config MEMORY_HOTPLUG + bool "Allow for memory hot-add" + depends on SPARSEMEM || X86_64_ACPI_NUMA + depends on ARCH_ENABLE_MEMORY_HOTPLUG + depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) + +config MEMORY_HOTPLUG_SPARSE + def_bool y + depends on SPARSEMEM && MEMORY_HOTPLUG + +config MEMORY_HOTREMOVE + bool "Allow for memory hot remove" + select MEMORY_ISOLATION + select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) + depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE + depends on MIGRATION + +# +# If we have space for more page flags then we can enable additional +# optimizations and functionality. +# +# Regular Sparsemem takes page flag bits for the sectionid if it does not +# use a virtual memmap. Disable extended page flags for 32 bit platforms +# that require the use of a sectionid in the page flags. +# +config PAGEFLAGS_EXTENDED + def_bool y + depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM + +# Heavily threaded applications may benefit from splitting the mm-wide +# page_table_lock, so that faults on different parts of the user address +# space can be handled with less contention: split it at this NR_CPUS. +# Default to 4 for wider testing, though 8 might be more appropriate. +# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. +# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. +# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. 
+# +config SPLIT_PTLOCK_CPUS + int + default "999999" if !MMU + default "999999" if ARM && !CPU_CACHE_VIPT + default "999999" if PARISC && !PA20 + default "4" + +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + bool + +# +# support for memory balloon +config MEMORY_BALLOON + bool + +# +# support for memory balloon compaction +config BALLOON_COMPACTION + bool "Allow for balloon memory compaction/migration" + def_bool y + depends on COMPACTION && MEMORY_BALLOON + help + Memory fragmentation introduced by ballooning might reduce + significantly the number of 2MB contiguous memory blocks that can be + used within a guest, thus imposing performance penalties associated + with the reduced number of transparent huge pages that could be used + by the guest workload. Allowing the compaction & migration for memory + pages enlisted as being part of memory balloon devices avoids the + scenario aforementioned and helps improving memory defragmentation. + +# +# support for memory compaction +config COMPACTION + bool "Allow for memory compaction" + def_bool y + select MIGRATION + depends on MMU + help + Allows the compaction of memory for the allocation of huge pages. + +# +# support for page migration +# +config MIGRATION + bool "Page migration" + def_bool y + depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU + help + Allows the migration of the physical location of pages of processes + while the virtual addresses are not changed. This is useful in + two situations. The first is on NUMA systems to put pages nearer + to the processors accessing. The second is when allocating huge + pages as migration can relocate pages to satisfy a huge page + allocation instead of reclaiming. 
+ +config ARCH_ENABLE_HUGEPAGE_MIGRATION + bool + +config PHYS_ADDR_T_64BIT + def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT + +config ZONE_DMA_FLAG + int + default "0" if !ZONE_DMA + default "1" + +config BOUNCE + bool "Enable bounce buffers" + default y + depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) + help + Enable bounce buffers for devices that cannot access + the full range of memory available to the CPU. Enabled + by default when ZONE_DMA or HIGHMEM is selected, but you + may say n to override this. + +# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often +# have more than 4GB of memory, but we don't currently use the IOTLB to present +# a 32-bit address to OHCI. So we need to use a bounce pool instead. +# +# We also use the bounce pool to provide stable page writes for jbd. jbd +# initiates buffer writeback without locking the page or setting PG_writeback, +# and fixing that behavior (a second time; jbd2 doesn't have this problem) is +# a major rework effort. Instead, use the bounce buffer to snapshot pages +# (until jbd goes away). The only jbd user is ext3. +config NEED_BOUNCE_POOL + bool + default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD) + +config NR_QUICK + int + depends on QUICKLIST + default "2" if AVR32 + default "1" + +config VIRT_TO_BUS + bool + help + An architecture should select this if it implements the + deprecated interface virt_to_bus(). All new architectures + should probably not select this. + + +config MMU_NOTIFIER + bool + select SRCU + +config KSM + bool "Enable KSM for page merging" + depends on MMU + help + Enable Kernel Samepage Merging: KSM periodically scans those areas + of an application's address space that an app has advised may be + mergeable. When it finds pages of identical content, it replaces + the many instances by a single page with that content, so + saving memory until one or another app needs to modify the content. 
+ Recommended for use with KVM, or with other duplicative applications. + See Documentation/vm/ksm.txt for more information: KSM is inactive + until a program has madvised that an area is MADV_MERGEABLE, and + root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). + +config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + depends on MMU + default 4096 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages + can help reduce the impact of kernel NULL pointer bugs. + + For most ia64, ppc64 and x86 users with lots of address space + a value of 65536 is reasonable and should cause no problems. + On arm and other archs it should not be higher than 32768. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. + + This value can be changed after boot using the + /proc/sys/vm/mmap_min_addr tunable. + +config ARCH_SUPPORTS_MEMORY_FAILURE + bool + +config MEMORY_FAILURE + depends on MMU + depends on ARCH_SUPPORTS_MEMORY_FAILURE + bool "Enable recovery from hardware memory errors" + select MEMORY_ISOLATION + help + Enables code to recover from some memory failures on systems + with MCA recovery. This allows a system to continue running + even when some of its memory has uncorrected errors. This requires + special hardware support and typically ECC memory. 
+ +config HWPOISON_INJECT + tristate "HWPoison pages injector" + depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS + select PROC_PAGE_MONITOR + +config NOMMU_INITIAL_TRIM_EXCESS + int "Turn on mmap() excess space trimming before booting" + depends on !MMU + default 1 + help + The NOMMU mmap() frequently needs to allocate large contiguous chunks + of memory on which to store mappings, but it can only ask the system + allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently + more than it requires. To deal with this, mmap() is able to trim off + the excess and return it to the allocator. + + If trimming is enabled, the excess is trimmed off and returned to the + system allocator, which can cause extra fragmentation, particularly + if there are a lot of transient processes. + + If trimming is disabled, the excess is kept, but not used, which for + long-term mappings means that the space is wasted. + + Trimming can be dynamically controlled through a sysctl option + (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of + excess pages there must be before trimming should occur, or zero if + no trimming is to occur. + + This option specifies the initial value of this option. The default + of 1 says that all excess pages should be trimmed. + + See Documentation/nommu-mmap.txt for more information. + +config TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL + select COMPACTION + help + Transparent Hugepages allows the kernel to use huge pages and + huge tlb transparently to the applications whenever possible. + This feature can improve computing performance to certain + applications by speeding up page faults during memory + allocation, by reducing the number of tlb misses and by speeding + up the pagetable walking. + + If memory constrained on embedded, you may want to say N. 
+ +choice + prompt "Transparent Hugepage Support sysfs defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_ALWAYS + help + Selects the sysfs defaults for Transparent Hugepage Support. + + config TRANSPARENT_HUGEPAGE_ALWAYS + bool "always" + help + Enabling Transparent Hugepage always, can increase the + memory footprint of applications without a guaranteed + benefit but it will work automatically for all applications. + + config TRANSPARENT_HUGEPAGE_MADVISE + bool "madvise" + help + Enabling Transparent Hugepage madvise, will only provide a + performance improvement benefit to the applications using + madvise(MADV_HUGEPAGE) but it won't risk to increase the + memory footprint of applications without a guaranteed + benefit. +endchoice + +# +# UP and nommu archs use km based percpu allocator +# +config NEED_PER_CPU_KM + depends on !SMP + bool + default y + +config CLEANCACHE + bool "Enable cleancache driver to cache clean pages if tmem is present" + default n + help + Cleancache can be thought of as a page-granularity victim cache + for clean pages that the kernel's pageframe replacement algorithm + (PFRA) would like to keep around, but can't since there isn't enough + memory. So when the PFRA "evicts" a page, it first attempts to use + cleancache code to put the data contained in that page into + "transcendent memory", memory that is not directly accessible or + addressable by the kernel and is of unknown and possibly + time-varying size. And when a cleancache-enabled + filesystem wishes to access a page in a file on disk, it first + checks cleancache to see if it already contains it; if it does, + the page is copied into the kernel and a disk access is avoided. + When a transcendent memory driver is available (such as zcache or + Xen transcendent memory), a significant I/O reduction + may be achieved. 
When none is available, all cleancache calls + are reduced to a single pointer-compare-against-NULL resulting + in a negligible performance hit. + + If unsure, say Y to enable cleancache + +config FRONTSWAP + bool "Enable frontswap to cache swap pages if tmem is present" + depends on SWAP + default n + help + Frontswap is so named because it can be thought of as the opposite + of a "backing" store for a swap device. The data is stored into + "transcendent memory", memory that is not directly accessible or + addressable by the kernel and is of unknown and possibly + time-varying size. When space in transcendent memory is available, + a significant swap I/O reduction may be achieved. When none is + available, all frontswap calls are reduced to a single pointer- + compare-against-NULL resulting in a negligible performance hit + and swap data is stored as normal on the matching swap device. + + If unsure, say Y to enable frontswap. + +config CMA + bool "Contiguous Memory Allocator" + depends on HAVE_MEMBLOCK && MMU + select MIGRATION + select MEMORY_ISOLATION + help + This enables the Contiguous Memory Allocator which allows other + subsystems to allocate big physically-contiguous blocks of memory. + CMA reserves a region of memory and allows only movable pages to + be allocated from it. This way, the kernel can use the memory for + pagecache and when a subsystem requests for contiguous area, the + allocated pages are migrated away to serve the contiguous request. + + If unsure, say "n". + +config CMA_DEBUG + bool "CMA debug messages (DEVELOPMENT)" + depends on DEBUG_KERNEL && CMA + help + Turns on debug messages in CMA. This produces KERN_DEBUG + messages for every CMA call as well as various messages while + processing calls such as dma_alloc_from_contiguous(). + This option does not affect warning and error messages. + +config CMA_DEBUGFS + bool "CMA debugfs interface" + depends on CMA && DEBUG_FS + help + Turns on the DebugFS interface for CMA. 
+ +config CMA_AREAS + int "Maximum count of the CMA areas" + depends on CMA + default 7 + help + CMA allows to create CMA areas for particular purpose, mainly, + used as device private area. This parameter sets the maximum + number of CMA area in the system. + + If unsure, leave the default value "7". + +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit is set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hand. + + See Documentation/vm/soft-dirty.txt for more details. + +config ZSWAP + bool "Compressed cache for swap pages (EXPERIMENTAL)" + depends on FRONTSWAP && CRYPTO=y + select CRYPTO_LZO + select ZPOOL + default n + help + A lightweight compressed cache for swap pages. It takes + pages that are in the process of being swapped out and attempts to + compress them into a dynamically allocated RAM-based memory pool. + This can result in a significant I/O reduction on swap device and, + in the case where decompressing from RAM is faster than swap device + reads, can also improve workload performance. + + This is marked experimental because it is a new feature (as of + v3.11) that interacts heavily with memory reclaim. While these + interactions don't cause any known issues on simple memory setups, + they have not been fully explored on the large set of potential + configurations and workloads that exist. + +config ZPOOL + tristate "Common API for compressed memory storage" + default n + help + Compressed memory storage API. This allows using either zbud or + zsmalloc. + +config ZBUD + tristate "Low density storage for compressed pages" + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page.
While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. + +config ZSMALLOC + tristate "Memory allocator for compressed pages" + depends on MMU + default n + help + zsmalloc is a slab-based memory allocator designed to store + compressed RAM pages. zsmalloc uses virtual memory mapping + in order to reduce fragmentation. However, this results in a + non-standard allocator interface where a handle, not a pointer, is + returned by an alloc(). This handle must be mapped in order to + access the allocated space. + +config PGTABLE_MAPPING + bool "Use page table mapping to access object in zsmalloc" + depends on ZSMALLOC + help + By default, zsmalloc uses a copy-based object mapping method to + access allocations that span two pages. However, if a particular + architecture (ex, ARM) performs VM mapping faster than copying, + then you should select this. This causes zsmalloc to use page table + mapping rather than copying for object mapping. + + You can check speed with zsmalloc benchmark: + https://github.com/spartacus06/zsmapbench + +config ZSMALLOC_STAT + bool "Export zsmalloc statistics" + depends on ZSMALLOC + select DEBUG_FS + help + This option enables code in the zsmalloc to collect various + statistics about what's happening in zsmalloc and exports that + information to userspace via debugfs. + If unsure, say N. + +config GENERIC_EARLY_IOREMAP + bool + +config MAX_STACK_SIZE_MB + int "Maximum user stack size for 32-bit processes (MB)" + default 80 + range 8 256 if METAG + range 8 2048 + depends on STACK_GROWSUP && (!64BIT || COMPAT) + help + This is the maximum stack size in Megabytes in the VM layout of 32-bit + user processes when the stack grows upwards (currently only on parisc + and metag arch).
The stack will be located at the highest memory + address minus the given value, unless the RLIMIT_STACK hard limit is + changed to a smaller value in which case that is used. + + A sane initial value is 80 MB. diff --git a/kernel/mm/Kconfig.debug b/kernel/mm/Kconfig.debug new file mode 100644 index 000000000..957d3da53 --- /dev/null +++ b/kernel/mm/Kconfig.debug @@ -0,0 +1,30 @@ +config PAGE_EXTENSION + bool "Extend memmap on extra space for more information on page" + ---help--- + Extend memmap on extra space for more information on page. This + could be used for debugging features that need to insert extra + field for every page. This extension enables us to save memory + by not allocating this extra memory according to boottime + configuration. + +config DEBUG_PAGEALLOC + bool "Debug page memory allocations" + depends on DEBUG_KERNEL + depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC + depends on !KMEMCHECK + select PAGE_EXTENSION + select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC + ---help--- + Unmap pages from the kernel linear mapping after free_pages(). + This results in a large slowdown, but helps to find certain types + of memory corruption. + + For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, + fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages(). Additionally, + this option cannot be enabled in combination with hibernation as + that would result in incorrect warnings of memory corruption after + a resume because free pages are not saved to the suspend image. + +config PAGE_POISONING + bool diff --git a/kernel/mm/Makefile b/kernel/mm/Makefile new file mode 100644 index 000000000..98c4eaeab --- /dev/null +++ b/kernel/mm/Makefile @@ -0,0 +1,80 @@ +# +# Makefile for the linux memory manager. 
+# + +KASAN_SANITIZE_slab_common.o := n +KASAN_SANITIZE_slub.o := n + +mmu-y := nommu.o +mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ + mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ + vmalloc.o pagewalk.o pgtable-generic.o + +ifdef CONFIG_CROSS_MEMORY_ATTACH +mmu-$(CONFIG_MMU) += process_vm_access.o +endif + +obj-y := filemap.o mempool.o oom_kill.o \ + maccess.o page_alloc.o page-writeback.o \ + readahead.o swap.o truncate.o vmscan.o shmem.o \ + util.o mmzone.o vmstat.o backing-dev.o \ + mm_init.o mmu_context.o percpu.o slab_common.o \ + compaction.o vmacache.o \ + interval_tree.o list_lru.o workingset.o \ + debug.o $(mmu-y) + +obj-y += init-mm.o + +ifdef CONFIG_NO_BOOTMEM + obj-y += nobootmem.o +else + obj-y += bootmem.o +endif + +obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o +ifdef CONFIG_MMU + obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o +endif +obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o + +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_FRONTSWAP) += frontswap.o +obj-$(CONFIG_ZSWAP) += zswap.o +obj-$(CONFIG_HAS_DMA) += dmapool.o +obj-$(CONFIG_HUGETLBFS) += hugetlb.o +obj-$(CONFIG_NUMA) += mempolicy.o +obj-$(CONFIG_SPARSEMEM) += sparse.o +obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o +obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_KSM) += ksm.o +obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o +obj-$(CONFIG_SLAB) += slab.o +obj-$(CONFIG_SLUB) += slub.o +obj-$(CONFIG_KMEMCHECK) += kmemcheck.o +obj-$(CONFIG_KASAN) += kasan/ +obj-$(CONFIG_FAILSLAB) += failslab.o +obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o +obj-$(CONFIG_MEMTEST) += memtest.o +obj-$(CONFIG_MIGRATION) += migrate.o +obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o +obj-$(CONFIG_PAGE_COUNTER) += page_counter.o +obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o +obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o +obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o 
+obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o +obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o +obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o +obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o +obj-$(CONFIG_PAGE_OWNER) += page_owner.o +obj-$(CONFIG_CLEANCACHE) += cleancache.o +obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZPOOL) += zpool.o +obj-$(CONFIG_ZBUD) += zbud.o +obj-$(CONFIG_ZSMALLOC) += zsmalloc.o +obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o +obj-$(CONFIG_CMA) += cma.o +obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o +obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o +obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o diff --git a/kernel/mm/backing-dev.c b/kernel/mm/backing-dev.c new file mode 100644 index 000000000..000e7b3b9 --- /dev/null +++ b/kernel/mm/backing-dev.c @@ -0,0 +1,601 @@ + +#include <linux/wait.h> +#include <linux/backing-dev.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/writeback.h> +#include <linux/device.h> +#include <trace/events/writeback.h> + +static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); + +struct backing_dev_info noop_backing_dev_info = { + .name = "noop", + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static struct class *bdi_class; + +/* + * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side + * locking. 
+ */ +DEFINE_SPINLOCK(bdi_lock); +LIST_HEAD(bdi_list); + +/* bdi_wq serves all asynchronous writeback tasks */ +struct workqueue_struct *bdi_wq; + +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +static struct dentry *bdi_debug_root; + +static void bdi_debug_init(void) +{ + bdi_debug_root = debugfs_create_dir("bdi", NULL); +} + +static int bdi_debug_stats_show(struct seq_file *m, void *v) +{ + struct backing_dev_info *bdi = m->private; + struct bdi_writeback *wb = &bdi->wb; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long bdi_thresh; + unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; + struct inode *inode; + + nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; + spin_lock(&wb->list_lock); + list_for_each_entry(inode, &wb->b_dirty, i_wb_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_wb_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_wb_list) + nr_more_io++; + list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) + if (inode->i_state & I_DIRTY_TIME) + nr_dirty_time++; + spin_unlock(&wb->list_lock); + + global_dirty_limits(&background_thresh, &dirty_thresh); + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + +#define K(x) ((x) << (PAGE_SHIFT - 10)) + seq_printf(m, + "BdiWriteback: %10lu kB\n" + "BdiReclaimable: %10lu kB\n" + "BdiDirtyThresh: %10lu kB\n" + "DirtyThresh: %10lu kB\n" + "BackgroundThresh: %10lu kB\n" + "BdiDirtied: %10lu kB\n" + "BdiWritten: %10lu kB\n" + "BdiWriteBandwidth: %10lu kBps\n" + "b_dirty: %10lu\n" + "b_io: %10lu\n" + "b_more_io: %10lu\n" + "b_dirty_time: %10lu\n" + "bdi_list: %10u\n" + "state: %10lx\n", + (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), + (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), + K(bdi_thresh), + K(dirty_thresh), + K(background_thresh), + (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), + (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), + (unsigned long) K(bdi->write_bandwidth), + nr_dirty, + nr_io, + nr_more_io, + nr_dirty_time, 
!list_empty(&bdi->bdi_list), bdi->state); +#undef K + + return 0; +} + +static int bdi_debug_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, bdi_debug_stats_show, inode->i_private); +} + +static const struct file_operations bdi_debug_stats_fops = { + .open = bdi_debug_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) +{ + bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); + bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir, + bdi, &bdi_debug_stats_fops); +} + +static void bdi_debug_unregister(struct backing_dev_info *bdi) +{ + debugfs_remove(bdi->debug_stats); + debugfs_remove(bdi->debug_dir); +} +#else +static inline void bdi_debug_init(void) +{ +} +static inline void bdi_debug_register(struct backing_dev_info *bdi, + const char *name) +{ +} +static inline void bdi_debug_unregister(struct backing_dev_info *bdi) +{ +} +#endif + +static ssize_t read_ahead_kb_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned long read_ahead_kb; + ssize_t ret; + + ret = kstrtoul(buf, 10, &read_ahead_kb); + if (ret < 0) + return ret; + + bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); + + return count; +} + +#define K(pages) ((pages) << (PAGE_SHIFT - 10)) + +#define BDI_SHOW(name, expr) \ +static ssize_t name##_show(struct device *dev, \ + struct device_attribute *attr, char *page) \ +{ \ + struct backing_dev_info *bdi = dev_get_drvdata(dev); \ + \ + return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \ +} \ +static DEVICE_ATTR_RW(name); + +BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) + +static ssize_t min_ratio_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t 
ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_min_ratio(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(min_ratio, bdi->min_ratio) + +static ssize_t max_ratio_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_max_ratio(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(max_ratio, bdi->max_ratio) + +static ssize_t stable_pages_required_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", + bdi_cap_stable_pages_required(bdi) ? 1 : 0); +} +static DEVICE_ATTR_RO(stable_pages_required); + +static struct attribute *bdi_dev_attrs[] = { + &dev_attr_read_ahead_kb.attr, + &dev_attr_min_ratio.attr, + &dev_attr_max_ratio.attr, + &dev_attr_stable_pages_required.attr, + NULL, +}; +ATTRIBUTE_GROUPS(bdi_dev); + +static __init int bdi_class_init(void) +{ + bdi_class = class_create(THIS_MODULE, "bdi"); + if (IS_ERR(bdi_class)) + return PTR_ERR(bdi_class); + + bdi_class->dev_groups = bdi_dev_groups; + bdi_debug_init(); + return 0; +} +postcore_initcall(bdi_class_init); + +static int __init default_bdi_init(void) +{ + int err; + + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND | WQ_SYSFS, 0); + if (!bdi_wq) + return -ENOMEM; + + err = bdi_init(&noop_backing_dev_info); + + return err; +} +subsys_initcall(default_bdi_init); + +int bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + return wb_has_dirty_io(&bdi->wb); +} + +/* + * This function is used when the first inode for this bdi is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. 
Since the write-out would + * start only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work(). + */ +void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + spin_lock_bh(&bdi->wb_lock); + if (test_bit(BDI_registered, &bdi->state)) + queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); + spin_unlock_bh(&bdi->wb_lock); +} + +/* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ +static void bdi_remove_from_list(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi_lock); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); + + synchronize_rcu_expedited(); +} + +int bdi_register(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, ...) 
+{ + va_list args; + struct device *dev; + + if (bdi->dev) /* The driver needs to use separate queues per device */ + return 0; + + va_start(args, fmt); + dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); + va_end(args); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + bdi->dev = dev; + + bdi_debug_register(bdi, dev_name(dev)); + set_bit(BDI_registered, &bdi->state); + + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + trace_writeback_bdi_register(bdi); + return 0; +} +EXPORT_SYMBOL(bdi_register); + +int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) +{ + return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); +} +EXPORT_SYMBOL(bdi_register_dev); + +/* + * Remove bdi from the global list and shutdown any threads we have running + */ +static void bdi_wb_shutdown(struct backing_dev_info *bdi) +{ + /* Make sure nobody queues further work */ + spin_lock_bh(&bdi->wb_lock); + if (!test_and_clear_bit(BDI_registered, &bdi->state)) { + spin_unlock_bh(&bdi->wb_lock); + return; + } + spin_unlock_bh(&bdi->wb_lock); + + /* + * Make sure nobody finds us on the bdi_list anymore + */ + bdi_remove_from_list(bdi); + + /* + * Drain work list and shutdown the delayed_work. At this point, + * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi + * is dying and its work_list needs to be drained no matter what. 
+ */ + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + flush_delayed_work(&bdi->wb.dwork); +} + +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_dirty_time); + spin_lock_init(&wb->list_lock); + INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); +} + +/* + * Initial write bandwidth: 100 MB/s + */ +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + +int bdi_init(struct backing_dev_info *bdi) +{ + int i, err; + + bdi->dev = NULL; + + bdi->min_ratio = 0; + bdi->max_ratio = 100; + bdi->max_prop_frac = FPROP_FRAC_BASE; + spin_lock_init(&bdi->wb_lock); + INIT_LIST_HEAD(&bdi->bdi_list); + INIT_LIST_HEAD(&bdi->work_list); + + bdi_wb_init(&bdi->wb, bdi); + + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { + err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); + if (err) + goto err; + } + + bdi->dirty_exceeded = 0; + + bdi->bw_time_stamp = jiffies; + bdi->written_stamp = 0; + + bdi->balanced_dirty_ratelimit = INIT_BW; + bdi->dirty_ratelimit = INIT_BW; + bdi->write_bandwidth = INIT_BW; + bdi->avg_write_bandwidth = INIT_BW; + + err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); + + if (err) { +err: + while (i--) + percpu_counter_destroy(&bdi->bdi_stat[i]); + } + + return err; +} +EXPORT_SYMBOL(bdi_init); + +void bdi_destroy(struct backing_dev_info *bdi) +{ + int i; + + bdi_wb_shutdown(bdi); + bdi_set_min_ratio(bdi, 0); + + WARN_ON(!list_empty(&bdi->work_list)); + WARN_ON(delayed_work_pending(&bdi->wb.dwork)); + + if (bdi->dev) { + bdi_debug_unregister(bdi); + device_unregister(bdi->dev); + bdi->dev = NULL; + } + + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) + percpu_counter_destroy(&bdi->bdi_stat[i]); + fprop_local_destroy_percpu(&bdi->completions); +} +EXPORT_SYMBOL(bdi_destroy); + +/* + * For use from filesystems to quickly init and register 
a bdi associated + * with dirty writeback + */ +int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) +{ + int err; + + bdi->name = name; + bdi->capabilities = 0; + err = bdi_init(bdi); + if (err) + return err; + + err = bdi_register(bdi, NULL, "%.28s-%ld", name, + atomic_long_inc_return(&bdi_seq)); + if (err) { + bdi_destroy(bdi); + return err; + } + + return 0; +} +EXPORT_SYMBOL(bdi_setup_and_register); + +static wait_queue_head_t congestion_wqh[2] = { + __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), + __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) + }; +static atomic_t nr_bdi_congested[2]; + +void clear_bdi_congested(struct backing_dev_info *bdi, int sync) +{ + enum bdi_state bit; + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + bit = sync ? BDI_sync_congested : BDI_async_congested; + if (test_and_clear_bit(bit, &bdi->state)) + atomic_dec(&nr_bdi_congested[sync]); + smp_mb__after_atomic(); + if (waitqueue_active(wqh)) + wake_up(wqh); +} +EXPORT_SYMBOL(clear_bdi_congested); + +void set_bdi_congested(struct backing_dev_info *bdi, int sync) +{ + enum bdi_state bit; + + bit = sync ? BDI_sync_congested : BDI_async_congested; + if (!test_and_set_bit(bit, &bdi->state)) + atomic_inc(&nr_bdi_congested[sync]); +} +EXPORT_SYMBOL(set_bdi_congested); + +/** + * congestion_wait - wait for a backing_dev to become uncongested + * @sync: SYNC or ASYNC IO + * @timeout: timeout in jiffies + * + * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit + * write congestion. If no backing_devs are congested then just wait for the + * next write to be completed. 
+ */ +long congestion_wait(int sync, long timeout) +{ + long ret; + unsigned long start = jiffies; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + + trace_writeback_congestion_wait(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + + return ret; +} +EXPORT_SYMBOL(congestion_wait); + +/** + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes + * @zone: A zone to check if it is heavily congested + * @sync: SYNC or ASYNC IO + * @timeout: timeout in jiffies + * + * In the event of a congested backing_dev (any backing_dev) and the given + * @zone has experienced recent congestion, this waits for up to @timeout + * jiffies for either a BDI to exit congestion of the given @sync queue + * or a write to complete. + * + * In the absence of zone congestion, cond_resched() is called to yield + * the processor if necessary but otherwise does not sleep. + * + * The return value is 0 if the sleep is for the full timeout. Otherwise, + * it is the number of jiffies that were still remaining when the function + * returned. return_value == timeout implies the function did not sleep. 
+ */ +long wait_iff_congested(struct zone *zone, int sync, long timeout) +{ + long ret; + unsigned long start = jiffies; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + /* + * If there is no congestion, or heavy congestion is not being + * encountered in the current zone, yield if necessary instead + * of sleeping on the congestion queue + */ + if (atomic_read(&nr_bdi_congested[sync]) == 0 || + !test_bit(ZONE_CONGESTED, &zone->flags)) { + cond_resched(); + + /* In case we scheduled, work out time remaining */ + ret = timeout - (jiffies - start); + if (ret < 0) + ret = 0; + + goto out; + } + + /* Sleep until uncongested or a write happens */ + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + +out: + trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + + return ret; +} +EXPORT_SYMBOL(wait_iff_congested); + +int pdflush_proc_obsolete(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char kbuf[] = "0\n"; + + if (*ppos || *lenp < sizeof(kbuf)) { + *lenp = 0; + return 0; + } + + if (copy_to_user(buffer, kbuf, sizeof(kbuf))) + return -EFAULT; + printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", + table->procname); + + *lenp = 2; + *ppos += *lenp; + return 2; +} diff --git a/kernel/mm/balloon_compaction.c b/kernel/mm/balloon_compaction.c new file mode 100644 index 000000000..fcad8322e --- /dev/null +++ b/kernel/mm/balloon_compaction.c @@ -0,0 +1,221 @@ +/* + * mm/balloon_compaction.c + * + * Common interface for making balloon pages movable by compaction. + * + * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini + */ +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/balloon_compaction.h> + +/* + * balloon_page_enqueue - allocates a new page and inserts it into the balloon + * page list. 
+ * @b_dev_info: balloon device descriptor where we will insert a new page to + * + * Driver must call it to properly allocate a new enlisted balloon page + * before definitively removing it from the guest system. + * This function returns the page address for the recently enqueued page or + * NULL in the case we fail to allocate a new page this turn. + */ +struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) +{ + unsigned long flags; + struct page *page = alloc_page(balloon_mapping_gfp_mask() | + __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!page) + return NULL; + + /* + * Block others from accessing the 'page' when we get around to + * establishing additional references. We should be the only one + * holding a reference to the 'page' at this point. + */ + BUG_ON(!trylock_page(page)); + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + balloon_page_insert(b_dev_info, page); + __count_vm_event(BALLOON_INFLATE); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + unlock_page(page); + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_enqueue); + +/* + * balloon_page_dequeue - removes a page from balloon's page list and returns + * its address to allow the driver to release the page. + * @b_dev_info: balloon device descriptor where we will grab a page from. + * + * Driver must call it to properly de-allocate a previous enlisted balloon page + * before definitively releasing it back to the guest system. + * This function returns the page address for the recently dequeued page or + * NULL in the case we find balloon's page list temporarily empty due to + * compaction isolated pages. 
+ */ +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) +{ + struct page *page, *tmp; + unsigned long flags; + bool dequeued_page; + + dequeued_page = false; + list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { + /* + * Block others from accessing the 'page' while we get around + * establishing additional references and preparing the 'page' + * to be released by the balloon driver. + */ + if (trylock_page(page)) { +#ifdef CONFIG_BALLOON_COMPACTION + if (!PagePrivate(page)) { + /* raced with isolation */ + unlock_page(page); + continue; + } +#endif + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + balloon_page_delete(page); + __count_vm_event(BALLOON_DEFLATE); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + unlock_page(page); + dequeued_page = true; + break; + } + } + + if (!dequeued_page) { + /* + * If we are unable to dequeue a balloon page because the page + * list is empty and there is no isolated pages, then something + * went out of track and some balloon pages are lost. + * BUG() here, otherwise the balloon driver may get stuck into + * an infinite loop while attempting to release all its pages. 
+ */ + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + if (unlikely(list_empty(&b_dev_info->pages) && + !b_dev_info->isolated_pages)) + BUG(); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + page = NULL; + } + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_dequeue); + +#ifdef CONFIG_BALLOON_COMPACTION + +static inline void __isolate_balloon_page(struct page *page) +{ + struct balloon_dev_info *b_dev_info = balloon_page_device(page); + unsigned long flags; + + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + ClearPagePrivate(page); + list_del(&page->lru); + b_dev_info->isolated_pages++; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} + +static inline void __putback_balloon_page(struct page *page) +{ + struct balloon_dev_info *b_dev_info = balloon_page_device(page); + unsigned long flags; + + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + SetPagePrivate(page); + list_add(&page->lru, &b_dev_info->pages); + b_dev_info->isolated_pages--; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} + +/* __isolate_lru_page() counterpart for a ballooned page */ +bool balloon_page_isolate(struct page *page) +{ + /* + * Avoid burning cycles with pages that are yet under __free_pages(), + * or just got freed under us. + * + * In case we 'win' a race for a balloon page being freed under us and + * raise its refcount preventing __free_pages() from doing its job + * the put_page() at the end of this block will take care of + * release this page, thus avoiding a nasty leakage. + */ + if (likely(get_page_unless_zero(page))) { + /* + * As balloon pages are not isolated from LRU lists, concurrent + * compaction threads can race against page migration functions + * as well as race against the balloon driver releasing a page. 
+ * + * In order to avoid having an already isolated balloon page + * being (wrongly) re-isolated while it is under migration, + * or to avoid attempting to isolate pages being released by + * the balloon driver, lets be sure we have the page lock + * before proceeding with the balloon page isolation steps. + */ + if (likely(trylock_page(page))) { + /* + * A ballooned page, by default, has PagePrivate set. + * Prevent concurrent compaction threads from isolating + * an already isolated balloon page by clearing it. + */ + if (balloon_page_movable(page)) { + __isolate_balloon_page(page); + unlock_page(page); + return true; + } + unlock_page(page); + } + put_page(page); + } + return false; +} + +/* putback_lru_page() counterpart for a ballooned page */ +void balloon_page_putback(struct page *page) +{ + /* + * 'lock_page()' stabilizes the page and prevents races against + * concurrent isolation threads attempting to re-isolate it. + */ + lock_page(page); + + if (__is_movable_balloon_page(page)) { + __putback_balloon_page(page); + /* drop the extra ref count taken for page isolation */ + put_page(page); + } else { + WARN_ON(1); + dump_page(page, "not movable balloon page"); + } + unlock_page(page); +} + +/* move_to_new_page() counterpart for a ballooned page */ +int balloon_page_migrate(struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + struct balloon_dev_info *balloon = balloon_page_device(page); + int rc = -EAGAIN; + + /* + * Block others from accessing the 'newpage' when we get around to + * establishing additional references. We should be the only one + * holding a reference to the 'newpage' at this point. 
+ */ + BUG_ON(!trylock_page(newpage)); + + if (WARN_ON(!__is_movable_balloon_page(page))) { + dump_page(page, "not movable balloon page"); + unlock_page(newpage); + return rc; + } + + if (balloon && balloon->migratepage) + rc = balloon->migratepage(balloon, newpage, page, mode); + + unlock_page(newpage); + return rc; +} +#endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/kernel/mm/bootmem.c b/kernel/mm/bootmem.c new file mode 100644 index 000000000..477be6965 --- /dev/null +++ b/kernel/mm/bootmem.c @@ -0,0 +1,861 @@ +/* + * bootmem - A boot-time physical memory allocator and configurator + * + * Copyright (C) 1999 Ingo Molnar + * 1999 Kanoj Sarcar, SGI + * 2008 Johannes Weiner + * + * Access to this subsystem has to be serialized externally (which is true + * for the boot process anyway). + */ +#include <linux/init.h> +#include <linux/pfn.h> +#include <linux/slab.h> +#include <linux/bootmem.h> +#include <linux/export.h> +#include <linux/kmemleak.h> +#include <linux/range.h> +#include <linux/memblock.h> +#include <linux/bug.h> +#include <linux/io.h> + +#include <asm/processor.h> + +#include "internal.h" + +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data = { + .bdata = &bootmem_node_data[0] +}; +EXPORT_SYMBOL(contig_page_data); +#endif + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; + +bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; + +static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); + +static int bootmem_debug; + +static int __init bootmem_debug_setup(char *buf) +{ + bootmem_debug = 1; + return 0; +} +early_param("bootmem_debug", bootmem_debug_setup); + +#define bdebug(fmt, args...) 
({ \ + if (unlikely(bootmem_debug)) \ + printk(KERN_INFO \ + "bootmem::%s " fmt, \ + __func__, ## args); \ +}) + +static unsigned long __init bootmap_bytes(unsigned long pages) +{ + unsigned long bytes = DIV_ROUND_UP(pages, 8); + + return ALIGN(bytes, sizeof(long)); +} + +/** + * bootmem_bootmap_pages - calculate bitmap size in pages + * @pages: number of pages the bitmap has to represent + */ +unsigned long __init bootmem_bootmap_pages(unsigned long pages) +{ + unsigned long bytes = bootmap_bytes(pages); + + return PAGE_ALIGN(bytes) >> PAGE_SHIFT; +} + +/* + * link bdata in order + */ +static void __init link_bootmem(bootmem_data_t *bdata) +{ + bootmem_data_t *ent; + + list_for_each_entry(ent, &bdata_list, list) { + if (bdata->node_min_pfn < ent->node_min_pfn) { + list_add_tail(&bdata->list, &ent->list); + return; + } + } + + list_add_tail(&bdata->list, &bdata_list); +} + +/* + * Called once to set up the allocator itself. + */ +static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, + unsigned long mapstart, unsigned long start, unsigned long end) +{ + unsigned long mapsize; + + mminit_validate_memmodel_limits(&start, &end); + bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); + bdata->node_min_pfn = start; + bdata->node_low_pfn = end; + link_bootmem(bdata); + + /* + * Initially all pages are reserved - setup_arch() has to + * register free RAM areas explicitly. + */ + mapsize = bootmap_bytes(end - start); + memset(bdata->node_bootmem_map, 0xff, mapsize); + + bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", + bdata - bootmem_node_data, start, mapstart, end, mapsize); + + return mapsize; +} + +/** + * init_bootmem_node - register a node as boot memory + * @pgdat: node to register + * @freepfn: pfn where the bitmap for this node is to be placed + * @startpfn: first pfn on the node + * @endpfn: first pfn after the node + * + * Returns the number of bytes needed to hold the bitmap for this node. 
+ */ +unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, + unsigned long startpfn, unsigned long endpfn) +{ + return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); +} + +/** + * init_bootmem - register boot memory + * @start: pfn where the bitmap is to be placed + * @pages: number of available physical pages + * + * Returns the number of bytes needed to hold the bitmap. + */ +unsigned long __init init_bootmem(unsigned long start, unsigned long pages) +{ + max_low_pfn = pages; + min_low_pfn = start; + return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); +} + +/* + * free_bootmem_late - free bootmem pages directly to page allocator + * @physaddr: starting physical address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. + */ +void __init free_bootmem_late(unsigned long physaddr, unsigned long size) +{ + unsigned long cursor, end; + + kmemleak_free_part(__va(physaddr), size); + + cursor = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + +static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) +{ + struct page *page; + unsigned long *map, start, end, pages, count = 0; + + if (!bdata->node_bootmem_map) + return 0; + + map = bdata->node_bootmem_map; + start = bdata->node_min_pfn; + end = bdata->node_low_pfn; + + bdebug("nid=%td start=%lx end=%lx\n", + bdata - bootmem_node_data, start, end); + + while (start < end) { + unsigned long idx, vec; + unsigned shift; + + idx = start - bdata->node_min_pfn; + shift = idx & (BITS_PER_LONG - 1); + /* + * vec holds at most BITS_PER_LONG map bits, + * bit 0 corresponds to start. 
+ */ + vec = ~map[idx / BITS_PER_LONG]; + + if (shift) { + vec >>= shift; + if (end - start >= BITS_PER_LONG) + vec |= ~map[idx / BITS_PER_LONG + 1] << + (BITS_PER_LONG - shift); + } + /* + * If we have a properly aligned and fully unreserved + * BITS_PER_LONG block of pages in front of us, free + * it in one go. + */ + if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { + int order = ilog2(BITS_PER_LONG); + + __free_pages_bootmem(pfn_to_page(start), order); + count += BITS_PER_LONG; + start += BITS_PER_LONG; + } else { + unsigned long cur = start; + + start = ALIGN(start + 1, BITS_PER_LONG); + while (vec && cur != start) { + if (vec & 1) { + page = pfn_to_page(cur); + __free_pages_bootmem(page, 0); + count++; + } + vec >>= 1; + ++cur; + } + } + } + + page = virt_to_page(bdata->node_bootmem_map); + pages = bdata->node_low_pfn - bdata->node_min_pfn; + pages = bootmem_bootmap_pages(pages); + count += pages; + while (pages--) + __free_pages_bootmem(page++, 0); + + bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); + + return count; +} + +static int reset_managed_pages_done __initdata; + +void reset_node_managed_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + z->managed_pages = 0; +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + if (reset_managed_pages_done) + return; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + + reset_managed_pages_done = 1; +} + +/** + * free_all_bootmem - release free pages to the buddy allocator + * + * Returns the number of pages actually released. 
+ */ +unsigned long __init free_all_bootmem(void) +{ + unsigned long total_pages = 0; + bootmem_data_t *bdata; + + reset_all_zones_managed_pages(); + + list_for_each_entry(bdata, &bdata_list, list) + total_pages += free_all_bootmem_core(bdata); + + totalram_pages += total_pages; + + return total_pages; +} + +static void __init __free(bootmem_data_t *bdata, + unsigned long sidx, unsigned long eidx) +{ + unsigned long idx; + + bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, + sidx + bdata->node_min_pfn, + eidx + bdata->node_min_pfn); + + if (bdata->hint_idx > sidx) + bdata->hint_idx = sidx; + + for (idx = sidx; idx < eidx; idx++) + if (!test_and_clear_bit(idx, bdata->node_bootmem_map)) + BUG(); +} + +static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, + unsigned long eidx, int flags) +{ + unsigned long idx; + int exclusive = flags & BOOTMEM_EXCLUSIVE; + + bdebug("nid=%td start=%lx end=%lx flags=%x\n", + bdata - bootmem_node_data, + sidx + bdata->node_min_pfn, + eidx + bdata->node_min_pfn, + flags); + + for (idx = sidx; idx < eidx; idx++) + if (test_and_set_bit(idx, bdata->node_bootmem_map)) { + if (exclusive) { + __free(bdata, sidx, idx); + return -EBUSY; + } + bdebug("silent double reserve of PFN %lx\n", + idx + bdata->node_min_pfn); + } + return 0; +} + +static int __init mark_bootmem_node(bootmem_data_t *bdata, + unsigned long start, unsigned long end, + int reserve, int flags) +{ + unsigned long sidx, eidx; + + bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n", + bdata - bootmem_node_data, start, end, reserve, flags); + + BUG_ON(start < bdata->node_min_pfn); + BUG_ON(end > bdata->node_low_pfn); + + sidx = start - bdata->node_min_pfn; + eidx = end - bdata->node_min_pfn; + + if (reserve) + return __reserve(bdata, sidx, eidx, flags); + else + __free(bdata, sidx, eidx); + return 0; +} + +static int __init mark_bootmem(unsigned long start, unsigned long end, + int reserve, int flags) +{ + unsigned long pos; + bootmem_data_t 
*bdata; + + pos = start; + list_for_each_entry(bdata, &bdata_list, list) { + int err; + unsigned long max; + + if (pos < bdata->node_min_pfn || + pos >= bdata->node_low_pfn) { + BUG_ON(pos != start); + continue; + } + + max = min(bdata->node_low_pfn, end); + + err = mark_bootmem_node(bdata, pos, max, reserve, flags); + if (reserve && err) { + mark_bootmem(start, pos, 0, 0); + return err; + } + + if (max == end) + return 0; + pos = bdata->node_low_pfn; + } + BUG(); +} + +/** + * free_bootmem_node - mark a page range as usable + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must reside completely on the specified node. + */ +void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) +{ + unsigned long start, end; + + kmemleak_free_part(__va(physaddr), size); + + start = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); + + mark_bootmem_node(pgdat->bdata, start, end, 0, 0); +} + +/** + * free_bootmem - mark a page range as usable + * @addr: starting physical address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must be contiguous but may span node boundaries. + */ +void __init free_bootmem(unsigned long physaddr, unsigned long size) +{ + unsigned long start, end; + + kmemleak_free_part(__va(physaddr), size); + + start = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); + + mark_bootmem(start, end, 0, 0); +} + +/** + * reserve_bootmem_node - mark a page range as reserved + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * @flags: reservation flags (see linux/bootmem.h) + * + * Partial pages will be reserved. + * + * The range must reside completely on the specified node. 
+ */ +int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size, int flags) +{ + unsigned long start, end; + + start = PFN_DOWN(physaddr); + end = PFN_UP(physaddr + size); + + return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); +} + +/** + * reserve_bootmem - mark a page range as reserved + * @addr: starting address of the range + * @size: size of the range in bytes + * @flags: reservation flags (see linux/bootmem.h) + * + * Partial pages will be reserved. + * + * The range must be contiguous but may span node boundaries. + */ +int __init reserve_bootmem(unsigned long addr, unsigned long size, + int flags) +{ + unsigned long start, end; + + start = PFN_DOWN(addr); + end = PFN_UP(addr + size); + + return mark_bootmem(start, end, 1, flags); +} + +static unsigned long __init align_idx(struct bootmem_data *bdata, + unsigned long idx, unsigned long step) +{ + unsigned long base = bdata->node_min_pfn; + + /* + * Align the index with respect to the node start so that the + * combination of both satisfies the requested alignment. 
+ */ + + return ALIGN(base + idx, step) - base; +} + +static unsigned long __init align_off(struct bootmem_data *bdata, + unsigned long off, unsigned long align) +{ + unsigned long base = PFN_PHYS(bdata->node_min_pfn); + + /* Same as align_idx for byte offsets */ + + return ALIGN(base + off, align) - base; +} + +static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + unsigned long fallback = 0; + unsigned long min, max, start, sidx, midx, step; + + bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", + bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, + align, goal, limit); + + BUG_ON(!size); + BUG_ON(align & (align - 1)); + BUG_ON(limit && goal + size > limit); + + if (!bdata->node_bootmem_map) + return NULL; + + min = bdata->node_min_pfn; + max = bdata->node_low_pfn; + + goal >>= PAGE_SHIFT; + limit >>= PAGE_SHIFT; + + if (limit && max > limit) + max = limit; + if (max <= min) + return NULL; + + step = max(align >> PAGE_SHIFT, 1UL); + + if (goal && min < goal && goal < max) + start = ALIGN(goal, step); + else + start = ALIGN(min, step); + + sidx = start - bdata->node_min_pfn; + midx = max - bdata->node_min_pfn; + + if (bdata->hint_idx > sidx) { + /* + * Handle the valid case of sidx being zero and still + * catch the fallback below. 
+ */ + fallback = sidx + 1; + sidx = align_idx(bdata, bdata->hint_idx, step); + } + + while (1) { + int merge; + void *region; + unsigned long eidx, i, start_off, end_off; +find_block: + sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); + sidx = align_idx(bdata, sidx, step); + eidx = sidx + PFN_UP(size); + + if (sidx >= midx || eidx > midx) + break; + + for (i = sidx; i < eidx; i++) + if (test_bit(i, bdata->node_bootmem_map)) { + sidx = align_idx(bdata, i, step); + if (sidx == i) + sidx += step; + goto find_block; + } + + if (bdata->last_end_off & (PAGE_SIZE - 1) && + PFN_DOWN(bdata->last_end_off) + 1 == sidx) + start_off = align_off(bdata, bdata->last_end_off, align); + else + start_off = PFN_PHYS(sidx); + + merge = PFN_DOWN(start_off) < sidx; + end_off = start_off + size; + + bdata->last_end_off = end_off; + bdata->hint_idx = PFN_UP(end_off); + + /* + * Reserve the area now: + */ + if (__reserve(bdata, PFN_DOWN(start_off) + merge, + PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) + BUG(); + + region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + + start_off); + memset(region, 0, size); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. 
+ */ + kmemleak_alloc(region, size, 0, 0); + return region; + } + + if (fallback) { + sidx = align_idx(bdata, fallback - 1, step); + fallback = 0; + goto find_block; + } + + return NULL; +} + +static void * __init alloc_bootmem_core(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + bootmem_data_t *bdata; + void *region; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + + list_for_each_entry(bdata, &bdata_list, list) { + if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) + continue; + if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) + break; + + region = alloc_bootmem_bdata(bdata, size, align, goal, limit); + if (region) + return region; + } + + return NULL; +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + +restart: + ptr = alloc_bootmem_core(size, align, goal, limit); + if (ptr) + return ptr; + if (goal) { + goal = 0; + goto restart; + } + + return NULL; +} + +/** + * __alloc_bootmem_nopanic - allocate boot memory without panicking + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * Returns NULL on failure. + */ +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = 0; + + return ___alloc_bootmem_nopanic(size, align, goal, limit); +} + +static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); + + if (mem) + return mem; + /* + * Whoops, we cannot satisfy the allocation request. 
+ */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +/** + * __alloc_bootmem - allocate boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = 0; + + return ___alloc_bootmem(size, align, goal, limit); +} + +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); +again: + + /* do not panic in alloc_bootmem_bdata() */ + if (limit && goal + size > limit) + limit = 0; + + ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); + if (ptr) + return ptr; + + ptr = alloc_bootmem_core(size, align, goal, limit); + if (ptr) + return ptr; + + if (goal) { + goal = 0; + goto again; + } + + return NULL; +} + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); +} + +void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal, + unsigned long limit) +{ + void *ptr; + + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); + if (ptr) + return ptr; + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +/** + * 
__alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node(pgdat, size, align, goal, 0); +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ +#ifdef MAX_DMA32_PFN + unsigned long end_pfn; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + /* update goal according ...MAX_DMA32_PFN */ + end_pfn = pgdat_end_pfn(pgdat); + + if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && + (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { + void *ptr; + unsigned long new_goal; + + new_goal = MAX_DMA32_PFN << PAGE_SHIFT; + ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, + new_goal, 0); + if (ptr) + return ptr; + } +#endif + + return __alloc_bootmem_node(pgdat, size, align, goal); + +} + +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif + +/** + * __alloc_bootmem_low - allocate low boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. 
+ * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +} + +void * __init __alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem_nopanic(size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); +} + +/** + * __alloc_bootmem_low_node - allocate low boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node(pgdat, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +} diff --git a/kernel/mm/cleancache.c b/kernel/mm/cleancache.c new file mode 100644 index 000000000..8fc508111 --- /dev/null +++ b/kernel/mm/cleancache.c @@ -0,0 +1,319 @@ +/* + * Cleancache frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of cleancache. See + * Documentation/vm/cleancache.txt for more information. + * + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. 
+ */ + +#include +#include +#include +#include +#include +#include + +/* + * cleancache_ops is set by cleancache_register_ops to contain the pointers + * to the cleancache "backend" implementation functions. + */ +static struct cleancache_ops *cleancache_ops __read_mostly; + +/* + * Counters available via /sys/kernel/debug/cleancache (if debugfs is + * properly configured. These are for information only so are not protected + * against increment races. + */ +static u64 cleancache_succ_gets; +static u64 cleancache_failed_gets; +static u64 cleancache_puts; +static u64 cleancache_invalidates; + +static void cleancache_register_ops_sb(struct super_block *sb, void *unused) +{ + switch (sb->cleancache_poolid) { + case CLEANCACHE_NO_BACKEND: + __cleancache_init_fs(sb); + break; + case CLEANCACHE_NO_BACKEND_SHARED: + __cleancache_init_shared_fs(sb); + break; + } +} + +/* + * Register operations for cleancache. Returns 0 on success. + */ +int cleancache_register_ops(struct cleancache_ops *ops) +{ + if (cmpxchg(&cleancache_ops, NULL, ops)) + return -EBUSY; + + /* + * A cleancache backend can be built as a module and hence loaded after + * a cleancache enabled filesystem has called cleancache_init_fs. To + * handle such a scenario, here we call ->init_fs or ->init_shared_fs + * for each active super block. To differentiate between local and + * shared filesystems, we temporarily initialize sb->cleancache_poolid + * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED + * respectively in case there is no backend registered at the time + * cleancache_init_fs or cleancache_init_shared_fs is called. + * + * Since filesystems can be mounted concurrently with cleancache + * backend registration, we have to be careful to guarantee that all + * cleancache enabled filesystems that has been mounted by the time + * cleancache_register_ops is called has got and all mounted later will + * get cleancache_poolid. 
This is assured by the following statements + * tied together: + * + * a) iterate_supers skips only those super blocks that has started + * ->kill_sb + * + * b) if iterate_supers encounters a super block that has not finished + * ->mount yet, it waits until it is finished + * + * c) cleancache_init_fs is called from ->mount and + * cleancache_invalidate_fs is called from ->kill_sb + * + * d) we call iterate_supers after cleancache_ops has been set + * + * From a) it follows that if iterate_supers skips a super block, then + * either the super block is already dead, in which case we do not need + * to bother initializing cleancache for it, or it was mounted after we + * initiated iterate_supers. In the latter case, it must have seen + * cleancache_ops set according to d) and initialized cleancache from + * ->mount by itself according to c). This proves that we call + * ->init_fs at least once for each active super block. + * + * From b) and c) it follows that if iterate_supers encounters a super + * block that has already started ->init_fs, it will wait until ->mount + * and hence ->init_fs has finished, then check cleancache_poolid, see + * that it has already been set and therefore do nothing. This proves + * that we call ->init_fs no more than once for each super block. + * + * Combined together, the last two paragraphs prove the function + * correctness. + * + * Note that various cleancache callbacks may proceed before this + * function is called or even concurrently with it, but since + * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop + * until the corresponding ->init_fs has been actually called and + * cleancache_ops has been set. 
+ */ + iterate_supers(cleancache_register_ops_sb, NULL); + return 0; +} +EXPORT_SYMBOL(cleancache_register_ops); + +/* Called by a cleancache-enabled filesystem at time of mount */ +void __cleancache_init_fs(struct super_block *sb) +{ + int pool_id = CLEANCACHE_NO_BACKEND; + + if (cleancache_ops) { + pool_id = cleancache_ops->init_fs(PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; + } + sb->cleancache_poolid = pool_id; +} +EXPORT_SYMBOL(__cleancache_init_fs); + +/* Called by a cleancache-enabled clustered filesystem at time of mount */ +void __cleancache_init_shared_fs(struct super_block *sb) +{ + int pool_id = CLEANCACHE_NO_BACKEND_SHARED; + + if (cleancache_ops) { + pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; + } + sb->cleancache_poolid = pool_id; +} +EXPORT_SYMBOL(__cleancache_init_shared_fs); + +/* + * If the filesystem uses exportable filehandles, use the filehandle as + * the key, else use the inode number. + */ +static int cleancache_get_key(struct inode *inode, + struct cleancache_filekey *key) +{ + int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *); + int len = 0, maxlen = CLEANCACHE_KEY_MAX; + struct super_block *sb = inode->i_sb; + + key->u.ino = inode->i_ino; + if (sb->s_export_op != NULL) { + fhfn = sb->s_export_op->encode_fh; + if (fhfn) { + len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); + if (len <= FILEID_ROOT || len == FILEID_INVALID) + return -1; + if (maxlen > CLEANCACHE_KEY_MAX) + return -1; + } + } + return 0; +} + +/* + * "Get" data from cleancache associated with the poolid/inode/index + * that were specified when the data was put to cleanache and, if + * successful, use it to fill the specified page with data and return 0. + * The pageframe is unchanged and returns -1 if the get fails. + * Page must be locked by caller. 
+ * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. + */ +int __cleancache_get_page(struct page *page) +{ + int ret = -1; + int pool_id; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (!cleancache_ops) { + cleancache_failed_gets++; + goto out; + } + + VM_BUG_ON_PAGE(!PageLocked(page), page); + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id < 0) + goto out; + + if (cleancache_get_key(page->mapping->host, &key) < 0) + goto out; + + ret = cleancache_ops->get_page(pool_id, key, page->index, page); + if (ret == 0) + cleancache_succ_gets++; + else + cleancache_failed_gets++; +out: + return ret; +} +EXPORT_SYMBOL(__cleancache_get_page); + +/* + * "Put" data from a page to cleancache and associate it with the + * (previously-obtained per-filesystem) poolid and the page's, + * inode and page index. Page must be locked. Note that a put_page + * always "succeeds", though a subsequent get_page may succeed or fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. + */ +void __cleancache_put_page(struct page *page) +{ + int pool_id; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (!cleancache_ops) { + cleancache_puts++; + return; + } + + VM_BUG_ON_PAGE(!PageLocked(page), page); + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id >= 0 && + cleancache_get_key(page->mapping->host, &key) >= 0) { + cleancache_ops->put_page(pool_id, key, page->index, page); + cleancache_puts++; + } +} +EXPORT_SYMBOL(__cleancache_put_page); + +/* + * Invalidate any data from cleancache associated with the poolid and the + * page's inode and page index so that a subsequent "get" will fail. 
+ * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. + */ +void __cleancache_invalidate_page(struct address_space *mapping, + struct page *page) +{ + /* careful... page->mapping is NULL sometimes when this is called */ + int pool_id = mapping->host->i_sb->cleancache_poolid; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (!cleancache_ops) + return; + + if (pool_id >= 0) { + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (cleancache_get_key(mapping->host, &key) >= 0) { + cleancache_ops->invalidate_page(pool_id, + key, page->index); + cleancache_invalidates++; + } + } +} +EXPORT_SYMBOL(__cleancache_invalidate_page); + +/* + * Invalidate all data from cleancache associated with the poolid and the + * mappings's inode so that all subsequent gets to this poolid/inode + * will fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. + */ +void __cleancache_invalidate_inode(struct address_space *mapping) +{ + int pool_id = mapping->host->i_sb->cleancache_poolid; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (!cleancache_ops) + return; + + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) + cleancache_ops->invalidate_inode(pool_id, key); +} +EXPORT_SYMBOL(__cleancache_invalidate_inode); + +/* + * Called by any cleancache-enabled filesystem at time of unmount; + * note that pool_id is surrendered and may be returned by a subsequent + * cleancache_init_fs or cleancache_init_shared_fs. 
+ */ +void __cleancache_invalidate_fs(struct super_block *sb) +{ + int pool_id; + + pool_id = sb->cleancache_poolid; + sb->cleancache_poolid = CLEANCACHE_NO_POOL; + + if (cleancache_ops && pool_id >= 0) + cleancache_ops->invalidate_fs(pool_id); +} +EXPORT_SYMBOL(__cleancache_invalidate_fs); + +static int __init init_cleancache(void) +{ +#ifdef CONFIG_DEBUG_FS + struct dentry *root = debugfs_create_dir("cleancache", NULL); + if (root == NULL) + return -ENXIO; + debugfs_create_u64("succ_gets", S_IRUGO, root, &cleancache_succ_gets); + debugfs_create_u64("failed_gets", S_IRUGO, + root, &cleancache_failed_gets); + debugfs_create_u64("puts", S_IRUGO, root, &cleancache_puts); + debugfs_create_u64("invalidates", S_IRUGO, + root, &cleancache_invalidates); +#endif + return 0; +} +module_init(init_cleancache) diff --git a/kernel/mm/cma.c b/kernel/mm/cma.c new file mode 100644 index 000000000..3a7a67b93 --- /dev/null +++ b/kernel/mm/cma.c @@ -0,0 +1,456 @@ +/* + * Contiguous Memory Allocator + * + * Copyright (c) 2010-2011 by Samsung Electronics. + * Copyright IBM Corporation, 2013 + * Copyright LG Electronics Inc., 2014 + * Written by: + * Marek Szyprowski + * Michal Nazarewicz + * Aneesh Kumar K.V + * Joonsoo Kim + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. 
+ */ + +#define pr_fmt(fmt) "cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif +#define CREATE_TRACE_POINTS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cma.h" + +struct cma cma_areas[MAX_CMA_AREAS]; +unsigned cma_area_count; +static DEFINE_MUTEX(cma_mutex); + +phys_addr_t cma_get_base(const struct cma *cma) +{ + return PFN_PHYS(cma->base_pfn); +} + +unsigned long cma_get_size(const struct cma *cma) +{ + return cma->count << PAGE_SHIFT; +} + +static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, + int align_order) +{ + if (align_order <= cma->order_per_bit) + return 0; + return (1UL << (align_order - cma->order_per_bit)) - 1; +} + +/* + * Find a PFN aligned to the specified order and return an offset represented in + * order_per_bits. + */ +static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, + int align_order) +{ + if (align_order <= cma->order_per_bit) + return 0; + + return (ALIGN(cma->base_pfn, (1UL << align_order)) + - cma->base_pfn) >> cma->order_per_bit; +} + +static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, + unsigned long pages) +{ + return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; +} + +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, + unsigned int count) +{ + unsigned long bitmap_no, bitmap_count; + + bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + mutex_lock(&cma->lock); + bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); + mutex_unlock(&cma->lock); +} + +static int __init cma_activate_area(struct cma *cma) +{ + int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long); + unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; + unsigned i = cma->count >> pageblock_order; + struct zone *zone; + + cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + + if (!cma->bitmap) + 
return -ENOMEM; + + WARN_ON_ONCE(!pfn_valid(pfn)); + zone = page_zone(pfn_to_page(pfn)); + + do { + unsigned j; + + base_pfn = pfn; + for (j = pageblock_nr_pages; j; --j, pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + /* + * alloc_contig_range requires the pfn range + * specified to be in the same zone. Make this + * simple by forcing the entire CMA resv range + * to be in the same zone. + */ + if (page_zone(pfn_to_page(pfn)) != zone) + goto err; + } + init_cma_reserved_pageblock(pfn_to_page(base_pfn)); + } while (--i); + + mutex_init(&cma->lock); + +#ifdef CONFIG_CMA_DEBUGFS + INIT_HLIST_HEAD(&cma->mem_head); + spin_lock_init(&cma->mem_head_lock); +#endif + + return 0; + +err: + kfree(cma->bitmap); + cma->count = 0; + return -EINVAL; +} + +static int __init cma_init_reserved_areas(void) +{ + int i; + + for (i = 0; i < cma_area_count; i++) { + int ret = cma_activate_area(&cma_areas[i]); + + if (ret) + return ret; + } + + return 0; +} +core_initcall(cma_init_reserved_areas); + +/** + * cma_init_reserved_mem() - create custom contiguous area from reserved memory + * @base: Base address of the reserved area + * @size: Size of the reserved area (in bytes), + * @order_per_bit: Order of pages represented by one bit on bitmap. + * @res_cma: Pointer to store the created cma region. + * + * This function creates custom contiguous area from already reserved memory. 
+ */ +int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, + unsigned int order_per_bit, + struct cma **res_cma) +{ + struct cma *cma; + phys_addr_t alignment; + + /* Sanity checks */ + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size || !memblock_is_region_reserved(base, size)) + return -EINVAL; + + /* ensure minimal alignment requied by mm core */ + alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); + + /* alignment should be aligned with order_per_bit */ + if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size) + return -EINVAL; + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. + */ + cma = &cma_areas[cma_area_count]; + cma->base_pfn = PFN_DOWN(base); + cma->count = size >> PAGE_SHIFT; + cma->order_per_bit = order_per_bit; + *res_cma = cma; + cma_area_count++; + totalcma_pages += (size / PAGE_SIZE); + + return 0; +} + +/** + * cma_declare_contiguous() - reserve custom contiguous area + * @base: Base address of the reserved area optional, use 0 for any + * @size: Size of the reserved area (in bytes), + * @limit: End address of the reserved memory (optional, 0 for any). + * @alignment: Alignment for the CMA area, should be power of 2 or zero + * @order_per_bit: Order of pages represented by one bit on bitmap. + * @fixed: hint about where to place the reserved area + * @res_cma: Pointer to store the created cma region. + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. This function allows to create custom reserved areas. 
+ * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. + */ +int __init cma_declare_contiguous(phys_addr_t base, + phys_addr_t size, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + bool fixed, struct cma **res_cma) +{ + phys_addr_t memblock_end = memblock_end_of_DRAM(); + phys_addr_t highmem_start; + int ret = 0; + +#ifdef CONFIG_X86 + /* + * high_memory isn't direct mapped memory so retrieving its physical + * address isn't appropriate. But it would be useful to check the + * physical address of the highmem boundary so it's justfiable to get + * the physical address from it. On x86 there is a validation check for + * this case, so the following workaround is needed to avoid it. + */ + highmem_start = __pa_nodebug(high_memory); +#else + highmem_start = __pa(high_memory); +#endif + pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", + __func__, &size, &base, &limit, &alignment); + + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size) + return -EINVAL; + + if (alignment && !is_power_of_2(alignment)) + return -EINVAL; + + /* + * Sanitise input arguments. + * Pages both ends in CMA area could be merged into adjacent unmovable + * migratetype page by page allocator's buddy algorithm. In the case, + * you couldn't get a contiguous memory, which is not what we want. + */ + alignment = max(alignment, + (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); + base = ALIGN(base, alignment); + size = ALIGN(size, alignment); + limit &= ~(alignment - 1); + + if (!base) + fixed = false; + + /* size should be aligned with order_per_bit */ + if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + /* + * If allocating at a fixed base the request region must not cross the + * low/high memory boundary. 
+ */ + if (fixed && base < highmem_start && base + size > highmem_start) { + ret = -EINVAL; + pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", + &base, &highmem_start); + goto err; + } + + /* + * If the limit is unspecified or above the memblock end, its effective + * value will be the memblock end. Set it explicitly to simplify further + * checks. + */ + if (limit == 0 || limit > memblock_end) + limit = memblock_end; + + /* Reserve memory */ + if (fixed) { + if (memblock_is_region_reserved(base, size) || + memblock_reserve(base, size) < 0) { + ret = -EBUSY; + goto err; + } + } else { + phys_addr_t addr = 0; + + /* + * All pages in the reserved area must come from the same zone. + * If the requested region crosses the low/high memory boundary, + * try allocating from high memory first and fall back to low + * memory in case of failure. + */ + if (base < highmem_start && limit > highmem_start) { + addr = memblock_alloc_range(size, alignment, + highmem_start, limit); + limit = highmem_start; + } + + if (!addr) { + addr = memblock_alloc_range(size, alignment, base, + limit); + if (!addr) { + ret = -ENOMEM; + goto err; + } + } + + /* + * kmemleak scans/reads tracked objects for pointers to other + * objects but this address isn't mapped and accessible + */ + kmemleak_ignore(phys_to_virt(addr)); + base = addr; + } + + ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); + if (ret) + goto err; + + pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, + &base); + return 0; + +err: + pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + return ret; +} + +/** + * cma_alloc() - allocate pages from contiguous area + * @cma: Contiguous memory region for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * + * This function allocates part of contiguous memory on specific + * contiguous memory area. 
+ */
+struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
+{
+	unsigned long mask, offset, pfn, start = 0;
+	unsigned long bitmap_maxno, bitmap_no, bitmap_count;
+	struct page *page = NULL;
+	int ret;
+
+	/* No area, or an area without pages: nothing to allocate from. */
+	if (!cma || !cma->count)
+		return NULL;
+
+	pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
+		 count, align);
+
+	if (!count)
+		return NULL;
+
+	/* Translate the page-based request into bitmap units. */
+	mask = cma_bitmap_aligned_mask(cma, align);
+	offset = cma_bitmap_aligned_offset(cma, align);
+	bitmap_maxno = cma_bitmap_maxno(cma);
+	bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+
+	for (;;) {
+		mutex_lock(&cma->lock);
+		bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
+				bitmap_maxno, start, bitmap_count, mask,
+				offset);
+		if (bitmap_no >= bitmap_maxno) {
+			/* No free run of the requested size is left. */
+			mutex_unlock(&cma->lock);
+			break;
+		}
+		bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
+		/*
+		 * It's safe to drop the lock here. We've marked this region for
+		 * our exclusive use. If the migration fails we will take the
+		 * lock again and unmark it.
+		 */
+		mutex_unlock(&cma->lock);
+
+		pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
+		/* cma_mutex serialises the alloc_contig_range() calls made here */
+		mutex_lock(&cma_mutex);
+		ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
+		mutex_unlock(&cma_mutex);
+		if (ret == 0) {
+			page = pfn_to_page(pfn);
+			break;
+		}
+
+		/* Undo the reservation made above before deciding what to do. */
+		cma_clear_bitmap(cma, pfn, count);
+		/* Only -EBUSY is retried; any other error ends the search. */
+		if (ret != -EBUSY)
+			break;
+
+		pr_debug("%s(): memory range at %p is busy, retrying\n",
+			 __func__, pfn_to_page(pfn));
+		/* try again with a bit different memory target */
+		start = bitmap_no + mask + 1;
+	}
+
+	/* pfn is only meaningful when a page was obtained; trace -1UL if not */
+	trace_cma_alloc(page ? pfn : -1UL, page, count, align);
+
+	pr_debug("%s(): returned %p\n", __func__, page);
+	return page;
+}
+
+/**
+ * cma_release() - release allocated pages
+ * @cma: Contiguous memory region for which the allocation is performed.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * This function releases memory allocated by cma_alloc().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
+{
+	unsigned long pfn;
+
+	if (!cma || !pages)
+		return false;
+
+	pr_debug("%s(page %p)\n", __func__, (void *)pages);
+
+	pfn = page_to_pfn(pages);
+
+	/* The first page must lie inside [base_pfn, base_pfn + count). */
+	if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+		return false;
+
+	/* A range that starts inside the area must also end inside it. */
+	VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+
+	free_contig_range(pfn, count);
+	cma_clear_bitmap(cma, pfn, count);
+	trace_cma_release(pfn, pages, count);
+
+	return true;
+}
diff --git a/kernel/mm/cma.h b/kernel/mm/cma.h
new file mode 100644
index 000000000..1132d7335
--- /dev/null
+++ b/kernel/mm/cma.h
@@ -0,0 +1,24 @@
+#ifndef __MM_CMA_H__
+#define __MM_CMA_H__
+
+struct cma {
+	unsigned long base_pfn;		/* first PFN of the area */
+	unsigned long count;		/* number of pages in the area */
+	unsigned long *bitmap;		/* one bit per (1 << order_per_bit) pages */
+	unsigned int order_per_bit; /* Order of pages represented by one bit */
+	struct mutex lock;		/* taken around bitmap search/update */
+#ifdef CONFIG_CMA_DEBUGFS
+	struct hlist_head mem_head;	/* allocations made through debugfs */
+	spinlock_t mem_head_lock;	/* protects mem_head */
+#endif
+};
+
+extern struct cma cma_areas[MAX_CMA_AREAS];
+extern unsigned cma_area_count;
+
+static inline unsigned long cma_bitmap_maxno(struct cma *cma)
+{
+	return cma->count >> cma->order_per_bit;
+}
+
+#endif
diff --git a/kernel/mm/cma_debug.c b/kernel/mm/cma_debug.c
new file mode 100644
index 000000000..7621ee34d
--- /dev/null
+++ b/kernel/mm/cma_debug.c
@@ -0,0 +1,205 @@
+/*
+ * CMA DebugFS Interface
+ *
+ * Copyright (c) 2015 Sasha Levin
+ */
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "cma.h"
+
+/* One allocation made through the debugfs "alloc" file. */
+struct cma_mem {
+	struct hlist_node node;
+	struct page *p;		/* first page of the allocation */
+	unsigned long n;	/* number of pages still held */
+};
+
+static struct dentry *cma_debugfs_root;
+
+/* Read an unsigned long field; @data must point at an unsigned long. */
+static int cma_debugfs_get(void *data, u64 *val)
+{
+	unsigned long *p = data;
+
+	*val = *p;
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
+
+/* Report the number of pages currently marked as used in the bitmap. */
+static int cma_used_get(void *data, u64 *val)
+{
+	struct cma *cma = data;
+	unsigned long used;
+
+	mutex_lock(&cma->lock);
+	/*
+	 * The bitmap holds cma_bitmap_maxno() bits, not cma->count: each bit
+	 * stands for (1 << order_per_bit) pages. The number of bits is
+	 * assumed to fit in an int.
+	 */
+	used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
+	mutex_unlock(&cma->lock);
+	*val = (u64)used << cma->order_per_bit;
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
+
+/* Report the size, in pages, of the largest free run in the bitmap. */
+static int cma_maxchunk_get(void *data, u64 *val)
+{
+	struct cma *cma = data;
+	unsigned long maxchunk = 0;
+	unsigned long start, end = 0;
+	/* Scan only the bits the bitmap really has (see cma_used_get()). */
+	unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
+
+	mutex_lock(&cma->lock);
+	for (;;) {
+		start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
+		if (start >= bitmap_maxno)
+			break;
+		end = find_next_bit(cma->bitmap, bitmap_maxno, start);
+		maxchunk = max(end - start, maxchunk);
+	}
+	mutex_unlock(&cma->lock);
+	*val = (u64)maxchunk << cma->order_per_bit;
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
+
+/* Push @mem onto the head of the area's list of debugfs allocations. */
+static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
+{
+	spin_lock(&cma->mem_head_lock);
+	hlist_add_head(&mem->node, &cma->mem_head);
+	spin_unlock(&cma->mem_head_lock);
+}
+
+/* Pop the most recently added entry, or return NULL if the list is empty. */
+static struct cma_mem *cma_get_entry_from_list(struct cma *cma)
+{
+	struct cma_mem *mem = NULL;
+
+	spin_lock(&cma->mem_head_lock);
+	if (!hlist_empty(&cma->mem_head)) {
+		mem = hlist_entry(cma->mem_head.first, struct cma_mem, node);
+		hlist_del_init(&mem->node);
+	}
+	spin_unlock(&cma->mem_head_lock);
+
+	return mem;
+}
+
+/*
+ * Release up to @count pages of debugfs allocations, newest entry first.
+ * An entry larger than the remaining @count is released partially only
+ * when order_per_bit is 0; otherwise it is put back and the loop stops.
+ * Always returns 0.
+ */
+static int cma_free_mem(struct cma *cma, int count)
+{
+	struct cma_mem *mem = NULL;
+
+	while (count) {
+		mem = cma_get_entry_from_list(cma);
+		if (mem == NULL)
+			return 0;
+
+		if (mem->n <= count) {
+			cma_release(cma, mem->p, mem->n);
+			count -= mem->n;
+			kfree(mem);
+		} else if (cma->order_per_bit == 0) {
+			cma_release(cma, mem->p, count);
+			mem->p += count;
+			mem->n -= count;
+			count = 0;
+			cma_add_to_cma_mem_list(cma, mem);
+		} else {
+			pr_debug("cma: cannot release partial block when order_per_bit != 0\n");
+			cma_add_to_cma_mem_list(cma, mem);
+			break;
+		}
+	}
+
+	return 0;
+
+}
+
+/* debugfs "free" write handler: @val is the number of pages to release. */
+static int cma_free_write(void *data, u64 val)
+{
+	int pages = val;
+	struct cma *cma = data;
+
+	return cma_free_mem(cma, pages);
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
+
+/*
+ * Allocate @count pages from @cma and remember them on the area's list so
+ * that a later write to "free" can release them. Returns 0 or -ENOMEM.
+ */
+static int cma_alloc_mem(struct cma *cma, int count)
+{
+	struct cma_mem *mem;
+	struct page *p;
+
+	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+	if (!mem)
+		return -ENOMEM;
+
+	p = cma_alloc(cma, count, 0);
+	if (!p) {
+		kfree(mem);
+		return -ENOMEM;
+	}
+
+	mem->p = p;
+	mem->n = count;
+
+	cma_add_to_cma_mem_list(cma, mem);
+
+	return 0;
+}
+
+/* debugfs "alloc" write handler: @val is the number of pages to allocate. */
+static int cma_alloc_write(void *data, u64 val)
+{
+	int pages = val;
+	struct cma *cma = data;
+
+	return cma_alloc_mem(cma, pages);
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
+
+/* Create the "cma-<idx>" directory and its files for one CMA area. */
+static void cma_debugfs_add_one(struct cma *cma, int idx)
+{
+	struct dentry *tmp;
+	char name[16];
+	int u32s;
+
+	snprintf(name, sizeof(name), "cma-%d", idx);
+
+	tmp = debugfs_create_dir(name, cma_debugfs_root);
+
+	/*
+	 * "alloc" and "free" act on this particular area, so they belong in
+	 * the per-area directory; in the shared root every area would try to
+	 * create the same two names.
+	 */
+	debugfs_create_file("alloc", S_IWUSR, tmp, cma,
+				&cma_alloc_fops);
+
+	debugfs_create_file("free", S_IWUSR, tmp, cma,
+				&cma_free_fops);
+
+	debugfs_create_file("base_pfn", S_IRUGO, tmp,
+				&cma->base_pfn, &cma_debugfs_fops);
+	debugfs_create_file("count", S_IRUGO, tmp,
+				&cma->count, &cma_debugfs_fops);
+	/*
+	 * order_per_bit is an unsigned int; cma_debugfs_get() would read it
+	 * through an unsigned long pointer, so export it as a u32 instead.
+	 */
+	debugfs_create_u32("order_per_bit", S_IRUGO, tmp,
+				&cma->order_per_bit);
+	debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops);
+	debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops);
+
+	/* Expose the raw bitmap as an array of u32 words. */
+	u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
+	debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
+}
+
+/* Create the top-level "cma" directory and one sub-directory per area. */
+static int __init cma_debugfs_init(void)
+{
+	int i;
+
+	cma_debugfs_root = debugfs_create_dir("cma", NULL);
+	if (!cma_debugfs_root)
+		return -ENOMEM;
+
+	for (i = 0; i < cma_area_count; i++)
+		cma_debugfs_add_one(&cma_areas[i], i);
+
+	return 0;
+}
+late_initcall(cma_debugfs_init);
diff --git
a/kernel/mm/compaction.c b/kernel/mm/compaction.c new file mode 100644 index 000000000..0af17fef6 --- /dev/null +++ b/kernel/mm/compaction.c @@ -0,0 +1,1719 @@ +/* + * linux/mm/compaction.c + * + * Memory compaction for the reduction of external fragmentation. Note that + * this heavily depends upon page migration to do all the real heavy + * lifting + * + * Copyright IBM Corp. 2007-2010 Mel Gorman + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#ifdef CONFIG_COMPACTION +static inline void count_compact_event(enum vm_event_item item) +{ + count_vm_event(item); +} + +static inline void count_compact_events(enum vm_event_item item, long delta) +{ + count_vm_events(item, delta); +} +#else +#define count_compact_event(item) do { } while (0) +#define count_compact_events(item, delta) do { } while (0) +#endif + +#if defined CONFIG_COMPACTION || defined CONFIG_CMA +#ifdef CONFIG_TRACEPOINTS +static const char *const compaction_status_string[] = { + "deferred", + "skipped", + "continue", + "partial", + "complete", + "no_suitable_page", + "not_suitable_zone", +}; +#endif + +#define CREATE_TRACE_POINTS +#include + +static unsigned long release_freepages(struct list_head *freelist) +{ + struct page *page, *next; + unsigned long high_pfn = 0; + + list_for_each_entry_safe(page, next, freelist, lru) { + unsigned long pfn = page_to_pfn(page); + list_del(&page->lru); + __free_page(page); + if (pfn > high_pfn) + high_pfn = pfn; + } + + return high_pfn; +} + +static void map_pages(struct list_head *list) +{ + struct page *page; + + list_for_each_entry(page, list, lru) { + arch_alloc_page(page, 0); + kernel_map_pages(page, 1, 1); + kasan_alloc_pages(page, 0); + } +} + +static inline bool migrate_async_suitable(int migratetype) +{ + return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; +} + +/* + * Check that the whole (or subset of) a pageblock given by the interval of + * [start_pfn, 
end_pfn) is valid and within the same zone, before scanning it
+ * with the migration or free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+				unsigned long end_pfn, struct zone *zone)
+{
+	struct page *start_page;
+	struct page *end_page;
+
+	/* end_pfn is one past the range we are checking */
+	end_pfn--;
+
+	if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+		return NULL;
+
+	start_page = pfn_to_page(start_pfn);
+
+	/* The first page must belong to the zone the caller is scanning. */
+	if (page_zone(start_page) != zone)
+		return NULL;
+
+	end_page = pfn_to_page(end_pfn);
+
+	/* This gives a shorter code than deriving page_zone(end_page) */
+	if (page_zone_id(start_page) != page_zone_id(end_page))
+		return NULL;
+
+	return start_page;
+}
+
+#ifdef CONFIG_COMPACTION
+
+/* Do not skip compaction more than 64 times */
+#define COMPACT_MAX_DEFER_SHIFT 6
+
+/*
+ * Compaction is deferred when compaction fails to result in a page
+ * allocation success. 1 << compact_defer_shift compactions are skipped up
+ * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
+ */
+void defer_compaction(struct zone *zone, int order)
+{
+	/* Restart the skip counter and lengthen the next deferral window. */
+	zone->compact_considered = 0;
+	zone->compact_defer_shift++;
+
+	/* Track the lowest order for which compaction has failed. */
+	if (order < zone->compact_order_failed)
+		zone->compact_order_failed = order;
+
+	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
+		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
+
+	trace_mm_compaction_defer_compaction(zone, order);
+}
+
+/* Returns true if compaction should be skipped this time */
+bool compaction_deferred(struct zone *zone, int order)
+{
+	unsigned long defer_limit = 1UL << zone->compact_defer_shift;
+
+	/* Orders below the recorded failing order are never deferred. */
+	if (order < zone->compact_order_failed)
+		return false;
+
+	/* Avoid possible overflow */
+	if (++zone->compact_considered > defer_limit)
+		zone->compact_considered = defer_limit;
+
+	/* Enough attempts have been skipped; let this one run. */
+	if (zone->compact_considered >= defer_limit)
+		return false;
+
+	trace_mm_compaction_deferred(zone, order);
+
+	return true;
+}
+
+/*
+ * Update defer tracking counters after successful compaction of given order,
+ * which means an allocation either succeeded (alloc_success == true) or is
+ * expected to succeed.
+ */
+void compaction_defer_reset(struct zone *zone, int order,
+		bool alloc_success)
+{
+	/* Only a real allocation success clears the deferral state. */
+	if (alloc_success) {
+		zone->compact_considered = 0;
+		zone->compact_defer_shift = 0;
+	}
+	/* This order worked, so the failing order must be above it. */
+	if (order >= zone->compact_order_failed)
+		zone->compact_order_failed = order + 1;
+
+	trace_mm_compaction_defer_reset(zone, order);
+}
+
+/* Returns true if restarting compaction after many failures */
+bool compaction_restarting(struct zone *zone, int order)
+{
+	if (order < zone->compact_order_failed)
+		return false;
+
+	/* True once the maximum deferral window has been fully consumed. */
+	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
+		zone->compact_considered >= 1UL << zone->compact_defer_shift;
+}
+
+/* Returns true if the pageblock should be scanned for pages to isolate.
*/ +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + if (cc->ignore_skip_hint) + return true; + + return !get_pageblock_skip(page); +} + +/* + * This function is called to clear all cached information on pageblocks that + * should be skipped for page isolation when the migrate and free page scanner + * meet. + */ +static void __reset_isolation_suitable(struct zone *zone) +{ + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); + unsigned long pfn; + + zone->compact_cached_migrate_pfn[0] = start_pfn; + zone->compact_cached_migrate_pfn[1] = start_pfn; + zone->compact_cached_free_pfn = end_pfn; + zone->compact_blockskip_flush = false; + + /* Walk the zone and mark every pageblock as suitable for isolation */ + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + struct page *page; + + cond_resched(); + + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + continue; + + clear_pageblock_skip(page); + } +} + +void reset_isolation_suitable(pg_data_t *pgdat) +{ + int zoneid; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Only flush if a full compaction finished recently */ + if (zone->compact_blockskip_flush) + __reset_isolation_suitable(zone); + } +} + +/* + * If no pages were isolated then mark this pageblock to be skipped in the + * future. The information is later cleared by __reset_isolation_suitable(). 
+ */ +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) +{ + struct zone *zone = cc->zone; + unsigned long pfn; + + if (cc->ignore_skip_hint) + return; + + if (!page) + return; + + if (nr_isolated) + return; + + set_pageblock_skip(page); + + pfn = page_to_pfn(page); + + /* Update where async and sync compaction should restart */ + if (migrate_scanner) { + if (pfn > zone->compact_cached_migrate_pfn[0]) + zone->compact_cached_migrate_pfn[0] = pfn; + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) + zone->compact_cached_migrate_pfn[1] = pfn; + } else { + if (pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; + } +} +#else +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + return true; +} + +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long nr_isolated, + bool migrate_scanner) +{ +} +#endif /* CONFIG_COMPACTION */ + +/* + * Compaction requires the taking of some coarse locks that are potentially + * very heavily contended. For async compaction, back out if the lock cannot + * be taken immediately. For sync compaction, spin on the lock if needed. + * + * Returns true if the lock is held + * Returns false if the lock is not held and compaction should abort + */ +static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, + struct compact_control *cc) +{ + if (cc->mode == MIGRATE_ASYNC) { + if (!spin_trylock_irqsave(lock, *flags)) { + cc->contended = COMPACT_CONTENDED_LOCK; + return false; + } + } else { + spin_lock_irqsave(lock, *flags); + } + + return true; +} + +/* + * Compaction requires the taking of some coarse locks that are potentially + * very heavily contended. The lock should be periodically unlocked to avoid + * having disabled IRQs for a long time, even when there is nobody waiting on + * the lock. 
It might also be that allowing the IRQs will result in + * need_resched() becoming true. If scheduling is needed, async compaction + * aborts. Sync compaction schedules. + * Either compaction type will also abort if a fatal signal is pending. + * In either case if the lock was locked, it is dropped and not regained. + * + * Returns true if compaction should abort due to fatal signal pending, or + * async compaction due to need_resched() + * Returns false when compaction can continue (sync compaction might have + * scheduled) + */ +static bool compact_unlock_should_abort(spinlock_t *lock, + unsigned long flags, bool *locked, struct compact_control *cc) +{ + if (*locked) { + spin_unlock_irqrestore(lock, flags); + *locked = false; + } + + if (fatal_signal_pending(current)) { + cc->contended = COMPACT_CONTENDED_SCHED; + return true; + } + + if (need_resched()) { + if (cc->mode == MIGRATE_ASYNC) { + cc->contended = COMPACT_CONTENDED_SCHED; + return true; + } + cond_resched(); + } + + return false; +} + +/* + * Aside from avoiding lock contention, compaction also periodically checks + * need_resched() and either schedules in sync compaction or aborts async + * compaction. This is similar to what compact_unlock_should_abort() does, but + * is used where no lock is concerned. + * + * Returns false when no scheduling was needed, or sync compaction scheduled. + * Returns true when async compaction should abort. + */ +static inline bool compact_should_abort(struct compact_control *cc) +{ + /* async compaction aborts if contended */ + if (need_resched()) { + if (cc->mode == MIGRATE_ASYNC) { + cc->contended = COMPACT_CONTENDED_SCHED; + return true; + } + + cond_resched(); + } + + return false; +} + +/* + * Isolate free pages onto a private freelist. If @strict is true, will abort + * returning 0 on any invalid PFNs or non-free pages inside of the pageblock + * (even though it may still end up isolating some pages). 
+ */ +static unsigned long isolate_freepages_block(struct compact_control *cc, + unsigned long *start_pfn, + unsigned long end_pfn, + struct list_head *freelist, + bool strict) +{ + int nr_scanned = 0, total_isolated = 0; + struct page *cursor, *valid_page = NULL; + unsigned long flags = 0; + bool locked = false; + unsigned long blockpfn = *start_pfn; + + cursor = pfn_to_page(blockpfn); + + /* Isolate free pages. */ + for (; blockpfn < end_pfn; blockpfn++, cursor++) { + int isolated, i; + struct page *page = cursor; + + /* + * Periodically drop the lock (if held) regardless of its + * contention, to give chance to IRQs. Abort if fatal signal + * pending or async compaction detects need_resched() + */ + if (!(blockpfn % SWAP_CLUSTER_MAX) + && compact_unlock_should_abort(&cc->zone->lock, flags, + &locked, cc)) + break; + + nr_scanned++; + if (!pfn_valid_within(blockpfn)) + goto isolate_fail; + + if (!valid_page) + valid_page = page; + if (!PageBuddy(page)) + goto isolate_fail; + + /* + * If we already hold the lock, we can skip some rechecking. + * Note that if we hold the lock now, checked_pageblock was + * already set in some previous iteration (or strict is true), + * so it is correct to skip the suitable migration target + * recheck as well. + */ + if (!locked) { + /* + * The zone lock must be held to isolate freepages. + * Unfortunately this is a very coarse lock and can be + * heavily contended if there are parallel allocations + * or parallel compactions. For async compaction do not + * spin on the lock and we acquire the lock as late as + * possible. 
+ */ + locked = compact_trylock_irqsave(&cc->zone->lock, + &flags, cc); + if (!locked) + break; + + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) + goto isolate_fail; + } + + /* Found a free page, break it into order-0 pages */ + isolated = split_free_page(page); + total_isolated += isolated; + for (i = 0; i < isolated; i++) { + list_add(&page->lru, freelist); + page++; + } + + /* If a page was split, advance to the end of it */ + if (isolated) { + cc->nr_freepages += isolated; + if (!strict && + cc->nr_migratepages <= cc->nr_freepages) { + blockpfn += isolated; + break; + } + + blockpfn += isolated - 1; + cursor += isolated - 1; + continue; + } + +isolate_fail: + if (strict) + break; + else + continue; + + } + + trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, + nr_scanned, total_isolated); + + /* Record how far we have got within the block */ + *start_pfn = blockpfn; + + /* + * If strict isolation is requested by CMA then check that all the + * pages requested were isolated. If there were any failures, 0 is + * returned and CMA will fail. + */ + if (strict && blockpfn < end_pfn) + total_isolated = 0; + + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); + + /* Update the pageblock-skip if the whole pageblock was scanned */ + if (blockpfn == end_pfn) + update_pageblock_skip(cc, valid_page, total_isolated, false); + + count_compact_events(COMPACTFREE_SCANNED, nr_scanned); + if (total_isolated) + count_compact_events(COMPACTISOLATED, total_isolated); + return total_isolated; +} + +/** + * isolate_freepages_range() - isolate free pages. + * @start_pfn: The first PFN to start isolating. + * @end_pfn: The one-past-last PFN. + * + * Non-free pages, invalid PFNs, or zone boundaries within the + * [start_pfn, end_pfn) range are considered errors, cause function to + * undo its actions and return zero. 
+ *
+ * Otherwise, function returns one-past-the-last PFN of isolated page
+ * (which may be greater than end_pfn if end fell in a middle of
+ * a free page).
+ */
+unsigned long
+isolate_freepages_range(struct compact_control *cc,
+			unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long isolated, pfn, block_end_pfn;
+	LIST_HEAD(freelist);
+
+	pfn = start_pfn;
+	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+
+	/* Walk the range one pageblock at a time, in strict mode. */
+	for (; pfn < end_pfn; pfn += isolated,
+				block_end_pfn += pageblock_nr_pages) {
+		/* Protect pfn from changing by isolate_freepages_block */
+		unsigned long isolate_start_pfn = pfn;
+
+		block_end_pfn = min(block_end_pfn, end_pfn);
+
+		/*
+		 * pfn could pass the block_end_pfn if isolated freepage
+		 * is more than pageblock order. In this case, we adjust
+		 * scanning range to right one.
+		 */
+		if (pfn >= block_end_pfn) {
+			block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+			block_end_pfn = min(block_end_pfn, end_pfn);
+		}
+
+		if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+			break;
+
+		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+						block_end_pfn, &freelist, true);
+
+		/*
+		 * In strict mode, isolate_freepages_block() returns 0 if
+		 * there are any holes in the block (ie. invalid PFNs or
+		 * non-free pages).
+		 */
+		if (!isolated)
+			break;
+
+		/*
+		 * If we managed to isolate pages, it is always (1 << n) *
+		 * pageblock_nr_pages for some non-negative n. (Max order
+		 * page may span two pageblocks).
+		 */
+	}
+
+	/* split_free_page does not map the pages */
+	map_pages(&freelist);
+
+	if (pfn < end_pfn) {
+		/* Loop terminated early, cleanup. */
+		release_freepages(&freelist);
+		return 0;
+	}
+
+	/* We don't use freelists for anything. */
+	return pfn;
+}
+
+/* Update the number of anon and file isolated pages in the zone */
+static void acct_isolated(struct zone *zone, struct compact_control *cc)
+{
+	struct page *page;
+	/* count[0]: anon pages, count[1]: file cache pages */
+	unsigned int count[2] = { 0, };
+
+	if (list_empty(&cc->migratepages))
+		return;
+
+	list_for_each_entry(page, &cc->migratepages, lru)
+		count[!!page_is_file_cache(page)]++;
+
+	mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+	mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
+}
+
+/* Similar to reclaim, but different enough that they don't share logic */
+static bool too_many_isolated(struct zone *zone)
+{
+	unsigned long active, inactive, isolated;
+
+	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
+					zone_page_state(zone, NR_INACTIVE_ANON);
+	active = zone_page_state(zone, NR_ACTIVE_FILE) +
+					zone_page_state(zone, NR_ACTIVE_ANON);
+	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
+					zone_page_state(zone, NR_ISOLATED_ANON);
+
+	/* "Too many" means more than half of the pages on the LRU lists. */
+	return isolated > (inactive + active) / 2;
+}
+
+/**
+ * isolate_migratepages_block() - isolate all migrate-able pages within
+ *				  a single pageblock
+ * @cc: Compaction control structure.
+ * @low_pfn: The first PFN to isolate
+ * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
+ * @isolate_mode: Isolation mode to be used.
+ *
+ * Isolate all pages that can be migrated from the range specified by
+ * [low_pfn, end_pfn). The range is expected to be within same pageblock.
+ * Returns zero if there is a fatal signal pending, otherwise PFN of the
+ * first page that was not scanned (which may be both less, equal to or more
+ * than end_pfn).
+ *
+ * The pages are isolated on cc->migratepages list (not required to be empty),
+ * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
+ * is neither read nor updated.
+ */ +static unsigned long +isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, + unsigned long end_pfn, isolate_mode_t isolate_mode) +{ + struct zone *zone = cc->zone; + unsigned long nr_scanned = 0, nr_isolated = 0; + struct list_head *migratelist = &cc->migratepages; + struct lruvec *lruvec; + unsigned long flags = 0; + bool locked = false; + struct page *page = NULL, *valid_page = NULL; + unsigned long start_pfn = low_pfn; + + /* + * Ensure that there are not too many pages isolated from the LRU + * list by either parallel reclaimers or compaction. If there are, + * delay for some time until fewer pages are isolated + */ + while (unlikely(too_many_isolated(zone))) { + /* async migration should just abort */ + if (cc->mode == MIGRATE_ASYNC) + return 0; + + congestion_wait(BLK_RW_ASYNC, HZ/10); + + if (fatal_signal_pending(current)) + return 0; + } + + if (compact_should_abort(cc)) + return 0; + + /* Time to isolate some pages for migration */ + for (; low_pfn < end_pfn; low_pfn++) { + /* + * Periodically drop the lock (if held) regardless of its + * contention, to give chance to IRQs. Abort async compaction + * if contended. + */ + if (!(low_pfn % SWAP_CLUSTER_MAX) + && compact_unlock_should_abort(&zone->lru_lock, flags, + &locked, cc)) + break; + + if (!pfn_valid_within(low_pfn)) + continue; + nr_scanned++; + + page = pfn_to_page(low_pfn); + + if (!valid_page) + valid_page = page; + + /* + * Skip if free. We read page order here without zone lock + * which is generally unsafe, but the race window is small and + * the worst thing that can happen is that we skip some + * potential isolation targets. + */ + if (PageBuddy(page)) { + unsigned long freepage_order = page_order_unsafe(page); + + /* + * Without lock, we cannot be sure that what we got is + * a valid page order. Consider only values in the + * valid order range to prevent low_pfn overflow. 
+ */ + if (freepage_order > 0 && freepage_order < MAX_ORDER) + low_pfn += (1UL << freepage_order) - 1; + continue; + } + + /* + * Check may be lockless but that's ok as we recheck later. + * It's possible to migrate LRU pages and balloon pages + * Skip any other type of page + */ + if (!PageLRU(page)) { + if (unlikely(balloon_page_movable(page))) { + if (balloon_page_isolate(page)) { + /* Successfully isolated */ + goto isolate_success; + } + } + continue; + } + + /* + * PageLRU is set. lru_lock normally excludes isolation + * splitting and collapsing (collapsing has already happened + * if PageLRU is set) but the lock is not necessarily taken + * here and it is wasteful to take it just to check transhuge. + * Check TransHuge without lock and skip the whole pageblock if + * it's either a transhuge or hugetlbfs page, as calling + * compound_order() without preventing THP from splitting the + * page underneath us may return surprising results. + */ + if (PageTransHuge(page)) { + if (!locked) + low_pfn = ALIGN(low_pfn + 1, + pageblock_nr_pages) - 1; + else + low_pfn += (1 << compound_order(page)) - 1; + + continue; + } + + /* + * Migration will fail if an anonymous page is pinned in memory, + * so avoid taking lru_lock and isolating it unnecessarily in an + * admittedly racy check. 
+ */ + if (!page_mapping(page) && + page_count(page) > page_mapcount(page)) + continue; + + /* If we already hold the lock, we can skip some rechecking */ + if (!locked) { + locked = compact_trylock_irqsave(&zone->lru_lock, + &flags, cc); + if (!locked) + break; + + /* Recheck PageLRU and PageTransHuge under lock */ + if (!PageLRU(page)) + continue; + if (PageTransHuge(page)) { + low_pfn += (1 << compound_order(page)) - 1; + continue; + } + } + + lruvec = mem_cgroup_page_lruvec(page, zone); + + /* Try isolate the page */ + if (__isolate_lru_page(page, isolate_mode) != 0) + continue; + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + /* Successfully isolated */ + del_page_from_lru_list(page, lruvec, page_lru(page)); + +isolate_success: + list_add(&page->lru, migratelist); + cc->nr_migratepages++; + nr_isolated++; + + /* Avoid isolating too much */ + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { + ++low_pfn; + break; + } + } + + /* + * The PageBuddy() check could have potentially brought us outside + * the range to be scanned. + */ + if (unlikely(low_pfn > end_pfn)) + low_pfn = end_pfn; + + if (locked) + spin_unlock_irqrestore(&zone->lru_lock, flags); + + /* + * Update the pageblock-skip information and cached scanner pfn, + * if the whole pageblock was scanned without isolating any page. + */ + if (low_pfn == end_pfn) + update_pageblock_skip(cc, valid_page, nr_isolated, true); + + trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, + nr_scanned, nr_isolated); + + count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); + if (nr_isolated) + count_compact_events(COMPACTISOLATED, nr_isolated); + + return low_pfn; +} + +/** + * isolate_migratepages_range() - isolate migrate-able pages in a PFN range + * @cc: Compaction control structure. + * @start_pfn: The first PFN to start isolating. + * @end_pfn: The one-past-last PFN. + * + * Returns zero if isolation fails fatally due to e.g. pending signal. 
+ * Otherwise, function returns one-past-the-last PFN of isolated page + * (which may be greater than end_pfn if end fell in a middle of a THP page). + */ +unsigned long +isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn, block_end_pfn; + + /* Scan block by block. First and last block may be incomplete */ + pfn = start_pfn; + block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + + for (; pfn < end_pfn; pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, end_pfn); + + if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + continue; + + pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, + ISOLATE_UNEVICTABLE); + + /* + * In case of fatal failure, release everything that might + * have been isolated in the previous iteration, and signal + * the failure back to caller. + */ + if (!pfn) { + putback_movable_pages(&cc->migratepages); + cc->nr_migratepages = 0; + break; + } + + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + break; + } + acct_isolated(cc->zone, cc); + + return pfn; +} + +#endif /* CONFIG_COMPACTION || CONFIG_CMA */ +#ifdef CONFIG_COMPACTION + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + /* If the page is a large free page, then disallow migration */ + if (PageBuddy(page)) { + /* + * We are checking page_order without zone->lock taken. But + * the only small danger is that we skip a potentially suitable + * pageblock, so it's not worth to check order for valid range. 
+ */ + if (page_order_unsafe(page) >= pageblock_order) + return false; + } + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (migrate_async_suitable(get_pageblock_migratetype(page))) + return true; + + /* Otherwise skip the block */ + return false; +} + +/* + * Based on information in the current compact_control, find blocks + * suitable for isolating free pages from and then isolate them. + */ +static void isolate_freepages(struct compact_control *cc) +{ + struct zone *zone = cc->zone; + struct page *page; + unsigned long block_start_pfn; /* start of current pageblock */ + unsigned long isolate_start_pfn; /* exact pfn we start at */ + unsigned long block_end_pfn; /* end of current pageblock */ + unsigned long low_pfn; /* lowest pfn scanner is able to scan */ + struct list_head *freelist = &cc->freepages; + + /* + * Initialise the free scanner. The starting point is where we last + * successfully isolated from, zone-cached value, or the end of the + * zone when isolating for the first time. For looping we also need + * this pfn aligned down to the pageblock boundary, because we do + * block_start_pfn -= pageblock_nr_pages in the for loop. + * For ending point, take care when isolating in last pageblock of a + * a zone which ends in the middle of a pageblock. + * The low boundary is the end of the pageblock the migration scanner + * is using. + */ + isolate_start_pfn = cc->free_pfn; + block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_end_pfn = min(block_start_pfn + pageblock_nr_pages, + zone_end_pfn(zone)); + low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); + + /* + * Isolate free pages until enough are available to migrate the + * pages on cc->migratepages. We stop searching if the migrate + * and free page scanners meet or enough free pages are isolated. 
+ */ + for (; block_start_pfn >= low_pfn && + cc->nr_migratepages > cc->nr_freepages; + block_end_pfn = block_start_pfn, + block_start_pfn -= pageblock_nr_pages, + isolate_start_pfn = block_start_pfn) { + + /* + * This can iterate a massively long zone without finding any + * suitable migration targets, so periodically check if we need + * to schedule, or even abort async compaction. + */ + if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) + && compact_should_abort(cc)) + break; + + page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, + zone); + if (!page) + continue; + + /* Check the block is suitable for migration */ + if (!suitable_migration_target(page)) + continue; + + /* If isolation recently failed, do not retry */ + if (!isolation_suitable(cc, page)) + continue; + + /* Found a block suitable for isolating free pages from. */ + isolate_freepages_block(cc, &isolate_start_pfn, + block_end_pfn, freelist, false); + + /* + * Remember where the free scanner should restart next time, + * which is where isolate_freepages_block() left off. + * But if it scanned the whole pageblock, isolate_start_pfn + * now points at block_end_pfn, which is the start of the next + * pageblock. + * In that case we will however want to restart at the start + * of the previous pageblock. + */ + cc->free_pfn = (isolate_start_pfn < block_end_pfn) ? + isolate_start_pfn : + block_start_pfn - pageblock_nr_pages; + + /* + * isolate_freepages_block() might have aborted due to async + * compaction being contended + */ + if (cc->contended) + break; + } + + /* split_free_page does not map the pages */ + map_pages(freelist); + + /* + * If we crossed the migrate scanner, we want to keep it that way + * so that compact_finished() may detect this + */ + if (block_start_pfn < low_pfn) + cc->free_pfn = cc->migrate_pfn; +} + +/* + * This is a migrate-callback that "allocates" freepages by taking pages + * from the isolated freelists in the block we are migrating to. 
+ */ +static struct page *compaction_alloc(struct page *migratepage, + unsigned long data, + int **result) +{ + struct compact_control *cc = (struct compact_control *)data; + struct page *freepage; + + /* + * Isolate free pages if necessary, and if we are not aborting due to + * contention. + */ + if (list_empty(&cc->freepages)) { + if (!cc->contended) + isolate_freepages(cc); + + if (list_empty(&cc->freepages)) + return NULL; + } + + freepage = list_entry(cc->freepages.next, struct page, lru); + list_del(&freepage->lru); + cc->nr_freepages--; + + return freepage; +} + +/* + * This is a migrate-callback that "frees" freepages back to the isolated + * freelist. All pages on the freelist are from the same zone, so there is no + * special handling needed for NUMA. + */ +static void compaction_free(struct page *page, unsigned long data) +{ + struct compact_control *cc = (struct compact_control *)data; + + list_add(&page->lru, &cc->freepages); + cc->nr_freepages++; +} + +/* possible outcome of isolate_migratepages */ +typedef enum { + ISOLATE_ABORT, /* Abort compaction now */ + ISOLATE_NONE, /* No pages isolated, continue scanning */ + ISOLATE_SUCCESS, /* Pages isolated, migrate */ +} isolate_migrate_t; + +/* + * Allow userspace to control policy on scanning the unevictable LRU for + * compactable pages. + */ +int sysctl_compact_unevictable_allowed __read_mostly = 1; + +/* + * Isolate all pages that can be migrated from the first suitable block, + * starting at the block pointed to by the migrate scanner pfn within + * compact_control. + */ +static isolate_migrate_t isolate_migratepages(struct zone *zone, + struct compact_control *cc) +{ + unsigned long low_pfn, end_pfn; + struct page *page; + const isolate_mode_t isolate_mode = + (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | + (cc->mode == MIGRATE_ASYNC ? 
ISOLATE_ASYNC_MIGRATE : 0); + + /* + * Start at where we last stopped, or beginning of the zone as + * initialized by compact_zone() + */ + low_pfn = cc->migrate_pfn; + + /* Only scan within a pageblock boundary */ + end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + + /* + * Iterate over whole pageblocks until we find the first suitable. + * Do not cross the free scanner. + */ + for (; end_pfn <= cc->free_pfn; + low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { + + /* + * This can potentially iterate a massively long zone with + * many pageblocks unsuitable, so periodically check if we + * need to schedule, or even abort async compaction. + */ + if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) + && compact_should_abort(cc)) + break; + + page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); + if (!page) + continue; + + /* If isolation recently failed, do not retry */ + if (!isolation_suitable(cc, page)) + continue; + + /* + * For async compaction, also only scan in MOVABLE blocks. + * Async compaction is optimistic to see if the minimum amount + * of work satisfies the allocation. + */ + if (cc->mode == MIGRATE_ASYNC && + !migrate_async_suitable(get_pageblock_migratetype(page))) + continue; + + /* Perform the isolation */ + low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, + isolate_mode); + + if (!low_pfn || cc->contended) { + acct_isolated(zone, cc); + return ISOLATE_ABORT; + } + + /* + * Either we isolated something and proceed with migration. Or + * we failed and compact_zone should decide if we should + * continue or not. + */ + break; + } + + acct_isolated(zone, cc); + /* + * Record where migration scanner will be restarted. If we end up in + * the same pageblock as the free scanner, make the scanners fully + * meet so that compact_finished() terminates compaction. + */ + cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn; + + return cc->nr_migratepages ? 
ISOLATE_SUCCESS : ISOLATE_NONE; +} + +static int __compact_finished(struct zone *zone, struct compact_control *cc, + const int migratetype) +{ + unsigned int order; + unsigned long watermark; + + if (cc->contended || fatal_signal_pending(current)) + return COMPACT_PARTIAL; + + /* Compaction run completes if the migrate and free scanner meet */ + if (cc->free_pfn <= cc->migrate_pfn) { + /* Let the next compaction start anew. */ + zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; + zone->compact_cached_free_pfn = zone_end_pfn(zone); + + /* + * Mark that the PG_migrate_skip information should be cleared + * by kswapd when it goes to sleep. kswapd does not set the + * flag itself as the decision to be clear should be directly + * based on an allocation request. + */ + if (!current_is_kswapd()) + zone->compact_blockskip_flush = true; + + return COMPACT_COMPLETE; + } + + /* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ + if (cc->order == -1) + return COMPACT_CONTINUE; + + /* Compaction run is not finished if the watermark is not met */ + watermark = low_wmark_pages(zone); + + if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, + cc->alloc_flags)) + return COMPACT_CONTINUE; + + /* Direct compactor: Is a suitable page free? */ + for (order = cc->order; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[order]; + bool can_steal; + + /* Job done if page is free of the right migratetype */ + if (!list_empty(&area->free_list[migratetype])) + return COMPACT_PARTIAL; + +#ifdef CONFIG_CMA + /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ + if (migratetype == MIGRATE_MOVABLE && + !list_empty(&area->free_list[MIGRATE_CMA])) + return COMPACT_PARTIAL; +#endif + /* + * Job done if allocation would steal freepages from + * other migratetype buddy lists. 
+ */ + if (find_suitable_fallback(area, order, migratetype, + true, &can_steal) != -1) + return COMPACT_PARTIAL; + } + + return COMPACT_NO_SUITABLE_PAGE; +} + +static int compact_finished(struct zone *zone, struct compact_control *cc, + const int migratetype) +{ + int ret; + + ret = __compact_finished(zone, cc, migratetype); + trace_mm_compaction_finished(zone, cc->order, ret); + if (ret == COMPACT_NO_SUITABLE_PAGE) + ret = COMPACT_CONTINUE; + + return ret; +} + +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? + * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ +static unsigned long __compaction_suitable(struct zone *zone, int order, + int alloc_flags, int classzone_idx) +{ + int fragindex; + unsigned long watermark; + + /* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ + if (order == -1) + return COMPACT_CONTINUE; + + watermark = low_wmark_pages(zone); + /* + * If watermarks for high-order allocation are already met, there + * should be no need for compaction at all. + */ + if (zone_watermark_ok(zone, order, watermark, classzone_idx, + alloc_flags)) + return COMPACT_PARTIAL; + + /* + * Watermarks for order-0 must be met for compaction. Note the 2UL. 
+ * This is because during migration, copies of pages need to be + * allocated and for a short time, the footprint is higher + */ + watermark += (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) + return COMPACT_SKIPPED; + + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1000 would imply allocations might succeed depending on + * watermarks, but we already failed the high-order watermark check + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + return COMPACT_NOT_SUITABLE_ZONE; + + return COMPACT_CONTINUE; +} + +unsigned long compaction_suitable(struct zone *zone, int order, + int alloc_flags, int classzone_idx) +{ + unsigned long ret; + + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); + trace_mm_compaction_suitable(zone, order, ret); + if (ret == COMPACT_NOT_SUITABLE_ZONE) + ret = COMPACT_SKIPPED; + + return ret; +} + +static int compact_zone(struct zone *zone, struct compact_control *cc) +{ + int ret; + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); + const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); + const bool sync = cc->mode != MIGRATE_ASYNC; + unsigned long last_migrated_pfn = 0; + + ret = compaction_suitable(zone, cc->order, cc->alloc_flags, + cc->classzone_idx); + switch (ret) { + case COMPACT_PARTIAL: + case COMPACT_SKIPPED: + /* Compaction is likely to fail */ + return ret; + case COMPACT_CONTINUE: + /* Fall through to compaction */ + ; + } + + /* + * Clear pageblock skip if there were failures recently and compaction + * is about to be retried after being deferred. 
kswapd does not do + * this reset as it'll reset the cached information when going to sleep. + */ + if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + __reset_isolation_suitable(zone); + + /* + * Setup to move all movable pages to the end of the zone. Used cached + * information on where the scanners should start but check that it + * is initialised by ensuring the values are within zone boundaries. + */ + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; + cc->free_pfn = zone->compact_cached_free_pfn; + if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { + cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + zone->compact_cached_free_pfn = cc->free_pfn; + } + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + cc->migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; + } + + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, + cc->free_pfn, end_pfn, sync); + + migrate_prep_local(); + + while ((ret = compact_finished(zone, cc, migratetype)) == + COMPACT_CONTINUE) { + int err; + unsigned long isolate_start_pfn = cc->migrate_pfn; + + switch (isolate_migratepages(zone, cc)) { + case ISOLATE_ABORT: + ret = COMPACT_PARTIAL; + putback_movable_pages(&cc->migratepages); + cc->nr_migratepages = 0; + goto out; + case ISOLATE_NONE: + /* + * We haven't isolated and migrated anything, but + * there might still be unflushed migrations from + * previous cc->order aligned block. 
+ */ + goto check_drain; + case ISOLATE_SUCCESS: + ; + } + + err = migrate_pages(&cc->migratepages, compaction_alloc, + compaction_free, (unsigned long)cc, cc->mode, + MR_COMPACTION); + + trace_mm_compaction_migratepages(cc->nr_migratepages, err, + &cc->migratepages); + + /* All pages were either migrated or will be released */ + cc->nr_migratepages = 0; + if (err) { + putback_movable_pages(&cc->migratepages); + /* + * migrate_pages() may return -ENOMEM when scanners meet + * and we want compact_finished() to detect it + */ + if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { + ret = COMPACT_PARTIAL; + goto out; + } + } + + /* + * Record where we could have freed pages by migration and not + * yet flushed them to buddy allocator. We use the pfn that + * isolate_migratepages() started from in this loop iteration + * - this is the lowest page that could have been isolated and + * then freed by migration. + */ + if (!last_migrated_pfn) + last_migrated_pfn = isolate_start_pfn; + +check_drain: + /* + * Has the migration scanner moved away from the previous + * cc->order aligned block where we migrated from? If yes, + * flush the pages that were freed, so that they can merge and + * compact_finished() can detect immediately if allocation + * would succeed. + */ + if (cc->order > 0 && last_migrated_pfn) { + int cpu; + unsigned long current_block_start = + cc->migrate_pfn & ~((1UL << cc->order) - 1); + + if (last_migrated_pfn < current_block_start) { + cpu = get_cpu_light(); + local_lock_irq(swapvec_lock); + lru_add_drain_cpu(cpu); + local_unlock_irq(swapvec_lock); + drain_local_pages(zone); + put_cpu_light(); + /* No more flushing until we migrate again */ + last_migrated_pfn = 0; + } + } + + } + +out: + /* + * Release free pages and update where the free scanner should restart, + * so we don't leave any returned pages behind in the next attempt. 
+ */ + if (cc->nr_freepages > 0) { + unsigned long free_pfn = release_freepages(&cc->freepages); + + cc->nr_freepages = 0; + VM_BUG_ON(free_pfn == 0); + /* The cached pfn is always the first in a pageblock */ + free_pfn &= ~(pageblock_nr_pages-1); + /* + * Only go back, not forward. The cached pfn might have been + * already reset to zone end in compact_finished() + */ + if (free_pfn > zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = free_pfn; + } + + trace_mm_compaction_end(start_pfn, cc->migrate_pfn, + cc->free_pfn, end_pfn, sync, ret); + + return ret; +} + +static unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, enum migrate_mode mode, int *contended, + int alloc_flags, int classzone_idx) +{ + unsigned long ret; + struct compact_control cc = { + .nr_freepages = 0, + .nr_migratepages = 0, + .order = order, + .gfp_mask = gfp_mask, + .zone = zone, + .mode = mode, + .alloc_flags = alloc_flags, + .classzone_idx = classzone_idx, + }; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + ret = compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + + *contended = cc.contended; + return ret; +} + +int sysctl_extfrag_threshold = 500; + +/** + * try_to_compact_pages - Direct compact to satisfy a high-order allocation + * @gfp_mask: The GFP mask of the current allocation + * @order: The order of the current allocation + * @alloc_flags: The allocation flags of the current allocation + * @ac: The context of current allocation + * @mode: The migration mode for async, sync light, or sync migration + * @contended: Return value that determines if compaction was aborted due to + * need_resched() or lock contention + * + * This is the main entry point for direct page compaction. 
+ */ +unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) +{ + int may_enter_fs = gfp_mask & __GFP_FS; + int may_perform_io = gfp_mask & __GFP_IO; + struct zoneref *z; + struct zone *zone; + int rc = COMPACT_DEFERRED; + int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ + + *contended = COMPACT_CONTENDED_NONE; + + /* Check if the GFP flags allow compaction */ + if (!order || !may_enter_fs || !may_perform_io) + return COMPACT_SKIPPED; + + trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); + + /* Compact each zone in the list */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + int status; + int zone_contended; + + if (compaction_deferred(zone, order)) + continue; + + status = compact_zone_order(zone, order, gfp_mask, mode, + &zone_contended, alloc_flags, + ac->classzone_idx); + rc = max(status, rc); + /* + * It takes at least one zone that wasn't lock contended + * to clear all_zones_contended. + */ + all_zones_contended &= zone_contended; + + /* If a normal allocation would succeed, stop compacting */ + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), + ac->classzone_idx, alloc_flags)) { + /* + * We think the allocation will succeed in this zone, + * but it is not certain, hence the false. The caller + * will repeat this with true if allocation indeed + * succeeds in this zone. + */ + compaction_defer_reset(zone, order, false); + /* + * It is possible that async compaction aborted due to + * need_resched() and the watermarks were ok thanks to + * somebody else freeing memory. The allocation can + * however still fail so we better signal the + * need_resched() contention anyway (this will not + * prevent the allocation attempt). 
+ */ + if (zone_contended == COMPACT_CONTENDED_SCHED) + *contended = COMPACT_CONTENDED_SCHED; + + goto break_loop; + } + + if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { + /* + * We think that allocation won't succeed in this zone + * so we defer compaction there. If it ends up + * succeeding after all, it will be reset. + */ + defer_compaction(zone, order); + } + + /* + * We might have stopped compacting due to need_resched() in + * async compaction, or due to a fatal signal detected. In that + * case do not try further zones and signal need_resched() + * contention. + */ + if ((zone_contended == COMPACT_CONTENDED_SCHED) + || fatal_signal_pending(current)) { + *contended = COMPACT_CONTENDED_SCHED; + goto break_loop; + } + + continue; +break_loop: + /* + * We might not have tried all the zones, so be conservative + * and assume they are not all lock contended. + */ + all_zones_contended = 0; + break; + } + + /* + * If at least one zone wasn't deferred or skipped, we report if all + * zones that were tried were lock contended. + */ + if (rc > COMPACT_SKIPPED && all_zones_contended) + *contended = COMPACT_CONTENDED_LOCK; + + return rc; +} + + +/* Compact all zones within a node */ +static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) +{ + int zoneid; + struct zone *zone; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc->nr_freepages = 0; + cc->nr_migratepages = 0; + cc->zone = zone; + INIT_LIST_HEAD(&cc->freepages); + INIT_LIST_HEAD(&cc->migratepages); + + /* + * When called via /proc/sys/vm/compact_memory + * this makes sure we compact the whole zone regardless of + * cached scanner positions. 
+ */ + if (cc->order == -1) + __reset_isolation_suitable(zone); + + if (cc->order == -1 || !compaction_deferred(zone, cc->order)) + compact_zone(zone, cc); + + if (cc->order > 0) { + if (zone_watermark_ok(zone, cc->order, + low_wmark_pages(zone), 0, 0)) + compaction_defer_reset(zone, cc->order, false); + } + + VM_BUG_ON(!list_empty(&cc->freepages)); + VM_BUG_ON(!list_empty(&cc->migratepages)); + } +} + +void compact_pgdat(pg_data_t *pgdat, int order) +{ + struct compact_control cc = { + .order = order, + .mode = MIGRATE_ASYNC, + }; + + if (!order) + return; + + __compact_pgdat(pgdat, &cc); +} + +static void compact_node(int nid) +{ + struct compact_control cc = { + .order = -1, + .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, + }; + + __compact_pgdat(NODE_DATA(nid), &cc); +} + +/* Compact all nodes in the system */ +static void compact_nodes(void) +{ + int nid; + + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + + for_each_online_node(nid) + compact_node(nid); +} + +/* The written value is actually unused, all memory is compacted */ +int sysctl_compact_memory; + +/* This is the entry point for compacting all nodes via /proc/sys/vm */ +int sysctl_compaction_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + if (write) + compact_nodes(); + + return 0; +} + +int sysctl_extfrag_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, buffer, length, ppos); + + return 0; +} + +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +static ssize_t sysfs_compact_node(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int nid = dev->id; + + if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + + compact_node(nid); + } + + return count; +} +static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); + +int 
/*
 * Returns true when the two bytes differ in exactly one bit position,
 * false when they are equal or differ in more than one bit.
 */
static bool single_bit_flip(unsigned char a, unsigned char b)
{
	unsigned char diff = a ^ b;

	/* Identical bytes: nothing flipped */
	if (!diff)
		return false;

	/* Clearing the lowest set bit leaves zero only if it was the sole bit */
	return (diff & (diff - 1)) == 0;
}
check_poison_mem(unsigned char *mem, size_t bytes) +{ + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); + unsigned char *start; + unsigned char *end; + + start = memchr_inv(mem, PAGE_POISON, bytes); + if (!start) + return; + + for (end = mem + bytes - 1; end > start; end--) { + if (*end != PAGE_POISON) + break; + } + + if (!__ratelimit(&ratelimit)) + return; + else if (start == end && single_bit_flip(*start, PAGE_POISON)) + printk(KERN_ERR "pagealloc: single bit error\n"); + else + printk(KERN_ERR "pagealloc: memory corruption\n"); + + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, + end - start + 1, 1); + dump_stack(); +} + +static void unpoison_page(struct page *page) +{ + void *addr; + + if (!page_poison(page)) + return; + + addr = kmap_atomic(page); + check_poison_mem(addr, PAGE_SIZE); + clear_page_poison(page); + kunmap_atomic(addr); +} + +static void unpoison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + unpoison_page(page + i); +} + +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (!page_poisoning_enabled) + return; + + if (enable) + unpoison_pages(page, numpages); + else + poison_pages(page, numpages); +} diff --git a/kernel/mm/debug.c b/kernel/mm/debug.c new file mode 100644 index 000000000..3eb3ac2fc --- /dev/null +++ b/kernel/mm/debug.c @@ -0,0 +1,240 @@ +/* + * mm/debug.c + * + * mm/ specific debug routines. 
+ * + */ + +#include +#include +#include +#include + +static const struct trace_print_flags pageflag_names[] = { + {1UL << PG_locked, "locked" }, + {1UL << PG_error, "error" }, + {1UL << PG_referenced, "referenced" }, + {1UL << PG_uptodate, "uptodate" }, + {1UL << PG_dirty, "dirty" }, + {1UL << PG_lru, "lru" }, + {1UL << PG_active, "active" }, + {1UL << PG_slab, "slab" }, + {1UL << PG_owner_priv_1, "owner_priv_1" }, + {1UL << PG_arch_1, "arch_1" }, + {1UL << PG_reserved, "reserved" }, + {1UL << PG_private, "private" }, + {1UL << PG_private_2, "private_2" }, + {1UL << PG_writeback, "writeback" }, +#ifdef CONFIG_PAGEFLAGS_EXTENDED + {1UL << PG_head, "head" }, + {1UL << PG_tail, "tail" }, +#else + {1UL << PG_compound, "compound" }, +#endif + {1UL << PG_swapcache, "swapcache" }, + {1UL << PG_mappedtodisk, "mappedtodisk" }, + {1UL << PG_reclaim, "reclaim" }, + {1UL << PG_swapbacked, "swapbacked" }, + {1UL << PG_unevictable, "unevictable" }, +#ifdef CONFIG_MMU + {1UL << PG_mlocked, "mlocked" }, +#endif +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + {1UL << PG_uncached, "uncached" }, +#endif +#ifdef CONFIG_MEMORY_FAILURE + {1UL << PG_hwpoison, "hwpoison" }, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + {1UL << PG_compound_lock, "compound_lock" }, +#endif +}; + +static void dump_flags(unsigned long flags, + const struct trace_print_flags *names, int count) +{ + const char *delim = ""; + unsigned long mask; + int i; + + pr_emerg("flags: %#lx(", flags); + + /* remove zone id */ + flags &= (1UL << NR_PAGEFLAGS) - 1; + + for (i = 0; i < count && flags; i++) { + + mask = names[i].mask; + if ((flags & mask) != mask) + continue; + + flags &= ~mask; + pr_cont("%s%s", delim, names[i].name); + delim = "|"; + } + + /* check for left over flags */ + if (flags) + pr_cont("%s%#lx", delim, flags); + + pr_cont(")\n"); +} + +void dump_page_badflags(struct page *page, const char *reason, + unsigned long badflags) +{ + pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", + page, 
atomic_read(&page->_count), page_mapcount(page),
+		  page->mapping, page->index);
+	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
+	dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
+	if (reason)
+		pr_alert("page dumped because: %s\n", reason);
+	if (page->flags & badflags) {
+		pr_alert("bad because of flags:\n");
+		dump_flags(page->flags & badflags,
+				pageflag_names, ARRAY_SIZE(pageflag_names));
+	}
+#ifdef CONFIG_MEMCG
+	if (page->mem_cgroup)
+		pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
+#endif
+}
+
+/*
+ * dump_page - print the state of @page to the kernel log.
+ * @page:   page to describe
+ * @reason: optional text explaining why the page is being dumped
+ *
+ * Thin wrapper around dump_page_badflags() with an empty bad-flags mask,
+ * so the "bad because of flags" section is never printed.
+ */
+void dump_page(struct page *page, const char *reason)
+{
+	dump_page_badflags(page, reason, 0);
+}
+EXPORT_SYMBOL(dump_page);
+
+#ifdef CONFIG_DEBUG_VM
+
+/*
+ * Names for the VM_* bits, handed to dump_flags() by dump_vma() (for
+ * vma->vm_flags) and dump_mm() (for mm->def_flags).  The #if ladder
+ * selects exactly one architecture-specific entry per configuration.
+ */
+static const struct trace_print_flags vmaflags_names[] = {
+	{VM_READ,			"read"		},
+	{VM_WRITE,			"write"		},
+	{VM_EXEC,			"exec"		},
+	{VM_SHARED,			"shared"	},
+	{VM_MAYREAD,			"mayread"	},
+	{VM_MAYWRITE,			"maywrite"	},
+	{VM_MAYEXEC,			"mayexec"	},
+	{VM_MAYSHARE,			"mayshare"	},
+	{VM_GROWSDOWN,			"growsdown"	},
+	{VM_PFNMAP,			"pfnmap"	},
+	{VM_DENYWRITE,			"denywrite"	},
+	{VM_LOCKED,			"locked"	},
+	{VM_IO,				"io"		},
+	{VM_SEQ_READ,			"seqread"	},
+	{VM_RAND_READ,			"randread"	},
+	{VM_DONTCOPY,			"dontcopy"	},
+	{VM_DONTEXPAND,			"dontexpand"	},
+	{VM_ACCOUNT,			"account"	},
+	{VM_NORESERVE,			"noreserve"	},
+	{VM_HUGETLB,			"hugetlb"	},
+#if defined(CONFIG_X86)
+	{VM_PAT,			"pat"		},
+#elif defined(CONFIG_PPC)
+	{VM_SAO,			"sao"		},
+#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
+	{VM_GROWSUP,			"growsup"	},
+#elif !defined(CONFIG_MMU)
+	{VM_MAPPED_COPY,		"mappedcopy"	},
+#else
+	{VM_ARCH_1,			"arch_1"	},
+#endif
+	{VM_DONTDUMP,			"dontdump"	},
+#ifdef CONFIG_MEM_SOFT_DIRTY
+	{VM_SOFTDIRTY,			"softdirty"	},
+#endif
+	{VM_MIXEDMAP,			"mixedmap"	},
+	{VM_HUGEPAGE,			"hugepage"	},
+	{VM_NOHUGEPAGE,			"nohugepage"	},
+	{VM_MERGEABLE,			"mergeable"	},
+};
+
+/*
+ * dump_vma - print the fields of @vma at emergency log level, then its
+ * vm_flags decoded through vmaflags_names.
+ */
+void dump_vma(const struct vm_area_struct *vma)
+{
+	pr_emerg("vma %p start %p end %p\n"
+		"next %p prev %p mm %p\n"
+		"prot %lx anon_vma %p vm_ops %p\n"
+		"pgoff %lx file %p private_data %p\n",
+		vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
+		vma->vm_prev, vma->vm_mm,
+		(unsigned long)pgprot_val(vma->vm_page_prot),
+		vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
+		vma->vm_file, vma->vm_private_data);
+	dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
+}
+EXPORT_SYMBOL(dump_vma);
+
+/*
+ * dump_mm - print the fields of @mm at emergency log level, then its
+ * def_flags decoded through vmaflags_names.
+ *
+ * The format string and the argument list are assembled from pieces
+ * guarded by the same CONFIG_* tests, in the same order; any change to
+ * one side must be mirrored on the other or the conversions will be
+ * paired with the wrong arguments.
+ */
+void dump_mm(const struct mm_struct *mm)
+{
+	pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
+#ifdef CONFIG_MMU
+		"get_unmapped_area %p\n"
+#endif
+		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
+		"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
+		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
+		"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
+		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
+		"start_brk %lx brk %lx start_stack %lx\n"
+		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
+		"binfmt %p flags %lx core_state %p\n"
+#ifdef CONFIG_AIO
+		"ioctx_table %p\n"
+#endif
+#ifdef CONFIG_MEMCG
+		"owner %p "
+#endif
+		"exe_file %p\n"
+#ifdef CONFIG_MMU_NOTIFIER
+		"mmu_notifier_mm %p\n"
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+		"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
+#endif
+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
+		"tlb_flush_pending %d\n"
+#endif
+		"%s",	/* This is here to hold the comma */
+
+		mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
+#ifdef CONFIG_MMU
+		mm->get_unmapped_area,
+#endif
+		mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
+		mm->pgd, atomic_read(&mm->mm_users),
+		atomic_read(&mm->mm_count),
+		/* casts drop const; presumably the helpers take non-const pointers */
+		atomic_long_read((atomic_long_t *)&mm->nr_ptes),
+		mm_nr_pmds((struct mm_struct *)mm),
+		mm->map_count,
+		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
+		mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
+		mm->start_code, mm->end_code, mm->start_data, mm->end_data,
+		mm->start_brk, mm->brk, mm->start_stack,
+		mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
+		mm->binfmt, mm->flags, mm->core_state,
+#ifdef CONFIG_AIO
+		mm->ioctx_table,
+#endif
+#ifdef CONFIG_MEMCG
+		mm->owner,
+#endif
+		mm->exe_file,
+#ifdef CONFIG_MMU_NOTIFIER
+		mm->mmu_notifier_mm,
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+		mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
+#endif
+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
+		mm->tlb_flush_pending,
+#endif
+		""		/* This is here to not have a comma! */
+		);
+
+		dump_flags(mm->def_flags, vmaflags_names,
+				ARRAY_SIZE(vmaflags_names));
+}
+
+#endif		/* CONFIG_DEBUG_VM */
diff --git a/kernel/mm/dmapool.c b/kernel/mm/dmapool.c
new file mode 100644
index 000000000..fd5fe4342
--- /dev/null
+++ b/kernel/mm/dmapool.c
@@ -0,0 +1,529 @@
+/*
+ * DMA Pool allocator
+ *
+ * Copyright 2001 David Brownell
+ * Copyright 2007 Intel Corporation
+ *   Author: Matthew Wilcox
+ *
+ * This software may be redistributed and/or modified under the terms of
+ * the GNU General Public License ("GPL") version 2 as published by the
+ * Free Software Foundation.
+ *
+ * This allocator returns small blocks of a given size which are DMA-able by
+ * the given device.  It uses the dma_alloc_coherent page allocator to get
+ * new pages, then splits them up into blocks of the required size.
+ * Many older drivers still have their own code to do this.
+ *
+ * The current design of this allocator is fairly simple.  The pool is
+ * represented by the 'struct dma_pool' which keeps a doubly-linked list of
+ * allocated pages.  Each page in the page_list is split into blocks of at
+ * least 'size' bytes.  Free blocks are tracked in an unsorted singly-linked
+ * list of free blocks within the page.  Used blocks aren't tracked, but we
+ * keep a count of how many are currently allocated from each page.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+#define DMAPOOL_DEBUG 1
+#endif
+
+struct dma_pool {		/* the pool */
+	struct list_head page_list;	/* dma_page entries owned by this pool */
+	spinlock_t lock;		/* protects page_list and per-page state */
+	size_t size;			/* block size after alignment rounding */
+	struct device *dev;
+	size_t allocation;		/* bytes per dma_alloc_coherent() call */
+	size_t boundary;		/* blocks never cross a multiple of this */
+	char name[32];
+	struct list_head pools;		/* link in dev->dma_pools */
+};
+
+struct dma_page {		/* cacheable header for 'allocation' bytes */
+	struct list_head page_list;
+	void *vaddr;
+	dma_addr_t dma;
+	unsigned int in_use;		/* blocks currently handed out */
+	unsigned int offset;		/* offset of first free block */
+};
+
+static DEFINE_MUTEX(pools_lock);
+static DEFINE_MUTEX(pools_reg_lock);
+
+/*
+ * sysfs "pools" attribute: one header line followed by one line per pool
+ * of @dev giving name, blocks in use, total blocks, block size and page
+ * count.  Output is bounded to PAGE_SIZE by scnprintf().
+ */
+static ssize_t
+show_pools(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	unsigned temp;
+	unsigned size;
+	char *next;
+	struct dma_page *page;
+	struct dma_pool *pool;
+
+	next = buf;
+	size = PAGE_SIZE;
+
+	temp = scnprintf(next, size, "poolinfo - 0.1\n");
+	size -= temp;
+	next += temp;
+
+	mutex_lock(&pools_lock);
+	list_for_each_entry(pool, &dev->dma_pools, pools) {
+		unsigned pages = 0;
+		unsigned blocks = 0;
+
+		spin_lock_irq(&pool->lock);
+		list_for_each_entry(page, &pool->page_list, page_list) {
+			pages++;
+			blocks += page->in_use;
+		}
+		spin_unlock_irq(&pool->lock);
+
+		/* per-pool info, no real statistics yet */
+		temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
+				 pool->name, blocks,
+				 pages * (pool->allocation / pool->size),
+				 pool->size, pages);
+		size -= temp;
+		next += temp;
+	}
+	mutex_unlock(&pools_lock);
+
+	return PAGE_SIZE - size;
+}
+
+static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
+
+/**
+ * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @boundary: returned blocks won't cross this power of two boundary
+ * Context: !in_interrupt()
+ *
+ * Returns a dma allocation pool with the requested characteristics, or
+ * null if one can't be created.  Given one of these pools, dma_pool_alloc()
+ * may be used to allocate memory.  Such memory will all have "consistent"
+ * DMA mappings, accessible by the device and its driver without using
+ * cache flushing primitives.  The actual size of blocks allocated may be
+ * larger than requested because of alignment.
+ *
+ * If @boundary is nonzero, objects returned from dma_pool_alloc() won't
+ * cross that size boundary.  This is useful for devices which have
+ * addressing restrictions on individual DMA transfers, such as not crossing
+ * boundaries of 4KBytes.
+ */
+struct dma_pool *dma_pool_create(const char *name, struct device *dev,
+				 size_t size, size_t align, size_t boundary)
+{
+	struct dma_pool *retval;
+	size_t allocation;
+	bool empty = false;
+
+	if (align == 0)
+		align = 1;
+	else if (align & (align - 1))
+		return NULL;
+
+	/* each free block stores a 4-byte "next free" offset, hence the minimum */
+	if (size == 0)
+		return NULL;
+	else if (size < 4)
+		size = 4;
+
+	if ((size % align) != 0)
+		size = ALIGN(size, align);
+
+	allocation = max_t(size_t, size, PAGE_SIZE);
+
+	if (!boundary)
+		boundary = allocation;
+	else if ((boundary < size) || (boundary & (boundary - 1)))
+		return NULL;
+
+	retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
+	if (!retval)
+		return retval;
+
+	strlcpy(retval->name, name, sizeof(retval->name));
+
+	retval->dev = dev;
+
+	INIT_LIST_HEAD(&retval->page_list);
+	spin_lock_init(&retval->lock);
+	retval->size = size;
+	retval->boundary = boundary;
+	retval->allocation = allocation;
+
+	INIT_LIST_HEAD(&retval->pools);
+
+	/*
+	 * pools_lock ensures that the ->dma_pools list does not get corrupted.
+	 * pools_reg_lock ensures that there is not a race between
+	 * dma_pool_create() and dma_pool_destroy() or within dma_pool_create()
+	 * when the first invocation of dma_pool_create() failed on
+	 * device_create_file() and the second assumes that it has been done (I
+	 * know it is a short window).
+	 */
+	mutex_lock(&pools_reg_lock);
+	mutex_lock(&pools_lock);
+	if (list_empty(&dev->dma_pools))
+		empty = true;
+	list_add(&retval->pools, &dev->dma_pools);
+	mutex_unlock(&pools_lock);
+	if (empty) {
+		int err;
+
+		err = device_create_file(dev, &dev_attr_pools);
+		if (err) {
+			mutex_lock(&pools_lock);
+			list_del(&retval->pools);
+			mutex_unlock(&pools_lock);
+			mutex_unlock(&pools_reg_lock);
+			kfree(retval);
+			return NULL;
+		}
+	}
+	mutex_unlock(&pools_reg_lock);
+	return retval;
+}
+EXPORT_SYMBOL(dma_pool_create);
+
+/*
+ * Thread the free list through a fresh page: the first int of every free
+ * block holds the offset of the next free block.  A block that would
+ * straddle the next boundary is skipped by jumping straight to that
+ * boundary.  The chain ends with an offset >= pool->allocation.
+ */
+static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
+{
+	unsigned int offset = 0;
+	unsigned int next_boundary = pool->boundary;
+
+	do {
+		unsigned int next = offset + pool->size;
+		if (unlikely((next + pool->size) >= next_boundary)) {
+			next = next_boundary;
+			next_boundary += pool->boundary;
+		}
+		*(int *)(page->vaddr + offset) = next;
+		offset = next;
+	} while (offset < pool->allocation);
+}
+
+/*
+ * Allocate a page header plus pool->allocation bytes of coherent memory
+ * and initialise its free list.  Returns NULL if either allocation fails.
+ * The page is not linked into pool->page_list here.
+ */
+static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
+{
+	struct dma_page *page;
+
+	page = kmalloc(sizeof(*page), mem_flags);
+	if (!page)
+		return NULL;
+	page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
+					 &page->dma, mem_flags);
+	if (page->vaddr) {
+#ifdef	DMAPOOL_DEBUG
+		memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+		pool_initialise_page(pool, page);
+		page->in_use = 0;
+		page->offset = 0;
+	} else {
+		kfree(page);
+		page = NULL;
+	}
+	return page;
+}
+
+/* True while at least one block of @page is handed out. */
+static inline int is_page_busy(struct dma_page *page)
+{
+	return page->in_use != 0;
+}
+
+/* Return the coherent memory of @page, unlink it and free its header. */
+static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
+{
+	dma_addr_t dma = page->dma;
+
+#ifdef	DMAPOOL_DEBUG
+	memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+#endif
+	dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
+	list_del(&page->page_list);
+	kfree(page);
+}
+
+/**
+ * dma_pool_destroy - destroys a pool of dma memory blocks.
+ * @pool: dma pool that will be destroyed; NULL is accepted and ignored
+ * Context: !in_interrupt()
+ *
+ * Caller guarantees that no more memory from the pool is in use,
+ * and that nothing will try to use the pool after this call.
+ *
+ * Pages that still have blocks in use are reported and their coherent
+ * memory is deliberately leaked rather than freed under the user.
+ */
+void dma_pool_destroy(struct dma_pool *pool)
+{
+	bool empty = false;
+
+	/*
+	 * Tolerate NULL like kfree() does, so error-unwind paths can call
+	 * this unconditionally instead of oopsing on list_del() below.
+	 */
+	if (unlikely(!pool))
+		return;
+
+	mutex_lock(&pools_reg_lock);
+	mutex_lock(&pools_lock);
+	list_del(&pool->pools);
+	if (pool->dev && list_empty(&pool->dev->dma_pools))
+		empty = true;
+	mutex_unlock(&pools_lock);
+	/* last pool of the device gone: drop the sysfs attribute as well */
+	if (empty)
+		device_remove_file(pool->dev, &dev_attr_pools);
+	mutex_unlock(&pools_reg_lock);
+
+	while (!list_empty(&pool->page_list)) {
+		struct dma_page *page;
+		page = list_entry(pool->page_list.next,
+				  struct dma_page, page_list);
+		if (is_page_busy(page)) {
+			if (pool->dev)
+				dev_err(pool->dev,
+					"dma_pool_destroy %s, %p busy\n",
+					pool->name, page->vaddr);
+			else
+				printk(KERN_ERR
+				       "dma_pool_destroy %s, %p busy\n",
+				       pool->name, page->vaddr);
+			/* leak the still-in-use consistent memory */
+			list_del(&page->page_list);
+			kfree(page);
+		} else
+			pool_free_page(pool, page);
+	}
+
+	kfree(pool);
+}
+EXPORT_SYMBOL(dma_pool_destroy);
+
+/**
+ * dma_pool_alloc - get a block of consistent memory
+ * @pool: dma pool that will produce the block
+ * @mem_flags: GFP_* bitmask
+ * @handle: pointer to dma address of block
+ *
+ * This returns the kernel virtual address of a currently unused block,
+ * and reports its dma address through the handle.
+ * If such a memory block can't be allocated, %NULL is returned.
+ */
+void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
+		     dma_addr_t *handle)
+{
+	unsigned long flags;
+	struct dma_page *page;
+	size_t offset;
+	void *retval;
+
+	might_sleep_if(mem_flags & __GFP_WAIT);
+
+	/* first choice: any existing page that still has a free block */
+	spin_lock_irqsave(&pool->lock, flags);
+	list_for_each_entry(page, &pool->page_list, page_list) {
+		if (page->offset < pool->allocation)
+			goto ready;
+	}
+
+	/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	page = pool_alloc_page(pool, mem_flags);
+	if (!page)
+		return NULL;
+
+	spin_lock_irqsave(&pool->lock, flags);
+
+	list_add(&page->page_list, &pool->page_list);
+ ready:
+	/* pop the head of the page's free list */
+	page->in_use++;
+	offset = page->offset;
+	page->offset = *(int *)(page->vaddr + offset);
+	retval = offset + page->vaddr;
+	*handle = offset + page->dma;
+#ifdef	DMAPOOL_DEBUG
+	{
+		int i;
+		u8 *data = retval;
+		/* page->offset is stored in first 4 bytes */
+		for (i = sizeof(page->offset); i < pool->size; i++) {
+			if (data[i] == POOL_POISON_FREED)
+				continue;
+			if (pool->dev)
+				dev_err(pool->dev,
+					"dma_pool_alloc %s, %p (corrupted)\n",
+					pool->name, retval);
+			else
+				pr_err("dma_pool_alloc %s, %p (corrupted)\n",
+					pool->name, retval);
+
+			/*
+			 * Dump the first 4 bytes even if they are not
+			 * POOL_POISON_FREED
+			 */
+			print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
+					data, pool->size, 1);
+			break;
+		}
+	}
+	memset(retval, POOL_POISON_ALLOCATED, pool->size);
+#endif
+	spin_unlock_irqrestore(&pool->lock, flags);
+	return retval;
+}
+EXPORT_SYMBOL(dma_pool_alloc);
+
+/*
+ * Find the page of @pool whose DMA range [page->dma, page->dma +
+ * pool->allocation) contains @dma, or NULL if there is none.  Walks
+ * pool->page_list without taking any lock itself.
+ */
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+{
+	struct dma_page *page;
+
+	list_for_each_entry(page, &pool->page_list, page_list) {
+		if (dma < page->dma)
+			continue;
+		if (dma < (page->dma + pool->allocation))
+			return page;
+	}
+	return NULL;
+}
+
+/**
+ * dma_pool_free - put block back into dma pool
+ * @pool: the dma pool holding the block
+ * @vaddr: virtual address of block
+ * @dma: dma address of block
+ *
+ * Caller promises neither device nor driver will again touch this block
+ * unless it is first re-allocated.
+ *
+ * An address that belongs to no page of the pool is reported and
+ * otherwise ignored.
+ */
+void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
+{
+	struct dma_page *page;
+	unsigned long flags;
+	unsigned int offset;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	page = pool_find_page(pool, dma);
+	if (!page) {
+		spin_unlock_irqrestore(&pool->lock, flags);
+		/*
+		 * dma_addr_t may be wider than unsigned long; print it as
+		 * unsigned long long, like the other messages below, so the
+		 * upper bits are not cut off.
+		 */
+		if (pool->dev)
+			dev_err(pool->dev,
+				"dma_pool_free %s, %p/%Lx (bad dma)\n",
+				pool->name, vaddr, (unsigned long long)dma);
+		else
+			printk(KERN_ERR "dma_pool_free %s, %p/%Lx (bad dma)\n",
+			       pool->name, vaddr, (unsigned long long)dma);
+		return;
+	}
+
+	offset = vaddr - page->vaddr;
+#ifdef	DMAPOOL_DEBUG
+	if ((dma - page->dma) != offset) {
+		spin_unlock_irqrestore(&pool->lock, flags);
+		if (pool->dev)
+			dev_err(pool->dev,
+				"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+				pool->name, vaddr, (unsigned long long)dma);
+		else
+			printk(KERN_ERR
+			       "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+			       pool->name, vaddr, (unsigned long long)dma);
+		return;
+	}
+	{
+		/* walk the free list: finding @offset there is a double free */
+		unsigned int chain = page->offset;
+		while (chain < pool->allocation) {
+			if (chain != offset) {
+				chain = *(int *)(page->vaddr + chain);
+				continue;
+			}
+			spin_unlock_irqrestore(&pool->lock, flags);
+			if (pool->dev)
+				dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
+					"already free\n", pool->name,
+					(unsigned long long)dma);
+			else
+				printk(KERN_ERR "dma_pool_free %s, dma %Lx "
+					"already free\n", pool->name,
+					(unsigned long long)dma);
+			return;
+		}
+	}
+	memset(vaddr, POOL_POISON_FREED, pool->size);
+#endif
+
+	/* push the block onto the head of the page's free list */
+	page->in_use--;
+	*(int *)vaddr = page->offset;
+	page->offset = offset;
+	/*
+	 * Resist a temptation to do
+	 *    if (!is_page_busy(page)) pool_free_page(pool, page);
+	 * Better have a few empty pages hang around.
+	 */
+	spin_unlock_irqrestore(&pool->lock, flags);
+}
+EXPORT_SYMBOL(dma_pool_free);
+
+/*
+ * Managed DMA pool
+ */
+
+/* devres release callback: @res holds a pointer to the pool to destroy. */
+static void dmam_pool_release(struct device *dev, void *res)
+{
+	struct dma_pool *pool = *(struct dma_pool **)res;
+
+	dma_pool_destroy(pool);
+}
+
+/* devres match callback: true when @res refers to the pool @match_data. */
+static int dmam_pool_match(struct device *dev, void *res, void *match_data)
+{
+	return *(struct dma_pool **)res == match_data;
+}
+
+/**
+ * dmam_pool_create - Managed dma_pool_create()
+ * @name: name of pool, for diagnostics
+ * @dev: device that will be doing the DMA
+ * @size: size of the blocks in this pool.
+ * @align: alignment requirement for blocks; must be a power of two
+ * @allocation: returned blocks won't cross this boundary (or zero)
+ *
+ * Managed dma_pool_create().  DMA pool created with this function is
+ * automatically destroyed on driver detach.
+ *
+ * Returns the new pool, or NULL if either the devres record or the pool
+ * itself could not be allocated.
+ */
+struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
+				  size_t size, size_t align, size_t allocation)
+{
+	struct dma_pool **ptr, *pool;
+
+	ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return NULL;
+
+	pool = *ptr = dma_pool_create(name, dev, size, align, allocation);
+	if (pool)
+		devres_add(dev, ptr);
+	else
+		devres_free(ptr);
+
+	return pool;
+}
+EXPORT_SYMBOL(dmam_pool_create);
+
+/**
+ * dmam_pool_destroy - Managed dma_pool_destroy()
+ * @pool: dma pool that will be destroyed
+ *
+ * Managed dma_pool_destroy().  Warns if devres_release() returns nonzero.
+ */
+void dmam_pool_destroy(struct dma_pool *pool)
+{
+	struct device *dev = pool->dev;
+
+	WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool));
+}
+EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/kernel/mm/early_ioremap.c b/kernel/mm/early_ioremap.c
new file mode 100644
index 000000000..e10ccd299
--- /dev/null
+++ b/kernel/mm/early_ioremap.c
@@ -0,0 +1,245 @@
+/*
+ * Provide common bits of early_ioremap() support for architectures needing
+ * temporary mappings during boot before ioremap() is available.
+ *
+ * This is mostly a direct copy of the x86 early_ioremap implementation.
+ *
+ * (C) Copyright 1995 1996, 2014 Linus Torvalds
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef CONFIG_MMU
+static int early_ioremap_debug __initdata;
+
+/* "early_ioremap_debug" on the command line turns on map/unmap tracing. */
+static int __init early_ioremap_debug_setup(char *str)
+{
+	early_ioremap_debug = 1;
+
+	return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static int after_paging_init __initdata;
+
+/* Weak no-op hook; an architecture may supply its own definition. */
+void __init __weak early_ioremap_shutdown(void)
+{
+}
+
+/*
+ * Run the shutdown hook, then switch later map/unmap calls over to the
+ * __late_*_fixmap helpers.
+ */
+void __init early_ioremap_reset(void)
+{
+	early_ioremap_shutdown();
+	after_paging_init = 1;
+}
+
+/*
+ * Generally, ioremap() is available after paging_init() has been called.
+ * Architectures wanting to allow early_ioremap after paging_init() can
+ * define __late_set_fixmap and __late_clear_fixmap to do the right thing.
+ */
+#ifndef __late_set_fixmap
+static inline void __init __late_set_fixmap(enum fixed_addresses idx,
+					    phys_addr_t phys, pgprot_t prot)
+{
+	BUG();
+}
+#endif
+
+#ifndef __late_clear_fixmap
+static inline void __init __late_clear_fixmap(enum fixed_addresses idx)
+{
+	BUG();
+}
+#endif
+
+static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
+/*
+ * Warn about (the first) slot that is already mapped, then record the
+ * virtual base address of every boot-time mapping slot.
+ */
+void __init early_ioremap_setup(void)
+{
+	int slot = 0;
+
+	/* stop scanning at the first slot that triggers the warning */
+	while (slot < FIX_BTMAPS_SLOTS && !WARN_ON(prev_map[slot]))
+		slot++;
+
+	for (slot = 0; slot < FIX_BTMAPS_SLOTS; slot++) {
+		slot_virt[slot] =
+			__fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot);
+	}
+}
+
+/*
+ * Late initcall: count the slots that are still mapped and warn if any
+ * are left.  Returns 1 when a leak was reported, 0 otherwise.
+ */
+static int __init check_early_ioremap_leak(void)
+{
+	int leaked = 0;
+	int slot;
+
+	for (slot = 0; slot < FIX_BTMAPS_SLOTS; slot++) {
+		if (prev_map[slot])
+			leaked++;
+	}
+
+	return WARN(leaked, KERN_WARNING
+		    "Debug warning: early ioremap leak of %d areas detected.\n"
+		    "please boot with early_ioremap_debug and report the dmesg.\n",
+		    leaked) ? 1 : 0;
+}
+late_initcall(check_early_ioremap_leak);
+
+/*
+ * Map @size bytes starting at @phys_addr into a free boot-time fixmap
+ * slot using protection @prot.
+ *
+ * Returns the virtual address corresponding to @phys_addr (including
+ * its offset within the page), or NULL when no slot is free, the range
+ * is empty or wraps, or it needs more than NR_FIX_BTMAPS pages.
+ */
+static void __init __iomem *
+__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
+{
+	/*
+	 * Keep the caller's arguments: phys_addr and size are rewritten
+	 * below, and the debug trace should show what was asked for.
+	 */
+	const resource_size_t req_addr = phys_addr;
+	const unsigned long req_size = size;
+	unsigned long offset;
+	resource_size_t last_addr;
+	unsigned int nrpages;
+	enum fixed_addresses idx;
+	int i, slot;
+
+	WARN_ON(system_state != SYSTEM_BOOTING);
+
+	slot = -1;
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+		if (!prev_map[i]) {
+			slot = i;
+			break;
+		}
+	}
+
+	if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
+		 __func__, (u64)phys_addr, size))
+		return NULL;
+
+	/* Don't allow wraparound or zero size */
+	last_addr = phys_addr + size - 1;
+	if (WARN_ON(!size || last_addr < phys_addr))
+		return NULL;
+
+	/* early_iounmap() checks its size argument against this value */
+	prev_size[slot] = size;
+	/*
+	 * Mappings have to be page-aligned
+	 */
+	offset = phys_addr & ~PAGE_MASK;
+	phys_addr &= PAGE_MASK;
+	size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+	/*
+	 * Mappings have to fit in the FIX_BTMAP area.
+	 */
+	nrpages = size >> PAGE_SHIFT;
+	if (WARN_ON(nrpages > NR_FIX_BTMAPS))
+		return NULL;
+
+	/*
+	 * Ok, go for it..
+	 */
+	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+	while (nrpages > 0) {
+		if (after_paging_init)
+			__late_set_fixmap(idx, phys_addr, prot);
+		else
+			__early_set_fixmap(idx, phys_addr, prot);
+		phys_addr += PAGE_SIZE;
+		--idx;
+		--nrpages;
+	}
+	WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
+	     __func__, (u64)req_addr, req_size, slot, offset, slot_virt[slot]);
+
+	prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
+	return prev_map[slot];
+}
+
+/*
+ * Undo a mapping made by __early_ioremap().  @addr and @size must match
+ * the returned address and the size of the original request; otherwise
+ * a warning is printed and nothing is unmapped.
+ */
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+	unsigned long virt_addr;
+	unsigned long offset;
+	unsigned int nrpages;
+	enum fixed_addresses idx;
+	int i, slot;
+
+	slot = -1;
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+		if (prev_map[i] == addr) {
+			slot = i;
+			break;
+		}
+	}
+
+	if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
+		 addr, size))
+		return;
+
+	if (WARN(prev_size[slot] != size,
+		 "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
+		 addr, size, slot, prev_size[slot]))
+		return;
+
+	WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
+	     addr, size, slot);
+
+	virt_addr = (unsigned long)addr;
+	if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
+		return;
+
+	offset = virt_addr & ~PAGE_MASK;
+	nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
+
+	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+	while (nrpages > 0) {
+		if (after_paging_init)
+			__late_clear_fixmap(idx);
+		else
+			__early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR);
+		--idx;
+		--nrpages;
+	}
+	/* mark the slot free again */
+	prev_map[slot] = NULL;
+}
+
+/* Remap an IO device */
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+	return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO);
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+	return (__force void *)__early_ioremap(phys_addr, size,
+					       FIXMAP_PAGE_NORMAL);
+}
+#else /* CONFIG_MMU */
+
+/* Without an MMU the physical address is returned as the mapping. */
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+	return (__force void __iomem *)phys_addr;
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+	return (void *)phys_addr;
+}
+
+/* Nothing was mapped, so there is nothing to undo. */
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+}
+
+#endif /* CONFIG_MMU */
+
+
+/* Counterpart of early_memremap(); forwards to early_iounmap(). */
+void __init early_memunmap(void *addr, unsigned long size)
+{
+	early_iounmap((__force void __iomem *)addr, size);
+}
diff --git a/kernel/mm/fadvise.c b/kernel/mm/fadvise.c
new file mode 100644
index 000000000..4a3907cf7
--- /dev/null
+++ b/kernel/mm/fadvise.c
@@ -0,0 +1,162 @@
+/*
+ * mm/fadvise.c
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 11Jan2003	Andrew Morton
+ *		Initial version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/*
+ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
+ * deactivate the pages and clear PG_Referenced.
+ *
+ * fadvise64_64 - apply access-pattern @advice to bytes [@offset,
+ * @offset + @len) of the file open on @fd; @len == 0 means "to the end".
+ *
+ * Returns 0 on success, -EBADF for a bad descriptor, -ESPIPE for a FIFO,
+ * and -EINVAL for a negative @len, a file without a mapping, or unknown
+ * advice.
+ */
+SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
+{
+	struct fd f = fdget(fd);
+	struct inode *inode;
+	struct address_space *mapping;
+	struct backing_dev_info *bdi;
+	loff_t endbyte;			/* inclusive */
+	pgoff_t start_index;
+	pgoff_t end_index;
+	unsigned long nrpages;
+	int ret = 0;
+
+	if (!f.file)
+		return -EBADF;
+
+	inode = file_inode(f.file);
+	if (S_ISFIFO(inode->i_mode)) {
+		ret = -ESPIPE;
+		goto out;
+	}
+
+	mapping = f.file->f_mapping;
+	if (!mapping || len < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (IS_DAX(inode)) {
+		switch (advice) {
+		case POSIX_FADV_NORMAL:
+		case POSIX_FADV_RANDOM:
+		case POSIX_FADV_SEQUENTIAL:
+		case POSIX_FADV_WILLNEED:
+		case POSIX_FADV_NOREUSE:
+		case POSIX_FADV_DONTNEED:
+			/* no bad return value, but ignore advice */
+			break;
+		default:
+			ret = -EINVAL;
+		}
+		goto out;
+	}
+
+	/*
+	 * Careful about overflows. Len == 0 means "as much as possible".
+	 * Add in unsigned arithmetic: signed overflow is undefined, so the
+	 * wrap-around test below could not be relied upon otherwise.
+	 */
+	endbyte = (u64)offset + (u64)len;
+	if (!len || endbyte < len)
+		endbyte = -1;
+	else
+		endbyte--;		/* inclusive */
+
+	bdi = inode_to_bdi(mapping->host);
+
+	switch (advice) {
+	case POSIX_FADV_NORMAL:
+		f.file->f_ra.ra_pages = bdi->ra_pages;
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&f.file->f_lock);
+		break;
+	case POSIX_FADV_RANDOM:
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode |= FMODE_RANDOM;
+		spin_unlock(&f.file->f_lock);
+		break;
+	case POSIX_FADV_SEQUENTIAL:
+		/* double the readahead window for sequential access */
+		f.file->f_ra.ra_pages = bdi->ra_pages * 2;
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&f.file->f_lock);
+		break;
+	case POSIX_FADV_WILLNEED:
+		/* First and last PARTIAL page! */
+		start_index = offset >> PAGE_CACHE_SHIFT;
+		end_index = endbyte >> PAGE_CACHE_SHIFT;
+
+		/* Careful about overflow on the "+1" */
+		nrpages = end_index - start_index + 1;
+		if (!nrpages)
+			nrpages = ~0UL;
+
+		/*
+		 * Ignore return value because fadvise() shall return
+		 * success even if filesystem can't retrieve a hint,
+		 */
+		force_page_cache_readahead(mapping, f.file, start_index,
+					   nrpages);
+		break;
+	case POSIX_FADV_NOREUSE:
+		break;
+	case POSIX_FADV_DONTNEED:
+		if (!bdi_write_congested(bdi))
+			__filemap_fdatawrite_range(mapping, offset, endbyte,
+						   WB_SYNC_NONE);
+
+		/*
+		 * First and last FULL page! Partial pages are deliberately
+		 * preserved on the expectation that it is better to preserve
+		 * needed memory than to discard unneeded memory.
+		 */
+		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+		end_index = (endbyte >> PAGE_CACHE_SHIFT);
+
+		if (end_index >= start_index) {
+			unsigned long count = invalidate_mapping_pages(mapping,
+						start_index, end_index);
+
+			/*
+			 * If fewer pages were invalidated than expected then
+			 * it is possible that some of the pages were on
+			 * a per-cpu pagevec for a remote CPU. Drain all
+			 * pagevecs and try again.
+			 */
+			if (count < (end_index - start_index + 1)) {
+				lru_add_drain_all();
+				invalidate_mapping_pages(mapping, start_index,
+						end_index);
+			}
+		}
+		break;
+	default:
+		ret = -EINVAL;
+	}
+out:
+	fdput(f);
+	return ret;
+}
+
+#ifdef __ARCH_WANT_SYS_FADVISE64
+
+/* Variant taking a size_t length; forwards to fadvise64_64. */
+SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
+{
+	return sys_fadvise64_64(fd, offset, len, advice);
+}
+
+#endif
diff --git a/kernel/mm/failslab.c b/kernel/mm/failslab.c
new file mode 100644
index 000000000..fefaabaab
--- /dev/null
+++ b/kernel/mm/failslab.c
@@ -0,0 +1,60 @@
+#include
+#include
+
+/*
+ * Fault-injection state for slab allocations.  Both switches are u32
+ * because their addresses are handed to debugfs_create_bool() below.
+ */
+static struct {
+	struct fault_attr attr;
+	u32 ignore_gfp_wait;
+	u32 cache_filter;
+} failslab = {
+	.attr = FAULT_ATTR_INITIALIZER,
+	.ignore_gfp_wait = 1,
+	.cache_filter = 0,
+};
+
+/*
+ * should_failslab - decide whether to inject a failure into an allocation.
+ *
+ * Never fails __GFP_NOFAIL requests; skips __GFP_WAIT requests while
+ * ignore_gfp_wait is set; skips caches without SLAB_FAILSLAB while
+ * cache_filter is set.  Otherwise defers to should_fail().
+ */
+bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
+{
+	if (gfpflags & __GFP_NOFAIL)
+		return false;
+
+        if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
+		return false;
+
+	if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+		return false;
+
+	return should_fail(&failslab.attr, size);
+}
+
+/* Parse the "failslab=" boot parameter into the fault attributes. */
+static int __init setup_failslab(char *str)
+{
+	return setup_fault_attr(&failslab.attr, str);
+}
+__setup("failslab=", setup_failslab);
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+/*
+ * Create the "failslab" debugfs directory with its two boolean knobs.
+ * On failure of either knob the whole directory is removed again.
+ */
+static int __init failslab_debugfs_init(void)
+{
+	struct dentry *dir;
+	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+
+	dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+	if (IS_ERR(dir))
+		return PTR_ERR(dir);
+
+	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+				&failslab.ignore_gfp_wait))
+		goto fail;
+	if (!debugfs_create_bool("cache-filter", mode, dir,
+				&failslab.cache_filter))
+		goto fail;
+
+	return 0;
+fail:
+	debugfs_remove_recursive(dir);
+
+	return -ENOMEM;
+}
+
+late_initcall(failslab_debugfs_init);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
diff --git a/kernel/mm/filemap.c b/kernel/mm/filemap.c
new file mode 100644
index 000000000..01cf28476
--- 
/dev/null
+++ b/kernel/mm/filemap.c
@@ -0,0 +1,2670 @@
+/*
+ *	linux/mm/filemap.c
+ *
+ * Copyright (C) 1994-1999  Linus Torvalds
+ */
+
+/*
+ * This file handles the generic file mmap semantics used by
+ * most "normal" filesystems (but you don't /have/ to use this:
+ * the NFS filesystem used to do this differently, for example)
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include /* for BUG_ON(!in_atomic()) only */
+#include
+#include
+#include
+#include
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include
+
+/*
+ * FIXME: remove all knowledge of the buffer layer from the core VM
+ */
+#include /* for try_to_free_buffers */
+
+#include
+
+/*
+ * Shared mappings implemented 30.11.1994. It's not fully working yet,
+ * though.
+ *
+ * Shared mappings now work. 15.8.1995  Bruno.
+ *
+ * finished 'unifying' the page and buffer cache and SMP-threaded the
+ * page-cache, 21.05.1999, Ingo Molnar
+ *
+ * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli
+ */
+
+/*
+ * Lock ordering:
+ *
+ *  ->i_mmap_rwsem		(truncate_pagecache)
+ *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
+ *      ->swap_lock		(exclusive_swap_page, others)
+ *        ->mapping->tree_lock
+ *
+ *  ->i_mutex
+ *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
+ *
+ *  ->mmap_sem
+ *    ->i_mmap_rwsem
+ *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
+ *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
+ *
+ *  ->mmap_sem
+ *    ->lock_page		(access_process_vm)
+ *
+ *  ->i_mutex			(generic_perform_write)
+ *    ->mmap_sem		(fault_in_pages_readable->do_page_fault)
+ *
+ *  bdi->wb.list_lock
+ *    sb_lock			(fs/fs-writeback.c)
+ *    ->mapping->tree_lock	(__sync_single_inode)
+ *
+ *  ->i_mmap_rwsem
+ *    ->anon_vma.lock		(vma_adjust)
+ *
+ *  ->anon_vma.lock
+ *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
+ *
+ *  ->page_table_lock or pte_lock
+ *    ->swap_lock		(try_to_unmap_one)
+ *    ->private_lock		(try_to_unmap_one)
+ *    ->tree_lock		(try_to_unmap_one)
+ *    ->zone.lru_lock		(follow_page->mark_page_accessed)
+ *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
+ *    ->private_lock		(page_remove_rmap->set_page_dirty)
+ *    ->tree_lock		(page_remove_rmap->set_page_dirty)
+ *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
+ *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
+ *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
+ *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
+ *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
+ *
+ * ->i_mmap_rwsem
+ *   ->tasklist_lock            (memory_failure, collect_procs_ao)
+ */
+
+/*
+ * Remove @page from @mapping's radix tree, leaving @shadow (which may be
+ * NULL) in its slot, and keep mapping->nrpages, mapping->nrshadows and
+ * the per-node workingset counters in step.  The page must be locked.
+ */
+static void page_cache_tree_delete(struct address_space *mapping,
+				   struct page *page, void *shadow)
+{
+	struct radix_tree_node *node;
+	unsigned long index;
+	unsigned int offset;
+	unsigned int tag;
+	void **slot;
+
+	VM_BUG_ON(!PageLocked(page));
+
+	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+
+	if (shadow) {
+		mapping->nrshadows++;
+		/*
+		 * Make sure the nrshadows update is committed before
+		 * the nrpages update so that final truncate racing
+		 * with reclaim does not see both counters 0 at the
+		 * same time and miss a shadow entry.
+		 */
+		smp_wmb();
+	}
+	mapping->nrpages--;
+
+	/* no node: the entry is held directly in the tree root */
+	if (!node) {
+		/* Clear direct pointer tags in root node */
+		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+		radix_tree_replace_slot(slot, shadow);
+		return;
+	}
+
+	/* Clear tree tags for the removed page */
+	index = page->index;
+	offset = index & RADIX_TREE_MAP_MASK;
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+		if (test_bit(offset, node->tags[tag]))
+			radix_tree_tag_clear(&mapping->page_tree, index, tag);
+	}
+
+	/* Delete page, swap shadow entry */
+	radix_tree_replace_slot(slot, shadow);
+	workingset_node_pages_dec(node);
+	if (shadow)
+		workingset_node_shadows_inc(node);
+	else
+		/* nonzero presumably means the node was freed; nothing to track */
+		if (__radix_tree_delete_node(&mapping->page_tree, node))
+			return;
+
+	/*
+	 * Track node that only contains shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already tracked.  The
+	 * list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_pages(node) &&
+	    list_empty(&node->private_list)) {
+		node->private_data = mapping;
+		/* NOTE(review): local_lock() looks like the -rt substitute for
+		 * an irq-disabled section around list_lru_add() - confirm */
+		local_lock(workingset_shadow_lock);
+		list_lru_add(&__workingset_shadow_nodes, &node->private_list);
+		local_unlock(workingset_shadow_lock);
+	}
+}
+
+/*
+ * Delete a page from the page cache and free it. Caller has to make
+ * sure the page is locked and that nobody else uses it - or that usage
+ * is safe.  The caller must hold the mapping's tree_lock.
+ *
+ * @shadow is stored in the page's radix tree slot in place of the page;
+ * pass NULL to leave the slot empty.
+ */
+void __delete_from_page_cache(struct page *page, void *shadow)
+{
+	struct address_space *mapping = page->mapping;
+
+	trace_mm_filemap_delete_from_page_cache(page);
+	/*
+	 * if we're uptodate, flush out into the cleancache, otherwise
+	 * invalidate any existing cleancache entries.  We can't leave
+	 * stale data around in the cleancache once our page is gone
+	 */
+	if (PageUptodate(page) && PageMappedToDisk(page))
+		cleancache_put_page(page);
+	else
+		cleancache_invalidate_page(mapping, page);
+
+	page_cache_tree_delete(mapping, page, shadow);
+
+	page->mapping = NULL;
+	/* Leave page->index set: truncation lookup relies upon it */
+
+	__dec_zone_page_state(page, NR_FILE_PAGES);
+	if (PageSwapBacked(page))
+		__dec_zone_page_state(page, NR_SHMEM);
+	BUG_ON(page_mapped(page));
+
+	/*
+	 * At this point page must be either written or cleaned by truncate.
+	 * Dirty page here signals a bug and loss of unwritten data.
+	 *
+	 * This fixes dirty accounting after removing the page entirely but
+	 * leaves PageDirty set: it has no effect for truncated page and
+	 * anyway will be cleared before returning page into buddy allocator.
+	 */
+	if (WARN_ON_ONCE(PageDirty(page)))
+		account_page_cleaned(page, mapping);
+}
+
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked.  It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */ +void delete_from_page_cache(struct page *page) +{ + struct address_space *mapping = page->mapping; + void (*freepage)(struct page *); + + BUG_ON(!PageLocked(page)); + + freepage = mapping->a_ops->freepage; + spin_lock_irq(&mapping->tree_lock); + __delete_from_page_cache(page, NULL); + spin_unlock_irq(&mapping->tree_lock); + + if (freepage) + freepage(page); + page_cache_release(page); +} +EXPORT_SYMBOL(delete_from_page_cache); + +static int filemap_check_errors(struct address_space *mapping) +{ + int ret = 0; + /* Check for outstanding write errors */ + if (test_bit(AS_ENOSPC, &mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_bit(AS_EIO, &mapping->flags) && + test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + return ret; +} + +/** + * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * @mapping: address space structure to write + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) + * @sync_mode: enable synchronous operation + * + * Start writeback against all of a mapping's dirty pages that lie + * within the byte offsets inclusive. + * + * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as + * opposed to a regular memory cleansing writeback. The difference between + * these two operations is that if a dirty page/buffer is encountered, it must + * be waited upon, and not just skipped over. 
+ */ +int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) +{ + int ret; + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = LONG_MAX, + .range_start = start, + .range_end = end, + }; + + if (!mapping_cap_writeback_dirty(mapping)) + return 0; + + ret = do_writepages(mapping, &wbc); + return ret; +} + +static inline int __filemap_fdatawrite(struct address_space *mapping, + int sync_mode) +{ + return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); +} + +int filemap_fdatawrite(struct address_space *mapping) +{ + return __filemap_fdatawrite(mapping, WB_SYNC_ALL); +} +EXPORT_SYMBOL(filemap_fdatawrite); + +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end) +{ + return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); +} +EXPORT_SYMBOL(filemap_fdatawrite_range); + +/** + * filemap_flush - mostly a non-blocking flush + * @mapping: target address_space + * + * This is a mostly non-blocking flush. Not suitable for data-integrity + * purposes - I/O may not be started against all dirty pages. + */ +int filemap_flush(struct address_space *mapping) +{ + return __filemap_fdatawrite(mapping, WB_SYNC_NONE); +} +EXPORT_SYMBOL(filemap_flush); + +/** + * filemap_fdatawait_range - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. 
+ */ +int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, + loff_t end_byte) +{ + pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; + pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; + struct pagevec pvec; + int nr_pages; + int ret2, ret = 0; + + if (end_byte < start_byte) + goto out; + + pagevec_init(&pvec, 0); + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + + wait_on_page_writeback(page); + if (TestClearPageError(page)) + ret = -EIO; + } + pagevec_release(&pvec); + cond_resched(); + } +out: + ret2 = filemap_check_errors(mapping); + if (!ret) + ret = ret2; + + return ret; +} +EXPORT_SYMBOL(filemap_fdatawait_range); + +/** + * filemap_fdatawait - wait for all under-writeback pages to complete + * @mapping: address space structure to wait for + * + * Walk the list of under-writeback pages of the given address space + * and wait for all of them. + */ +int filemap_fdatawait(struct address_space *mapping) +{ + loff_t i_size = i_size_read(mapping->host); + + if (i_size == 0) + return 0; + + return filemap_fdatawait_range(mapping, 0, i_size - 1); +} +EXPORT_SYMBOL(filemap_fdatawait); + +int filemap_write_and_wait(struct address_space *mapping) +{ + int err = 0; + + if (mapping->nrpages) { + err = filemap_fdatawrite(mapping); + /* + * Even if the above returned error, the pages may be + * written partially (e.g. -ENOSPC), so we wait for it. + * But the -EIO is special case, it may indicate the worst + * thing (e.g. bug) happened, so we avoid waiting for it. 
+ */ + if (err != -EIO) { + int err2 = filemap_fdatawait(mapping); + if (!err) + err = err2; + } + } else { + err = filemap_check_errors(mapping); + } + return err; +} +EXPORT_SYMBOL(filemap_write_and_wait); + +/** + * filemap_write_and_wait_range - write out & wait on a file range + * @mapping: the address_space for the pages + * @lstart: offset in bytes where the range starts + * @lend: offset in bytes where the range ends (inclusive) + * + * Write out and wait upon file offsets lstart->lend, inclusive. + * + * Note that `lend' is inclusive (describes the last byte to be written) so + * that this function can be used to write to the very end-of-file (end = -1). + */ +int filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend) +{ + int err = 0; + + if (mapping->nrpages) { + err = __filemap_fdatawrite_range(mapping, lstart, lend, + WB_SYNC_ALL); + /* See comment of filemap_write_and_wait() */ + if (err != -EIO) { + int err2 = filemap_fdatawait_range(mapping, + lstart, lend); + if (!err) + err = err2; + } + } else { + err = filemap_check_errors(mapping); + } + return err; +} +EXPORT_SYMBOL(filemap_write_and_wait_range); + +/** + * replace_page_cache_page - replace a pagecache page with a new one + * @old: page to be replaced + * @new: page to replace with + * @gfp_mask: allocation mode + * + * This function replaces a page in the pagecache with a new one. On + * success it acquires the pagecache reference for the new page and + * drops it for the old page. Both the old and new pages must be + * locked. This function does not add the new page to the LRU, the + * caller must do that. + * + * The remove + add is atomic. The only way this function can fail is + * memory allocation failure. 
+ */ +int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) +{ + int error; + + VM_BUG_ON_PAGE(!PageLocked(old), old); + VM_BUG_ON_PAGE(!PageLocked(new), new); + VM_BUG_ON_PAGE(new->mapping, new); + + error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + if (!error) { + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *); + + pgoff_t offset = old->index; + freepage = mapping->a_ops->freepage; + + page_cache_get(new); + new->mapping = mapping; + new->index = offset; + + spin_lock_irq(&mapping->tree_lock); + __delete_from_page_cache(old, NULL); + error = radix_tree_insert(&mapping->page_tree, offset, new); + BUG_ON(error); + mapping->nrpages++; + __inc_zone_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(new)) + __inc_zone_page_state(new, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_migrate(old, new, true); + radix_tree_preload_end(); + if (freepage) + freepage(old); + page_cache_release(old); + } + + return error; +} +EXPORT_SYMBOL_GPL(replace_page_cache_page); + +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page, void **shadowp) +{ + struct radix_tree_node *node; + void **slot; + int error; + + error = __radix_tree_create(&mapping->page_tree, page->index, + &node, &slot); + if (error) + return error; + if (*slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + if (shadowp) + *shadowp = p; + mapping->nrshadows--; + if (node) + workingset_node_shadows_dec(node); + } + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. + * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. 
+ */ + if (!list_empty(&node->private_list)) { + local_lock(workingset_shadow_lock); + list_lru_del(&__workingset_shadow_nodes, + &node->private_list); + local_unlock(workingset_shadow_lock); + } + } + return 0; +} + +static int __add_to_page_cache_locked(struct page *page, + struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, + void **shadowp) +{ + int huge = PageHuge(page); + struct mem_cgroup *memcg; + int error; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageSwapBacked(page), page); + + if (!huge) { + error = mem_cgroup_try_charge(page, current->mm, + gfp_mask, &memcg); + if (error) + return error; + } + + error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); + if (error) { + if (!huge) + mem_cgroup_cancel_charge(page, memcg); + return error; + } + + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); + error = page_cache_tree_insert(mapping, page, shadowp); + radix_tree_preload_end(); + if (unlikely(error)) + goto err_insert; + __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); + if (!huge) + mem_cgroup_commit_charge(page, memcg, false); + trace_mm_filemap_add_to_page_cache(page); + return 0; +err_insert: + page->mapping = NULL; + /* Leave page->index set: truncation relies upon it */ + spin_unlock_irq(&mapping->tree_lock); + if (!huge) + mem_cgroup_cancel_charge(page, memcg); + page_cache_release(page); + return error; +} + +/** + * add_to_page_cache_locked - add a locked page to the pagecache + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * + * This function is used to add a page to the pagecache. It must be locked. + * This function does not add the page to the LRU. The caller must do that. 
+ */ +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + return __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, NULL); +} +EXPORT_SYMBOL(add_to_page_cache_locked); + +int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + void *shadow = NULL; + int ret; + + __set_page_locked(page); + ret = __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, &shadow); + if (unlikely(ret)) + __clear_page_locked(page); + else { + /* + * The page might have been evicted from cache only + * recently, in which case it should be activated like + * any other repeatedly accessed page. + */ + if (shadow && workingset_refault(shadow)) { + SetPageActive(page); + workingset_activation(page); + } else + ClearPageActive(page); + lru_cache_add(page); + } + return ret; +} +EXPORT_SYMBOL_GPL(add_to_page_cache_lru); + +#ifdef CONFIG_NUMA +struct page *__page_cache_alloc(gfp_t gfp) +{ + int n; + struct page *page; + + if (cpuset_do_page_mem_spread()) { + unsigned int cpuset_mems_cookie; + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); + + return page; + } + return alloc_pages(gfp, 0); +} +EXPORT_SYMBOL(__page_cache_alloc); +#endif + +/* + * In order to wait for pages to become available there must be + * waitqueues associated with pages. By using a hash table of + * waitqueues where the bucket discipline is to maintain all + * waiters on the same queue and wake all when any of the pages + * become available, and for the woken contexts to check to be + * sure the appropriate page became available, this saves space + * at a cost of "thundering herd" phenomena during rare hash + * collisions. 
+ */ +wait_queue_head_t *page_waitqueue(struct page *page) +{ + const struct zone *zone = page_zone(page); + + return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; +} +EXPORT_SYMBOL(page_waitqueue); + +void wait_on_page_bit(struct page *page, int bit_nr) +{ + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + if (test_bit(bit_nr, &page->flags)) + __wait_on_bit(page_waitqueue(page), &wait, bit_wait_io, + TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_on_page_bit); + +int wait_on_page_bit_killable(struct page *page, int bit_nr) +{ + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + if (!test_bit(bit_nr, &page->flags)) + return 0; + + return __wait_on_bit(page_waitqueue(page), &wait, + bit_wait_io, TASK_KILLABLE); +} + +int wait_on_page_bit_killable_timeout(struct page *page, + int bit_nr, unsigned long timeout) +{ + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + wait.key.timeout = jiffies + timeout; + if (!test_bit(bit_nr, &page->flags)) + return 0; + return __wait_on_bit(page_waitqueue(page), &wait, + bit_wait_io_timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout); + +/** + * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue + * @page: Page defining the wait queue of interest + * @waiter: Waiter to add to the queue + * + * Add an arbitrary @waiter to the wait queue for the nominated @page. + */ +void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +{ + wait_queue_head_t *q = page_waitqueue(page); + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, waiter); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(add_page_wait_queue); + +/** + * unlock_page - unlock a locked page + * @page: the page + * + * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). + * Also wakes sleepers in wait_on_page_writeback() because the wakeup + * mechanism between PageLocked pages and PageWriteback pages is shared. 
+ * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. + * + * The mb is necessary to enforce ordering between the clear_bit and the read + * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). + */ +void unlock_page(struct page *page) +{ + VM_BUG_ON_PAGE(!PageLocked(page), page); + clear_bit_unlock(PG_locked, &page->flags); + smp_mb__after_atomic(); + wake_up_page(page, PG_locked); +} +EXPORT_SYMBOL(unlock_page); + +/** + * end_page_writeback - end writeback against a page + * @page: the page + */ +void end_page_writeback(struct page *page) +{ + /* + * TestClearPageReclaim could be used here but it is an atomic + * operation and overkill in this particular case. Failing to + * shuffle a page marked for immediate reclaim is too mild to + * justify taking an atomic operation penalty at the end of + * ever page writeback. + */ + if (PageReclaim(page)) { + ClearPageReclaim(page); + rotate_reclaimable_page(page); + } + + if (!test_clear_page_writeback(page)) + BUG(); + + smp_mb__after_atomic(); + wake_up_page(page, PG_writeback); +} +EXPORT_SYMBOL(end_page_writeback); + +/* + * After completing I/O on a page, call this routine to update the page + * flags appropriately + */ +void page_endio(struct page *page, int rw, int err) +{ + if (rw == READ) { + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* rw == WRITE */ + if (err) { + SetPageError(page); + if (page->mapping) + mapping_set_error(page->mapping, err); + } + end_page_writeback(page); + } +} +EXPORT_SYMBOL_GPL(page_endio); + +/** + * __lock_page - get a lock on the page, assuming we need to sleep to get it + * @page: the page to lock + */ +void __lock_page(struct page *page) +{ + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + + __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, + TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(__lock_page); + +int 
__lock_page_killable(struct page *page) +{ + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + + return __wait_on_bit_lock(page_waitqueue(page), &wait, + bit_wait_io, TASK_KILLABLE); +} +EXPORT_SYMBOL_GPL(__lock_page_killable); + +/* + * Return values: + * 1 - page is locked; mmap_sem is still held. + * 0 - page is not locked. + * mmap_sem has been released (up_read()), unless flags had both + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in + * which case mmap_sem is still held. + * + * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 + * with the page locked and the mmap_sem unperturbed. + */ +int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags) +{ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + /* + * CAUTION! In this case, mmap_sem is not released + * even though return 0. + */ + if (flags & FAULT_FLAG_RETRY_NOWAIT) + return 0; + + up_read(&mm->mmap_sem); + if (flags & FAULT_FLAG_KILLABLE) + wait_on_page_locked_killable(page); + else + wait_on_page_locked(page); + return 0; + } else { + if (flags & FAULT_FLAG_KILLABLE) { + int ret; + + ret = __lock_page_killable(page); + if (ret) { + up_read(&mm->mmap_sem); + return 0; + } + } else + __lock_page(page); + return 1; + } +} + +/** + * page_cache_next_hole - find the next hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the + * lowest indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= + * max_scan' will be true). In rare cases of index wrap-around, 0 will + * be returned. + * + * page_cache_next_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. 
For example, if a + * hole is created at index 5, then subsequently a hole is created at + * index 10, page_cache_next_hole covering both indexes may return 10 + * if called under rcu_read_lock. + */ +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_next_hole); + +/** + * page_cache_prev_hole - find the prev hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] for + * the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= + * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX + * will be returned. + * + * page_cache_prev_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 10, then subsequently a hole is created at + * index 5, page_cache_prev_hole covering both indexes may return 5 if + * called under rcu_read_lock. 
+ */ +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index--; + if (index == ULONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_prev_hole); + +/** + * find_get_entry - find and get a page cache entry + * @mapping: the address_space to search + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. + * + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. + * + * Otherwise, %NULL is returned. + */ +struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) +{ + void **pagep; + struct page *page; + + rcu_read_lock(); +repeat: + page = NULL; + pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + if (pagep) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page)) + goto out; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto repeat; + /* + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. + */ + goto out; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + goto repeat; + } + } +out: + rcu_read_unlock(); + + return page; +} +EXPORT_SYMBOL(find_get_entry); + +/** + * find_lock_entry - locate, pin and lock a page cache entry + * @mapping: the address_space to search + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. 
If there is a + * page cache page, it is returned locked and with an increased + * refcount. + * + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. + * + * Otherwise, %NULL is returned. + * + * find_lock_entry() may sleep. + */ +struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) +{ + struct page *page; + +repeat: + page = find_get_entry(mapping, offset); + if (page && !radix_tree_exception(page)) { + lock_page(page); + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + VM_BUG_ON_PAGE(page->index != offset, page); + } + return page; +} +EXPORT_SYMBOL(find_lock_entry); + +/** + * pagecache_get_page - find and get a page reference + * @mapping: the address_space to search + * @offset: the page index + * @fgp_flags: PCG flags + * @gfp_mask: gfp mask to use for the page cache data page allocation + * + * Looks up the page cache slot at @mapping & @offset. + * + * PCG flags modify how the page is returned. + * + * FGP_ACCESSED: the page will be marked accessed + * FGP_LOCK: Page is return locked + * FGP_CREAT: If page is not present then a new page is allocated using + * @gfp_mask and added to the page cache and the VM's LRU + * list. The page is returned locked and with an increased + * refcount. Otherwise, %NULL is returned. + * + * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even + * if the GFP flags specified for FGP_CREAT are atomic. + * + * If there is a page cache page, it is returned with an increased refcount. 
+ */ +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, + int fgp_flags, gfp_t gfp_mask) +{ + struct page *page; + +repeat: + page = find_get_entry(mapping, offset); + if (radix_tree_exceptional_entry(page)) + page = NULL; + if (!page) + goto no_page; + + if (fgp_flags & FGP_LOCK) { + if (fgp_flags & FGP_NOWAIT) { + if (!trylock_page(page)) { + page_cache_release(page); + return NULL; + } + } else { + lock_page(page); + } + + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + VM_BUG_ON_PAGE(page->index != offset, page); + } + + if (page && (fgp_flags & FGP_ACCESSED)) + mark_page_accessed(page); + +no_page: + if (!page && (fgp_flags & FGP_CREAT)) { + int err; + if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) + gfp_mask |= __GFP_WRITE; + if (fgp_flags & FGP_NOFS) + gfp_mask &= ~__GFP_FS; + + page = __page_cache_alloc(gfp_mask); + if (!page) + return NULL; + + if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + fgp_flags |= FGP_LOCK; + + /* Init accessed so avoid atomic mark_page_accessed later */ + if (fgp_flags & FGP_ACCESSED) + __SetPageReferenced(page); + + err = add_to_page_cache_lru(page, mapping, offset, + gfp_mask & GFP_RECLAIM_MASK); + if (unlikely(err)) { + page_cache_release(page); + page = NULL; + if (err == -EEXIST) + goto repeat; + } + } + + return page; +} +EXPORT_SYMBOL(pagecache_get_page); + +/** + * find_get_entries - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page cache index + * @nr_entries: The maximum number of entries + * @entries: Where the resulting entries are placed + * @indices: The cache indices corresponding to the entries in @entries + * + * find_get_entries() will search for and return a group of up to + * @nr_entries entries in the mapping. The entries are placed at + * @entries. 
find_get_entries() takes a reference against any actual + * pages it returns. + * + * The search returns a group of mapping-contiguous page cache entries + * with ascending indexes. There may be holes in the indices due to + * not-present pages. + * + * Any shadow entries of evicted pages, or swap entries from + * shmem/tmpfs, are included in the returned array. + * + * find_get_entries() returns the number of pages and shadow entries + * which were found. + */ +unsigned find_get_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + /* + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} + +/** + * find_get_pages - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page index + * @nr_pages: The maximum number of pages + * @pages: Where the resulting pages are placed + * + * find_get_pages() will search for and return a group of up to + * @nr_pages pages in the mapping. The pages are placed at @pages. + * find_get_pages() takes a reference against the returned pages. + * + * The search returns a group of mapping-contiguous pages with ascending + * indexes. 
There may be holes in the indices due to not-present pages. + * + * find_get_pages() returns the number of pages which were found. + */ +unsigned find_get_pages(struct address_space *mapping, pgoff_t start, + unsigned int nr_pages, struct page **pages) +{ + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + WARN_ON(iter.index); + goto restart; + } + /* + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Skip + * over it. + */ + continue; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + if (++ret == nr_pages) + break; + } + + rcu_read_unlock(); + return ret; +} + +/** + * find_get_pages_contig - gang contiguous pagecache lookup + * @mapping: The address_space to search + * @index: The starting page index + * @nr_pages: The maximum number of pages + * @pages: Where the resulting pages are placed + * + * find_get_pages_contig() works exactly like find_get_pages(), except + * that the returned number of pages are guaranteed to be contiguous. + * + * find_get_pages_contig() returns the number of pages which were found. 
+ */ +unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, + unsigned int nr_pages, struct page **pages) +{ + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; + + if (unlikely(!nr_pages)) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + /* The hole, there no reason to continue */ + if (unlikely(!page)) + break; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + /* + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Stop + * looking for contiguous pages. + */ + break; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + /* + * must check mapping and index after taking the ref. + * otherwise we can get both false positives and false + * negatives, which is just confusing to the caller. + */ + if (page->mapping == NULL || page->index != iter.index) { + page_cache_release(page); + break; + } + + pages[ret] = page; + if (++ret == nr_pages) + break; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(find_get_pages_contig); + +/** + * find_get_pages_tag - find and return pages that match @tag + * @mapping: the address_space to search + * @index: the starting page index + * @tag: the tag index + * @nr_pages: the maximum number of pages + * @pages: where the resulting pages are placed + * + * Like find_get_pages, except we only return pages which are tagged with + * @tag. We update @index to index the next page for the traversal. 
+ */ +unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, + int tag, unsigned int nr_pages, struct page **pages) +{ + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, *index, tag) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + /* + * A shadow entry of a recently evicted page. + * + * Those entries should never be tagged, but + * this tree walk is lockless and the tags are + * looked up in bulk, one radix tree node at a + * time, so there is a sizable window for page + * reclaim to evict a page we saw tagged. + * + * Skip over it. + */ + continue; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + if (++ret == nr_pages) + break; + } + + rcu_read_unlock(); + + if (ret) + *index = pages[ret - 1]->index + 1; + + return ret; +} +EXPORT_SYMBOL(find_get_pages_tag); + +/* + * CD/DVDs are error prone. When a medium error occurs, the driver may fail + * a _large_ part of the i/o request. Imagine the worst scenario: + * + * ---R__________________________________________B__________ + * ^ reading here ^ bad block(assume 4k) + * + * read(R) => miss => readahead(R...B) => media error => frustrating retries + * => failing the whole request => read(R) => read(R+1) => + * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => + * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => + * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... 
+ * + * It is going insane. Fix it by quickly scaling down the readahead size. + */ +static void shrink_readahead_size_eio(struct file *filp, + struct file_ra_state *ra) +{ + ra->ra_pages /= 4; +} + +/** + * do_generic_file_read - generic file read routine + * @filp: the file to read + * @ppos: current file position + * @iter: data destination + * @written: already copied + * + * This is a generic file read routine, and uses the + * mapping->a_ops->readpage() function for the actual low-level stuff. + * + * This is really ugly. But the goto's actually try to clarify some + * of the logic when it comes to error handling etc. + */ +static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, + struct iov_iter *iter, ssize_t written) +{ + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; + struct file_ra_state *ra = &filp->f_ra; + pgoff_t index; + pgoff_t last_index; + pgoff_t prev_index; + unsigned long offset; /* offset into pagecache page */ + unsigned int prev_offset; + int error = 0; + + index = *ppos >> PAGE_CACHE_SHIFT; + prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; + prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); + last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + + for (;;) { + struct page *page; + pgoff_t end_index; + loff_t isize; + unsigned long nr, ret; + + cond_resched(); +find_page: + page = find_get_page(mapping, index); + if (!page) { + page_cache_sync_readahead(mapping, + ra, filp, + index, last_index - index); + page = find_get_page(mapping, index); + if (unlikely(page == NULL)) + goto no_cached_page; + } + if (PageReadahead(page)) { + page_cache_async_readahead(mapping, + ra, filp, page, + index, last_index - index); + } + if (!PageUptodate(page)) { + if (inode->i_blkbits == PAGE_CACHE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + if (!trylock_page(page)) + goto page_not_up_to_date; + /* Did it get 
truncated before we got the lock? */ + if (!page->mapping) + goto page_not_up_to_date_locked; + if (!mapping->a_ops->is_partially_uptodate(page, + offset, iter->count)) + goto page_not_up_to_date_locked; + unlock_page(page); + } +page_ok: + /* + * i_size must be checked after we know the page is Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + + isize = i_size_read(inode); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + goto out; + } + + /* nr is the maximum number of bytes to copy from this page */ + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (nr <= offset) { + page_cache_release(page); + goto out; + } + } + nr = nr - offset; + + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + + /* + * When a sequential read accesses a page several times, + * only mark it as accessed the first time. + */ + if (prev_index != index || offset != prev_offset) + mark_page_accessed(page); + prev_index = index; + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + */ + + ret = copy_page_to_iter(page, offset, nr, iter); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + prev_offset = offset; + + page_cache_release(page); + written += ret; + if (!iov_iter_count(iter)) + goto out; + if (ret < nr) { + error = -EFAULT; + goto out; + } + continue; + +page_not_up_to_date: + /* Get exclusive access to the page ... 
*/ + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; + +page_not_up_to_date_locked: + /* Did it get truncated before we got the lock? */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + /* Did somebody else fill it already? */ + if (PageUptodate(page)) { + unlock_page(page); + goto page_ok; + } + +readpage: + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. + * PG_error will be set again if readpage fails. + */ + ClearPageError(page); + /* Start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (unlikely(error)) { + if (error == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + error = 0; + goto find_page; + } + goto readpage_error; + } + + if (!PageUptodate(page)) { + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; + if (!PageUptodate(page)) { + if (page->mapping == NULL) { + /* + * invalidate_mapping_pages got it + */ + unlock_page(page); + page_cache_release(page); + goto find_page; + } + unlock_page(page); + shrink_readahead_size_eio(filp, ra); + error = -EIO; + goto readpage_error; + } + unlock_page(page); + } + + goto page_ok; + +readpage_error: + /* UHHUH! A synchronous read error occurred. Report it */ + page_cache_release(page); + goto out; + +no_cached_page: + /* + * Ok, it wasn't cached, so we need to create a new + * page.. + */ + page = page_cache_alloc_cold(mapping); + if (!page) { + error = -ENOMEM; + goto out; + } + error = add_to_page_cache_lru(page, mapping, + index, GFP_KERNEL); + if (error) { + page_cache_release(page); + if (error == -EEXIST) { + error = 0; + goto find_page; + } + goto out; + } + goto readpage; + } + +out: + ra->prev_pos = prev_index; + ra->prev_pos <<= PAGE_CACHE_SHIFT; + ra->prev_pos |= prev_offset; + + *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; + file_accessed(filp); + return written ? 
written : error; +} + +/** + * generic_file_read_iter - generic filesystem read routine + * @iocb: kernel I/O control block + * @iter: destination for the data read + * + * This is the "read_iter()" routine for all filesystems + * that can use the page cache directly. + */ +ssize_t +generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + ssize_t retval = 0; + loff_t *ppos = &iocb->ki_pos; + loff_t pos = *ppos; + + if (iocb->ki_flags & IOCB_DIRECT) { + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + loff_t size; + + if (!count) + goto out; /* skip atime */ + size = i_size_read(inode); + retval = filemap_write_and_wait_range(mapping, pos, + pos + count - 1); + if (!retval) { + struct iov_iter data = *iter; + retval = mapping->a_ops->direct_IO(iocb, &data, pos); + } + + if (retval > 0) { + *ppos = pos + retval; + iov_iter_advance(iter, retval); + } + + /* + * Btrfs can have a short DIO read if we encounter + * compressed extents, so if there was an error, or if + * we've already read everything we wanted to, or if + * there was a short read because we hit EOF, go ahead + * and return. Otherwise fallthrough to buffered io for + * the rest of the read. Buffered reads will not work for + * DAX files, so don't bother trying. + */ + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || + IS_DAX(inode)) { + file_accessed(file); + goto out; + } + } + + retval = do_generic_file_read(file, ppos, iter, retval); +out: + return retval; +} +EXPORT_SYMBOL(generic_file_read_iter); + +#ifdef CONFIG_MMU +/** + * page_cache_read - adds requested page to the page cache if not already there + * @file: file to read + * @offset: page index + * + * This adds the requested page to the page cache if it isn't already there, + * and schedules an I/O to read in its contents from disk. 
+ */ +static int page_cache_read(struct file *file, pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + struct page *page; + int ret; + + do { + page = page_cache_alloc_cold(mapping); + if (!page) + return -ENOMEM; + + ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + if (ret == 0) + ret = mapping->a_ops->readpage(file, page); + else if (ret == -EEXIST) + ret = 0; /* losing race to add is OK */ + + page_cache_release(page); + + } while (ret == AOP_TRUNCATED_PAGE); + + return ret; +} + +#define MMAP_LOTSAMISS (100) + +/* + * Synchronous readahead happens when we don't even find + * a page in the page cache at all. + */ +static void do_sync_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + pgoff_t offset) +{ + unsigned long ra_pages; + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (vma->vm_flags & VM_RAND_READ) + return; + if (!ra->ra_pages) + return; + + if (vma->vm_flags & VM_SEQ_READ) { + page_cache_sync_readahead(mapping, ra, file, offset, + ra->ra_pages); + return; + } + + /* Avoid banging the cache line if not needed */ + if (ra->mmap_miss < MMAP_LOTSAMISS * 10) + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > MMAP_LOTSAMISS) + return; + + /* + * mmap read-around + */ + ra_pages = max_sane_readahead(ra->ra_pages); + ra->start = max_t(long, 0, offset - ra_pages / 2); + ra->size = ra_pages; + ra->async_size = ra_pages / 4; + ra_submit(ra, mapping, file); +} + +/* + * Asynchronous readahead happens when we find the page and PG_readahead, + * so we want to possibly extend the readahead further.. 
+ */ +static void do_async_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + struct page *page, + pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (vma->vm_flags & VM_RAND_READ) + return; + if (ra->mmap_miss > 0) + ra->mmap_miss--; + if (PageReadahead(page)) + page_cache_async_readahead(mapping, ra, file, + page, offset, ra->ra_pages); +} + +/** + * filemap_fault - read in file data for page fault handling + * @vma: vma in which the fault was taken + * @vmf: struct vm_fault containing details of the fault + * + * filemap_fault() is invoked via the vma operations vector for a + * mapped memory region to read in file data during a page fault. + * + * The goto's are kind of ugly, but this streamlines the normal case of having + * it in the page cache, and handles the special cases reasonably without + * having a lot of duplicated code. + * + * vma->vm_mm->mmap_sem must be held on entry. + * + * If our return value has VM_FAULT_RETRY set, it's because + * lock_page_or_retry() returned 0. + * The mmap_sem has usually been released in this case. + * See __lock_page_or_retry() for the exception. + * + * If our return value does not have VM_FAULT_RETRY set, the mmap_sem + * has not been released. + * + * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. + */ +int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int error; + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct file_ra_state *ra = &file->f_ra; + struct inode *inode = mapping->host; + pgoff_t offset = vmf->pgoff; + struct page *page; + loff_t size; + int ret = 0; + + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (offset >= size >> PAGE_CACHE_SHIFT) + return VM_FAULT_SIGBUS; + + /* + * Do we have something in the page cache already? 
+ */ + page = find_get_page(mapping, offset); + if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { + /* + * We found the page, so try async readahead before + * waiting for the lock. + */ + do_async_mmap_readahead(vma, ra, file, page, offset); + } else if (!page) { + /* No page in the page cache at all */ + do_sync_mmap_readahead(vma, ra, file, offset); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + ret = VM_FAULT_MAJOR; +retry_find: + page = find_get_page(mapping, offset); + if (!page) + goto no_cached_page; + } + + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return ret | VM_FAULT_RETRY; + } + + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto retry_find; + } + VM_BUG_ON_PAGE(page->index != offset, page); + + /* + * We have a locked page in the page cache, now we need to check + * that it's up-to-date. If not, it is going to be due to an error. + */ + if (unlikely(!PageUptodate(page))) + goto page_not_uptodate; + + /* + * Found the page and have a reference on it. + * We must recheck i_size under page lock. + */ + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) { + unlock_page(page); + page_cache_release(page); + return VM_FAULT_SIGBUS; + } + + vmf->page = page; + return ret | VM_FAULT_LOCKED; + +no_cached_page: + /* + * We're only likely to ever get here if MADV_RANDOM is in + * effect. + */ + error = page_cache_read(file, offset); + + /* + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. + */ + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. 
+ */ + if (error == -ENOMEM) + return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; + +page_not_uptodate: + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + ClearPageError(page); + error = mapping->a_ops->readpage(file, page); + if (!error) { + wait_on_page_locked(page); + if (!PageUptodate(page)) + error = -EIO; + } + page_cache_release(page); + + if (!error || error == AOP_TRUNCATED_PAGE) + goto retry_find; + + /* Things didn't work out: tell the mm layer with VM_FAULT_SIGBUS. */ + shrink_readahead_size_eio(file, ra); + return VM_FAULT_SIGBUS; +} +EXPORT_SYMBOL(filemap_fault); + +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct radix_tree_iter iter; + void **slot; + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + loff_t size; + struct page *page; + unsigned long address = (unsigned long) vmf->virtual_address; + unsigned long addr; + pte_t *pte; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { + if (iter.index > vmf->max_pgoff) + break; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + goto next; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + break; + else + goto next; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? 
*/ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + if (!PageUptodate(page) || + PageReadahead(page) || + PageHWPoison(page)) + goto skip; + if (!trylock_page(page)) + goto skip; + + if (page->mapping != mapping || !PageUptodate(page)) + goto unlock; + + size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE); + if (page->index >= size >> PAGE_CACHE_SHIFT) + goto unlock; + + pte = vmf->pte + page->index - vmf->pgoff; + if (!pte_none(*pte)) + goto unlock; + + if (file->f_ra.mmap_miss > 0) + file->f_ra.mmap_miss--; + addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; + do_set_pte(vma, addr, page, pte, false, false); + unlock_page(page); + goto next; +unlock: + unlock_page(page); +skip: + page_cache_release(page); +next: + if (iter.index == vmf->max_pgoff) + break; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(filemap_map_pages); + +int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + int ret = VM_FAULT_LOCKED; + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + lock_page(page); + if (page->mapping != inode->i_mapping) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + /* + * We mark the page dirty already here so that when freeze is in + * progress, we are guaranteed that writeback during freezing will + * see the dirty page and writeprotect it again. 
+ */ + set_page_dirty(page); + wait_for_stable_page(page); +out: + sb_end_pagefault(inode->i_sb); + return ret; +} +EXPORT_SYMBOL(filemap_page_mkwrite); + +const struct vm_operations_struct generic_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = filemap_page_mkwrite, +}; + +/* This is used for a general mmap of a disk file */ + +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &generic_file_vm_ops; + return 0; +} + +/* + * This is for filesystems which do not implement ->writepage. + */ +int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + return generic_file_mmap(file, vma); +} +#else +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + return -ENOSYS; +} +int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) +{ + return -ENOSYS; +} +#endif /* CONFIG_MMU */ + +EXPORT_SYMBOL(generic_file_mmap); +EXPORT_SYMBOL(generic_file_readonly_mmap); + +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + +static struct page *__read_cache_page(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *, struct page *), + void *data, + gfp_t gfp) +{ + struct page *page; + int err; +repeat: + page = find_get_page(mapping, index); + if (!page) { + page = __page_cache_alloc(gfp | __GFP_COLD); + if (!page) + return ERR_PTR(-ENOMEM); + err = add_to_page_cache_lru(page, mapping, index, gfp); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; + /* Presumably ENOMEM for radix tree node */ + return 
ERR_PTR(err); + } + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } else { + page = wait_on_page_read(page); + } + } + return page; +} + +static struct page *do_read_cache_page(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *, struct page *), + void *data, + gfp_t gfp) + +{ + struct page *page; + int err; + +retry: + page = __read_cache_page(mapping, index, filler, data, gfp); + if (IS_ERR(page)) + return page; + if (PageUptodate(page)) + goto out; + + lock_page(page); + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + goto retry; + } + if (PageUptodate(page)) { + unlock_page(page); + goto out; + } + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + return ERR_PTR(err); + } else { + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; + } +out: + mark_page_accessed(page); + return page; +} + +/** + * read_cache_page - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: first arg to filler(data, page) function, often left as NULL + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page and wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *, struct page *), + void *data) +{ + return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); +} +EXPORT_SYMBOL(read_cache_page); + +/** + * read_cache_page_gfp - read into page cache, using specified page allocation flags. 
+ * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating + * + * This is the same as "read_mapping_page(mapping, index, NULL)", but with + * any new page allocations done using the specified allocation flags. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_gfp(struct address_space *mapping, + pgoff_t index, + gfp_t gfp) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + + return do_read_cache_page(mapping, index, filler, NULL, gfp); +} +EXPORT_SYMBOL(read_cache_page_gfp); + +/* + * Performs necessary checks before doing a write + * + * Can adjust writing position or amount of bytes to write. + * Returns appropriate error code that caller should return or + * zero in case that write should be allowed. + */ +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + unsigned long limit = rlimit(RLIMIT_FSIZE); + loff_t pos; + + if (!iov_iter_count(from)) + return 0; + + /* FIXME: this is for backwards compatibility with 2.4 */ + if (iocb->ki_flags & IOCB_APPEND) + iocb->ki_pos = i_size_read(inode); + + pos = iocb->ki_pos; + + if (limit != RLIM_INFINITY) { + if (iocb->ki_pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + iov_iter_truncate(from, limit - (unsigned long)pos); + } + + /* + * LFS rule + */ + if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && + !(file->f_flags & O_LARGEFILE))) { + if (pos >= MAX_NON_LFS) + return -EFBIG; + iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); + } + + /* + * Are we about to exceed the fs block limit ? + * + * If we have written data it becomes a short write. If we have + * exceeded without writing data we send a signal and return EFBIG. + * Linus frestrict idea will clean these up nicely.. 
+ */ + if (unlikely(pos >= inode->i_sb->s_maxbytes)) + return -EFBIG; + + iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); + return iov_iter_count(from); +} +EXPORT_SYMBOL(generic_write_checks); + +int pagecache_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + return aops->write_begin(file, mapping, pos, len, flags, + pagep, fsdata); +} +EXPORT_SYMBOL(pagecache_write_begin); + +int pagecache_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + return aops->write_end(file, mapping, pos, len, copied, page, fsdata); +} +EXPORT_SYMBOL(pagecache_write_end); + +ssize_t +generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t written; + size_t write_len; + pgoff_t end; + struct iov_iter data; + + write_len = iov_iter_count(from); + end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; + + written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); + if (written) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + if (mapping->nrpages) { + written = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. 
+ */ + if (written) { + if (written == -EBUSY) + return 0; + goto out; + } + } + + data = *from; + written = mapping->a_ops->direct_IO(iocb, &data, pos); + + /* + * Finally, try again to invalidate clean pages which might have been + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... + */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + } + + if (written > 0) { + pos += written; + iov_iter_advance(from, written); + if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + iocb->ki_pos = pos; + } +out: + return written; +} +EXPORT_SYMBOL(generic_file_direct_write); + +/* + * Find or create a page at the given pagecache position. Return the locked + * page. This function is specifically for buffered writes. + */ +struct page *grab_cache_page_write_begin(struct address_space *mapping, + pgoff_t index, unsigned flags) +{ + struct page *page; + int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; + + if (flags & AOP_FLAG_NOFS) + fgp_flags |= FGP_NOFS; + + page = pagecache_get_page(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); + if (page) + wait_for_stable_page(page); + + return page; +} +EXPORT_SYMBOL(grab_cache_page_write_begin); + +ssize_t generic_perform_write(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + long status = 0; + ssize_t written = 0; + unsigned int flags = 0; + + /* + * Copies from kernel address space cannot fail (NFSD is a big user). 
+ */ + if (!iter_is_iovec(i)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); + +again: + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = a_ops->write_begin(file, mapping, pos, bytes, flags, + &page, &fsdata); + if (unlikely(status < 0)) + break; + + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + flush_dcache_page(page); + + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + iov_iter_advance(i, copied); + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + pos += copied; + written += copied; + + balance_dirty_pages_ratelimited(mapping); + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } + } while (iov_iter_count(i)); + + return written ? 
written : status; +} +EXPORT_SYMBOL(generic_perform_write); + +/** + * __generic_file_write_iter - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @from: iov_iter with data to write + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * It expects i_mutex to be grabbed unless we work on a block device or similar + * object which does not need locking at all. + * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_mutex. + */ +ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space * mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t written = 0; + ssize_t err; + ssize_t status; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(inode); + err = file_remove_suid(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + + if (iocb->ki_flags & IOCB_DIRECT) { + loff_t pos, endbyte; + + written = generic_file_direct_write(iocb, from, iocb->ki_pos); + /* + * If the write stopped short of completing, fall back to + * buffered writes. Some filesystems do this for writes to + * holes, for example. For DAX files, a buffered write will + * not succeed (even if it did, DAX does not handle dirty + * page-cache pages correctly). 
+ */ + if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) + goto out; + + status = generic_perform_write(file, from, pos = iocb->ki_pos); + /* + * If generic_perform_write() returned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (unlikely(status < 0)) { + err = status; + goto out; + } + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + endbyte = pos + status - 1; + err = filemap_write_and_wait_range(mapping, pos, endbyte); + if (err == 0) { + iocb->ki_pos = endbyte + 1; + written += status; + invalidate_mapping_pages(mapping, + pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } + } else { + written = generic_perform_write(file, from, iocb->ki_pos); + if (likely(written > 0)) + iocb->ki_pos += written; + } +out: + current->backing_dev_info = NULL; + return written ? written : err; +} +EXPORT_SYMBOL(__generic_file_write_iter); + +/** + * generic_file_write_iter - write data to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * This is a wrapper around __generic_file_write_iter() to be used by most + * filesystems. It takes care of syncing the file in case of O_SYNC file + * and acquires i_mutex as needed. 
+ */ +ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = __generic_file_write_iter(iocb, from); + mutex_unlock(&inode->i_mutex); + + if (ret > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL(generic_file_write_iter); + +/** + * try_to_release_page() - release old fs-specific metadata on a page + * + * @page: the page which the kernel is trying to free + * @gfp_mask: memory allocation flags (and I/O mode) + * + * The address_space is to try to release any data against the page + * (presumably at page->private). If the release was successful, return `1'. + * Otherwise return zero. + * + * This may also be called if PG_fscache is set on a page, indicating that the + * page is known to the local caching routines. + * + * The @gfp_mask argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). + * + */ +int try_to_release_page(struct page *page, gfp_t gfp_mask) +{ + struct address_space * const mapping = page->mapping; + + BUG_ON(!PageLocked(page)); + if (PageWriteback(page)) + return 0; + + if (mapping && mapping->a_ops->releasepage) + return mapping->a_ops->releasepage(page, gfp_mask); + return try_to_free_buffers(page); +} + +EXPORT_SYMBOL(try_to_release_page); diff --git a/kernel/mm/frontswap.c b/kernel/mm/frontswap.c new file mode 100644 index 000000000..8d82809eb --- /dev/null +++ b/kernel/mm/frontswap.c @@ -0,0 +1,457 @@ +/* + * Frontswap frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of frontswap. See + * Documentation/vm/frontswap.txt for more information. 
+ * + * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * frontswap_ops is set by frontswap_register_ops to contain the pointers + * to the frontswap "backend" implementation functions. + */ +static struct frontswap_ops *frontswap_ops __read_mostly; + +/* + * If enabled, frontswap_store will return failure even on success. As + * a result, the swap subsystem will always write the page to swap, in + * effect converting frontswap into a writethrough cache. In this mode, + * there is no direct reduction in swap writes, but a frontswap backend + * can unilaterally "reclaim" any pages in use with no data loss, thus + * providing increased control over maximum memory usage due to frontswap. + */ +static bool frontswap_writethrough_enabled __read_mostly; + +/* + * If enabled, the underlying tmem implementation is capable of doing + * exclusive gets, so frontswap_load, on a successful tmem_get, must + * mark the page as no longer in frontswap AND mark it dirty. + */ +static bool frontswap_tmem_exclusive_gets_enabled __read_mostly; + +#ifdef CONFIG_DEBUG_FS +/* + * Counters available via /sys/kernel/debug/frontswap (if debugfs is + * properly configured). These are for information only so are not protected + * against increment races.
+ */
+static u64 frontswap_loads;		/* __frontswap_load() calls that returned 0 */
+static u64 frontswap_succ_stores;	/* backend ->store() calls that returned 0 */
+static u64 frontswap_failed_stores;	/* backend ->store() calls that failed */
+static u64 frontswap_invalidates;	/* pages dropped by __frontswap_invalidate_page() */
+
+static inline void inc_frontswap_loads(void) {
+	frontswap_loads++;	/* plain increment: racy by design, see above */
+}
+static inline void inc_frontswap_succ_stores(void) {
+	frontswap_succ_stores++;
+}
+static inline void inc_frontswap_failed_stores(void) {
+	frontswap_failed_stores++;
+}
+static inline void inc_frontswap_invalidates(void) {
+	frontswap_invalidates++;
+}
+#else	/* !CONFIG_DEBUG_FS: no counters, the helpers are empty */
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
+static inline void inc_frontswap_invalidates(void) { }
+#endif	/* CONFIG_DEBUG_FS */
+
+/*
+ * Due to the asynchronous nature of the backends loading potentially
+ * _after_ the swap system has been activated, we have chokepoints
+ * on all frontswap functions to not call the backend until the backend
+ * has registered.
+ *
+ * Specifically when no backend is registered (nobody called
+ * frontswap_register_ops) all calls to frontswap_init (which is done via
+ * swapon -> enable_swap_info -> frontswap_init) are registered and remembered
+ * (via the setting of need_init bitmap) but fail to create tmem_pools. When a
+ * backend registers with frontswap at some later point the previous
+ * calls to frontswap_init are executed (by iterating over the need_init
+ * bitmap) to create tmem_pools and set the respective poolids. All of that is
+ * guarded by us using atomic bit operations on the 'need_init' bitmap.
+ *
+ * This would not guard us against the user deciding to call swapoff right as
+ * we are calling the backend to initialize (so swapon is in action).
+ * Fortunately for us, the swapon_mutex has been taken by the callee so we are
+ * OK. The other scenario where calls to frontswap_store (called via
+ * swap_writepage) is racing with frontswap_invalidate_area (called via
+ * swapoff) is again guarded by the swap subsystem.
+ *
+ * While no backend is registered all calls to frontswap_[store|load|
+ * invalidate_area|invalidate_page] are ignored or fail.
+ *
+ * The time between the backend being registered and the swap file system
+ * calling the backend (via the frontswap_* functions) is indeterminate as
+ * frontswap_ops is not atomic_t (or a value guarded by a spinlock).
+ * That is OK as we are comfortable missing some of these calls to the newly
+ * registered backend.
+ *
+ * Obviously the opposite (unloading the backend) must be done after all
+ * the frontswap_[store|load|invalidate_area|invalidate_page] start
+ * ignoring or failing the requests - at which point frontswap_ops
+ * would have to be made in some fashion atomic.
+ */
+static DECLARE_BITMAP(need_init, MAX_SWAPFILES);
+
+/*
+ * Register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting.
+ *
+ * Every swap type flagged in need_init gets its deferred ops->init() call
+ * before frontswap_ops is published.  If a flagged type has no
+ * frontswap_map, ERR_PTR(-EINVAL) is returned instead of the previous ops.
+ * NOTE(review): on that early return the need_init bit has already been
+ * cleared and lower-numbered types have already been initialised, while
+ * frontswap_ops is left unchanged - confirm that callers can cope.
+ */
+struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops)
+{
+	struct frontswap_ops *old = frontswap_ops;
+	int i;
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		if (test_and_clear_bit(i, need_init)) {
+			struct swap_info_struct *sis = swap_info[i];
+			/* __frontswap_init _should_ have set it! */
+			if (!sis->frontswap_map)
+				return ERR_PTR(-EINVAL);
+			ops->init(i);
+		}
+	}
+	/*
+	 * We MUST have frontswap_ops set _after_ the frontswap_init's
+	 * have been called. Otherwise __frontswap_store might fail. Hence
+	 * the barrier to make sure the compiler does not re-order us.
+	 */
+	barrier();
+	frontswap_ops = ops;
+	return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/*
+ * Enable/disable frontswap writethrough (see the comment on
+ * frontswap_writethrough_enabled above).
+ */
+void frontswap_writethrough(bool enable)
+{
+	frontswap_writethrough_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_writethrough);
+
+/*
+ * Enable/disable frontswap exclusive gets (see above).
+ */ +void frontswap_tmem_exclusive_gets(bool enable) +{ + frontswap_tmem_exclusive_gets_enabled = enable; +} +EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); + +/* + * Called when a swap device is swapon'd. + */ +void __frontswap_init(unsigned type, unsigned long *map) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + + /* + * p->frontswap is a bitmap that we MUST have to figure out which page + * has gone in frontswap. Without it there is no point of continuing. + */ + if (WARN_ON(!map)) + return; + /* + * Irregardless of whether the frontswap backend has been loaded + * before this function or it will be later, we _MUST_ have the + * p->frontswap set to something valid to work properly. + */ + frontswap_map_set(sis, map); + if (frontswap_ops) + frontswap_ops->init(type); + else { + BUG_ON(type >= MAX_SWAPFILES); + set_bit(type, need_init); + } +} +EXPORT_SYMBOL(__frontswap_init); + +bool __frontswap_test(struct swap_info_struct *sis, + pgoff_t offset) +{ + bool ret = false; + + if (frontswap_ops && sis->frontswap_map) + ret = test_bit(offset, sis->frontswap_map); + return ret; +} +EXPORT_SYMBOL(__frontswap_test); + +static inline void __frontswap_clear(struct swap_info_struct *sis, + pgoff_t offset) +{ + clear_bit(offset, sis->frontswap_map); + atomic_dec(&sis->frontswap_pages); +} + +/* + * "Store" data from a page to frontswap and associate it with the page's + * swaptype and offset. Page must be locked and in the swap cache. + * If frontswap already contains a page with matching swaptype and + * offset, the frontswap implementation may either overwrite the data and + * return success or invalidate the page from frontswap and return failure. + */ +int __frontswap_store(struct page *page) +{ + int ret = -1, dup = 0; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + /* + * Return if no backend registed. 
+ * Don't need to inc frontswap_failed_stores here. + */ + if (!frontswap_ops) + return ret; + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + if (__frontswap_test(sis, offset)) + dup = 1; + ret = frontswap_ops->store(type, offset, page); + if (ret == 0) { + set_bit(offset, sis->frontswap_map); + inc_frontswap_succ_stores(); + if (!dup) + atomic_inc(&sis->frontswap_pages); + } else { + /* + failed dup always results in automatic invalidate of + the (older) page from frontswap + */ + inc_frontswap_failed_stores(); + if (dup) { + __frontswap_clear(sis, offset); + frontswap_ops->invalidate_page(type, offset); + } + } + if (frontswap_writethrough_enabled) + /* report failure so swap also writes to swap device */ + ret = -1; + return ret; +} +EXPORT_SYMBOL(__frontswap_store); + +/* + * "Get" data from frontswap associated with swaptype and offset that were + * specified when the data was put to frontswap and use it to fill the + * specified page with data. Page must be locked and in the swap cache. + */ +int __frontswap_load(struct page *page) +{ + int ret = -1; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + BUG_ON(!PageLocked(page)); + BUG_ON(sis == NULL); + /* + * __frontswap_test() will check whether there is backend registered + */ + if (__frontswap_test(sis, offset)) + ret = frontswap_ops->load(type, offset, page); + if (ret == 0) { + inc_frontswap_loads(); + if (frontswap_tmem_exclusive_gets_enabled) { + SetPageDirty(page); + __frontswap_clear(sis, offset); + } + } + return ret; +} +EXPORT_SYMBOL(__frontswap_load); + +/* + * Invalidate any data from frontswap associated with the specified swaptype + * and offset so that a subsequent "get" will fail. 
+ */ +void __frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + struct swap_info_struct *sis = swap_info[type]; + + BUG_ON(sis == NULL); + /* + * __frontswap_test() will check whether there is backend registered + */ + if (__frontswap_test(sis, offset)) { + frontswap_ops->invalidate_page(type, offset); + __frontswap_clear(sis, offset); + inc_frontswap_invalidates(); + } +} +EXPORT_SYMBOL(__frontswap_invalidate_page); + +/* + * Invalidate all data from frontswap associated with all offsets for the + * specified swaptype. + */ +void __frontswap_invalidate_area(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + if (frontswap_ops) { + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + frontswap_ops->invalidate_area(type); + atomic_set(&sis->frontswap_pages, 0); + bitmap_zero(sis->frontswap_map, sis->max); + } + clear_bit(type, need_init); +} +EXPORT_SYMBOL(__frontswap_invalidate_area); + +static unsigned long __frontswap_curr_pages(void) +{ + unsigned long totalpages = 0; + struct swap_info_struct *si = NULL; + + assert_spin_locked(&swap_lock); + plist_for_each_entry(si, &swap_active_head, list) + totalpages += atomic_read(&si->frontswap_pages); + return totalpages; +} + +static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, + int *swapid) +{ + int ret = -EINVAL; + struct swap_info_struct *si = NULL; + int si_frontswap_pages; + unsigned long total_pages_to_unuse = total; + unsigned long pages = 0, pages_to_unuse = 0; + + assert_spin_locked(&swap_lock); + plist_for_each_entry(si, &swap_active_head, list) { + si_frontswap_pages = atomic_read(&si->frontswap_pages); + if (total_pages_to_unuse < si_frontswap_pages) { + pages = pages_to_unuse = total_pages_to_unuse; + } else { + pages = si_frontswap_pages; + pages_to_unuse = 0; /* unuse all */ + } + /* ensure there is enough RAM to fetch pages from frontswap */ + if (security_vm_enough_memory_mm(current->mm, pages)) { + ret = -ENOMEM; + continue; + } + 
vm_unacct_memory(pages); + *unused = pages_to_unuse; + *swapid = si->type; + ret = 0; + break; + } + + return ret; +} + +/* + * Used to check if it's necessory and feasible to unuse pages. + * Return 1 when nothing to do, 0 when need to shink pages, + * error code when there is an error. + */ +static int __frontswap_shrink(unsigned long target_pages, + unsigned long *pages_to_unuse, + int *type) +{ + unsigned long total_pages = 0, total_pages_to_unuse; + + assert_spin_locked(&swap_lock); + + total_pages = __frontswap_curr_pages(); + if (total_pages <= target_pages) { + /* Nothing to do */ + *pages_to_unuse = 0; + return 1; + } + total_pages_to_unuse = total_pages - target_pages; + return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); +} + +/* + * Frontswap, like a true swap device, may unnecessarily retain pages + * under certain circumstances; "shrink" frontswap is essentially a + * "partial swapoff" and works by calling try_to_unuse to attempt to + * unuse enough frontswap pages to attempt to -- subject to memory + * constraints -- reduce the number of pages in frontswap to the + * number given in the parameter target_pages. + */ +void frontswap_shrink(unsigned long target_pages) +{ + unsigned long pages_to_unuse = 0; + int uninitialized_var(type), ret; + + /* + * we don't want to hold swap_lock while doing a very + * lengthy try_to_unuse, but swap_list may change + * so restart scan from swap_active_head each time + */ + spin_lock(&swap_lock); + ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); + spin_unlock(&swap_lock); + if (ret == 0) + try_to_unuse(type, true, pages_to_unuse); + return; +} +EXPORT_SYMBOL(frontswap_shrink); + +/* + * Count and return the number of frontswap pages across all + * swap devices. This is exported so that backend drivers can + * determine current usage without reading debugfs. 
+ */
+unsigned long frontswap_curr_pages(void)
+{
+	unsigned long totalpages = 0;
+
+	/* __frontswap_curr_pages() asserts that swap_lock is held */
+	spin_lock(&swap_lock);
+	totalpages = __frontswap_curr_pages();
+	spin_unlock(&swap_lock);
+
+	return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+/*
+ * With CONFIG_DEBUG_FS, create the debugfs directory "frontswap" and one
+ * S_IRUGO u64 file per counter in it; without it, do nothing.
+ * Returns -ENXIO if debugfs_create_dir() returned NULL, 0 otherwise.
+ */
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *root = debugfs_create_dir("frontswap", NULL);
+	if (root == NULL)
+		return -ENXIO;
+	debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
+	debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
+	debugfs_create_u64("failed_stores", S_IRUGO, root,
+				&frontswap_failed_stores);
+	debugfs_create_u64("invalidates", S_IRUGO,
+				root, &frontswap_invalidates);
+#endif
+	return 0;
+}
+
+module_init(init_frontswap);
diff --git a/kernel/mm/gup.c b/kernel/mm/gup.c
new file mode 100644
index 000000000..6297f6bcc
--- /dev/null
+++ b/kernel/mm/gup.c
@@ -0,0 +1,1379 @@
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "internal.h"
+/*
+ * Result to hand back when the page tables hold nothing for the address:
+ * NULL in general, ERR_PTR(-EFAULT) for the FOLL_DUMP case described below.
+ */
+static struct page *no_page_table(struct vm_area_struct *vma,
+		unsigned int flags)
+{
+	/*
+	 * When core dumping an enormous anonymous area that nobody
+	 * has touched so far, we don't want to allocate unnecessary pages or
+	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
+	 * then get_dump_page() will return NULL to leave a hole in the dump.
+	 * But we can only make this optimization where a hole would surely
+	 * be zero-filled if handle_mm_fault() actually did handle it.
+	 */
+	if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
+		return ERR_PTR(-EFAULT);
+	return NULL;
+}
+/*
+ * Look up the page behind the pte that maps @address under @pmd.
+ *
+ * Returns the page (with a reference taken if FOLL_GET is set), NULL when
+ * the pte is not present, is protnone under FOLL_NUMA, or is not writable
+ * under FOLL_WRITE, the result of no_page_table() when there is no pte at
+ * all, or ERR_PTR(-EFAULT) when vm_normal_page() finds no page and the
+ * zero-page fallback does not apply.  The pte lock is taken and released
+ * inside this function on every path.
+ */
+static struct page *follow_page_pte(struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmd, unsigned int flags)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct page *page;
+	spinlock_t *ptl;
+	pte_t *ptep, pte;
+
+retry:
+	if (unlikely(pmd_bad(*pmd)))
+		return no_page_table(vma, flags);
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	pte = *ptep;
+	if (!pte_present(pte)) {
+		swp_entry_t entry;
+		/*
+		 * KSM's break_ksm() relies upon recognizing a ksm page
+		 * even while it is being migrated, so for that case we
+		 * need migration_entry_wait().
+		 */
+		if (likely(!(flags & FOLL_MIGRATION)))
+			goto no_page;
+		if (pte_none(pte))
+			goto no_page;
+		entry = pte_to_swp_entry(pte);
+		if (!is_migration_entry(entry))
+			goto no_page;
+		pte_unmap_unlock(ptep, ptl);
+		migration_entry_wait(mm, pmd, address);
+		goto retry;
+	}
+	if ((flags & FOLL_NUMA) && pte_protnone(pte))
+		goto no_page;
+	if ((flags & FOLL_WRITE) && !pte_write(pte)) {
+		pte_unmap_unlock(ptep, ptl);
+		return NULL;
+	}
+
+	page = vm_normal_page(vma, address, pte);
+	if (unlikely(!page)) {
+		/* only the zero page is acceptable here, and not for dumps */
+		if ((flags & FOLL_DUMP) ||
+		    !is_zero_pfn(pte_pfn(pte)))
+			goto bad_page;
+		page = pte_page(pte);
+	}
+
+	if (flags & FOLL_GET)
+		get_page_foll(page);
+	if (flags & FOLL_TOUCH) {
+		if ((flags & FOLL_WRITE) &&
+		    !pte_dirty(pte) && !PageDirty(page))
+			set_page_dirty(page);
+		/*
+		 * pte_mkyoung() would be more correct here, but atomic care
+		 * is needed to avoid losing the dirty bit: it is easier to use
+		 * mark_page_accessed().
+		 */
+		mark_page_accessed(page);
+	}
+	if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+		/*
+		 * The preliminary mapping check is mainly to avoid the
+		 * pointless overhead of lock_page on the ZERO_PAGE
+		 * which might bounce very badly if there is contention.
+		 *
+		 * If the page is already locked, we don't need to
+		 * handle it now - vmscan will handle it later if and
+		 * when it attempts to reclaim the page.
+		 */
+		if (page->mapping && trylock_page(page)) {
+			lru_add_drain();  /* push cached pages to LRU */
+			/*
+			 * Because we lock page here, and migration is
+			 * blocked by the pte's page reference, and we
+			 * know the page is still mapped, we don't even
+			 * need to check for file-cache page truncation.
+			 */
+			mlock_vma_page(page);
+			unlock_page(page);
+		}
+	}
+	pte_unmap_unlock(ptep, ptl);
+	return page;
+bad_page:
+	pte_unmap_unlock(ptep, ptl);
+	return ERR_PTR(-EFAULT);
+
+no_page:
+	pte_unmap_unlock(ptep, ptl);
+	if (!pte_none(pte))
+		return NULL;
+	return no_page_table(vma, flags);
+}
+
+/**
+ * follow_page_mask - look up a page descriptor from a user-virtual address
+ * @vma: vm_area_struct mapping @address
+ * @address: virtual address to look up
+ * @flags: flags modifying lookup behaviour
+ * @page_mask: on output, *page_mask is set according to the size of the page
+ *
+ * @flags can have FOLL_ flags set, defined in <linux/mm.h>
+ *
+ * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * an error pointer if there is a mapping to something not represented
+ * by a page descriptor (see also vm_normal_page()).
+ */
+struct page *follow_page_mask(struct vm_area_struct *vma,
+			      unsigned long address, unsigned int flags,
+			      unsigned int *page_mask)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	spinlock_t *ptl;
+	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;
+
+	*page_mask = 0;
+
+	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+	if (!IS_ERR(page)) {
+		BUG_ON(flags & FOLL_GET);
+		return page;
+	}
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		return no_page_table(vma, flags);
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud))
+		return no_page_table(vma, flags);
+	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
+		page = follow_huge_pud(mm, address, pud, flags);
+		if (page)
+			return page;
+		return no_page_table(vma, flags);
+	}
+	if (unlikely(pud_bad(*pud)))
+		return no_page_table(vma, flags);
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd))
+		return no_page_table(vma, flags);
+	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
+		page = follow_huge_pmd(mm, address, pmd, flags);
+		if (page)
+			return page;
+		return no_page_table(vma, flags);
+	}
+	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+		return no_page_table(vma, flags);
+	if (pmd_trans_huge(*pmd)) {
+		if (flags & FOLL_SPLIT) {
+			split_huge_page_pmd(vma, address, pmd);
+			return follow_page_pte(vma, address, pmd, flags);
+		}
+		ptl = pmd_lock(mm, pmd);
+		/* re-check now that the pmd lock is held */
+		if (likely(pmd_trans_huge(*pmd))) {
+			if (unlikely(pmd_trans_splitting(*pmd))) {
+				spin_unlock(ptl);
+				wait_split_huge_page(vma->anon_vma, pmd);
+			} else {
+				page = follow_trans_huge_pmd(vma, address,
+							     pmd, flags);
+				spin_unlock(ptl);
+				*page_mask = HPAGE_PMD_NR - 1;
+				return page;
+			}
+		} else
+			spin_unlock(ptl);
+	}
+	return follow_page_pte(vma, address, pmd, flags);
+}
+/*
+ * Resolve @address through get_gate_vma(): writes are refused, *@vma is
+ * set to the gate vma, and when @page is non-NULL the page found is
+ * stored in *@page with a reference taken by get_page().
+ * Returns 0 on success and -EFAULT otherwise.
+ */
+static int get_gate_page(struct mm_struct *mm, unsigned long address,
+		unsigned int gup_flags, struct vm_area_struct **vma,
+		struct page **page)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int ret = -EFAULT;
+
+	/* user gate pages are read-only */
+	if (gup_flags & FOLL_WRITE)
+		return -EFAULT;
+	if (address > TASK_SIZE)
+		pgd = pgd_offset_k(address);
+	else
+		pgd = pgd_offset_gate(mm, address);
+	BUG_ON(pgd_none(*pgd));
+	pud = pud_offset(pgd, address);
+	BUG_ON(pud_none(*pud));
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd))
+		return -EFAULT;
+	VM_BUG_ON(pmd_trans_huge(*pmd));
+	pte = pte_offset_map(pmd, address);
+	if (pte_none(*pte))
+		goto unmap;
+	*vma = get_gate_vma(mm);
+	if (!page)
+		goto out;
+	*page = vm_normal_page(*vma, address, *pte);
+	if (!*page) {
+		/* same zero-page fallback as in follow_page_pte() */
+		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
+			goto unmap;
+		*page = pte_page(*pte);
+	}
+	get_page(*page);
+out:
+	ret = 0;
+unmap:
+	pte_unmap(pte);
+	return ret;
+}
+
+/*
+ * mmap_sem must be held on entry.  If @nonblocking != NULL and
+ * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
+ * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
+ *
+ * Other return values: 0 when the fault was handled, -ENOENT for a stack
+ * guard page under FOLL_POPULATE, -ENOMEM, -EHWPOISON or -EFAULT for the
+ * corresponding VM_FAULT_* errors.  FOLL_WRITE may be cleared in *@flags,
+ * see the comment at the end of the function.
+ */
+static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
+		unsigned long address, unsigned int *flags, int *nonblocking)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned int fault_flags = 0;
+	int ret;
+
+	/* For mm_populate(), just skip the stack guard page. */
+	if ((*flags & FOLL_POPULATE) &&
+			(stack_guard_page_start(vma, address) ||
+			 stack_guard_page_end(vma, address + PAGE_SIZE)))
+		return -ENOENT;
+	if (*flags & FOLL_WRITE)
+		fault_flags |= FAULT_FLAG_WRITE;
+	if (nonblocking)
+		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+	if (*flags & FOLL_NOWAIT)
+		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
+	if (*flags & FOLL_TRIED) {
+		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
+		fault_flags |= FAULT_FLAG_TRIED;
+	}
+
+	ret = handle_mm_fault(mm, vma, address, fault_flags);
+	if (ret & VM_FAULT_ERROR) {
+		if (ret & VM_FAULT_OOM)
+			return -ENOMEM;
+		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+			return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
+		if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
+			return -EFAULT;
+		BUG();
+	}
+
+	/* fault accounting is optional: @tsk may be NULL */
+	if (tsk) {
+		if (ret & VM_FAULT_MAJOR)
+			tsk->maj_flt++;
+		else
+			tsk->min_flt++;
+	}
+
+	if (ret & VM_FAULT_RETRY) {
+		if (nonblocking)
+			*nonblocking = 0;
+		return -EBUSY;
+	}
+
+	/*
+	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
+	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
+	 * can thus safely do subsequent page lookups as if they were reads.
+	 * But only do so when looping for pte_write is futile: in some cases
+	 * userspace may also be wanting to write to the gotten user page,
+	 * which a read fault here might prevent (a readonly page might get
+	 * reCOWed by userspace write).
+	 */
+	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
+		*flags &= ~FOLL_WRITE;
+	return 0;
+}
+/*
+ * Decide whether @vma may be accessed with @gup_flags.
+ * Returns 0 if it may, -EFAULT if it may not.
+ */
+static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
+{
+	vm_flags_t vm_flags = vma->vm_flags;
+
+	if (vm_flags & (VM_IO | VM_PFNMAP))
+		return -EFAULT;
+
+	if (gup_flags & FOLL_WRITE) {
+		if (!(vm_flags & VM_WRITE)) {
+			if (!(gup_flags & FOLL_FORCE))
+				return -EFAULT;
+			/*
+			 * We used to let the write,force case do COW in a
+			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
+			 * set a breakpoint in a read-only mapping of an
+			 * executable, without corrupting the file (yet only
+			 * when that file had been opened for writing!).
+			 * Anon pages in shared mappings are surprising: now
+			 * just reject it.
+			 */
+			if (!is_cow_mapping(vm_flags)) {
+				WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
+				return -EFAULT;
+			}
+		}
+	} else if (!(vm_flags & VM_READ)) {
+		if (!(gup_flags & FOLL_FORCE))
+			return -EFAULT;
+		/*
+		 * Is there actually any vma we can reach here which does not
+		 * have VM_MAYREAD set?
+		 */
+		if (!(vm_flags & VM_MAYREAD))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+/**
+ * __get_user_pages() - pin user pages in memory
+ * @tsk:	task_struct of target task
+ * @mm:		mm_struct of target mm
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @gup_flags:	flags modifying pin behaviour
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long. Or NULL, if caller
+ *		only intends to ensure the pages are faulted in.
+ * @vmas:	array of pointers to vmas corresponding to each page.
+ *		Or NULL if the caller does not require them.
+ * @nonblocking: whether waiting for disk IO or mmap_sem contention
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held.  It may be released.  See below.
+ *
+ * __get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * __get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
+ * the page is written to, set_page_dirty (or set_page_dirty_lock, as
+ * appropriate) must be called after the page is finished with, and
+ * before put_page is called.
+ *
+ * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
+ * or mmap_sem contention, and if waiting is needed to pin all pages,
+ * *@nonblocking will be set to 0.  Further, if @gup_flags does not
+ * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
+ * this case.
+ *
+ * A caller using such a combination of @nonblocking and @gup_flags
+ * must therefore hold the mmap_sem for reading only, and recognize
+ * when it's been released.  Otherwise, it must be held for either
+ * reading or writing and will not be released.
+ *
+ * In most cases, get_user_pages or get_user_pages_fast should be used
+ * instead of __get_user_pages. __get_user_pages should be used only if
+ * you need some special @gup_flags.
+ */
+long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, unsigned long nr_pages,
+		unsigned int gup_flags, struct page **pages,
+		struct vm_area_struct **vmas, int *nonblocking)
+{
+	long i = 0;
+	unsigned int page_mask;
+	struct vm_area_struct *vma = NULL;
+
+	if (!nr_pages)
+		return 0;
+
+	/* a @pages array and FOLL_GET must be given together or not at all */
+	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+
+	/*
+	 * If FOLL_FORCE is set then do not force a full fault as the hinting
+	 * fault information is unrelated to the reference behaviour of a task
+	 * using the address space
+	 */
+	if (!(gup_flags & FOLL_FORCE))
+		gup_flags |= FOLL_NUMA;
+
+	do {
+		struct page *page;
+		unsigned int foll_flags = gup_flags;
+		unsigned int page_increm;
+
+		/* first iteration or cross vma bound */
+		if (!vma || start >= vma->vm_end) {
+			vma = find_extend_vma(mm, start);
+			if (!vma && in_gate_area(mm, start)) {
+				int ret;
+				ret = get_gate_page(mm, start & PAGE_MASK,
+						gup_flags, &vma,
+						pages ? &pages[i] : NULL);
+				if (ret)
+					return i ? : ret;	/* pages done so far, else the error */
+				page_mask = 0;
+				goto next_page;
+			}
+
+			if (!vma || check_vma_flags(vma, gup_flags))
+				return i ? : -EFAULT;
+			if (is_vm_hugetlb_page(vma)) {
+				i = follow_hugetlb_page(mm, vma, pages, vmas,
+						&start, &nr_pages, i,
+						gup_flags);
+				continue;
+			}
+		}
+retry:
+		/*
+		 * If we have a pending SIGKILL, don't keep faulting pages and
+		 * potentially allocating memory.
+		 */
+		if (unlikely(fatal_signal_pending(current)))
+			return i ? i : -ERESTARTSYS;
+		cond_resched();
+		page = follow_page_mask(vma, start, foll_flags, &page_mask);
+		if (!page) {
+			int ret;
+			ret = faultin_page(tsk, vma, start, &foll_flags,
+					nonblocking);
+			switch (ret) {
+			case 0:
+				goto retry;
+			case -EFAULT:
+			case -ENOMEM:
+			case -EHWPOISON:
+				return i ? i : ret;
+			case -EBUSY:
+				return i;
+			case -ENOENT:
+				goto next_page;
+			}
+			BUG();	/* faultin_page() returned an unexpected value */
+		}
+		if (IS_ERR(page))
+			return i ? i : PTR_ERR(page);
+		if (pages) {
+			pages[i] = page;
+			flush_anon_page(vma, page, start);
+			flush_dcache_page(page);
+			page_mask = 0;
+		}
+next_page:
+		if (vmas) {
+			vmas[i] = vma;
+			page_mask = 0;
+		}
+		/* with a zero page_mask this advances by exactly one page */
+		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+		if (page_increm > nr_pages)
+			page_increm = nr_pages;
+		i += page_increm;
+		start += page_increm * PAGE_SIZE;
+		nr_pages -= page_increm;
+	} while (nr_pages);
+	return i;
+}
+EXPORT_SYMBOL(__get_user_pages);
+
+/*
+ * fixup_user_fault() - manually resolve a user page fault
+ * @tsk:	the task_struct to use for page fault accounting, or
+ *		NULL if faults are not to be recorded.
+ * @mm:		mm_struct of target mm
+ * @address:	user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where for locking reasons
+ * we try to access user memory in atomic context (within a pagefault_disable()
+ * section), this returns -EFAULT, and we want to resolve the user fault before
+ * trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function will
+ * unconditionally call handle_mm_fault() which will in turn perform all the
+ * necessary SW fixup of the dirty and young bits in the PTE, while
+ * get_user_pages() only guarantees to update these in the struct page.
+ *
+ * This is important for some architectures where those bits also gate the
+ * access permission to the page because they are maintained in software.  On
+ * such architectures, gup() will not be enough to make a subsequent access
+ * succeed.
+ *
+ * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
+ *
+ * Returns 0 on success, -EFAULT if no suitable vma covers @address or the
+ * fault ended in SIGBUS/SIGSEGV, -ENOMEM or -EHWPOISON for the
+ * corresponding handle_mm_fault() errors.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long address, unsigned int fault_flags)
+{
+	struct vm_area_struct *vma;
+	vm_flags_t vm_flags;
+	int ret;
+
+	vma = find_extend_vma(mm, address);
+	if (!vma || address < vma->vm_start)
+		return -EFAULT;
+
+	/* the vma must permit the kind of access being fixed up */
+	vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
+	if (!(vm_flags & vma->vm_flags))
+		return -EFAULT;
+
+	ret = handle_mm_fault(mm, vma, address, fault_flags);
+	if (ret & VM_FAULT_ERROR) {
+		if (ret & VM_FAULT_OOM)
+			return -ENOMEM;
+		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+			return -EHWPOISON;
+		if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
+			return -EFAULT;
+		BUG();
+	}
+	if (tsk) {
+		if (ret & VM_FAULT_MAJOR)
+			tsk->maj_flt++;
+		else
+			tsk->min_flt++;
+	}
+	return 0;
+}
+/*
+ * Common implementation behind get_user_pages_locked() and
+ * __get_user_pages_unlocked().
+ *
+ * @pages, @write and @force are turned into FOLL_GET, FOLL_WRITE and
+ * FOLL_FORCE on top of @flags.  When @locked is non-NULL it must point to
+ * 1 on entry and @vmas must be NULL; if __get_user_pages() then comes back
+ * with *@locked == 0, mmap_sem is taken for reading again and the page
+ * that could not be faulted in is retried on its own with FOLL_TRIED.
+ *
+ * If @notify_drop is set and mmap_sem was dropped at any point but is held
+ * at the end, it is released and *@locked set to 0 before returning.
+ *
+ * Returns the number of pages done, or the result of the first failing
+ * __get_user_pages() call when no page was done.
+ */
+static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
+						struct mm_struct *mm,
+						unsigned long start,
+						unsigned long nr_pages,
+						int write, int force,
+						struct page **pages,
+						struct vm_area_struct **vmas,
+						int *locked, bool notify_drop,
+						unsigned int flags)
+{
+	long ret, pages_done;
+	bool lock_dropped;
+
+	if (locked) {
+		/* if VM_FAULT_RETRY can be returned, vmas become invalid */
+		BUG_ON(vmas);
+		/* check caller initialized locked */
+		BUG_ON(*locked != 1);
+	}
+
+	if (pages)
+		flags |= FOLL_GET;
+	if (write)
+		flags |= FOLL_WRITE;
+	if (force)
+		flags |= FOLL_FORCE;
+
+	pages_done = 0;
+	lock_dropped = false;
+	for (;;) {
+		ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
+				       vmas, locked);
+		if (!locked)
+			/* VM_FAULT_RETRY couldn't trigger, bypass */
+			return ret;
+
+		/* VM_FAULT_RETRY cannot return errors */
+		if (!*locked) {
+			BUG_ON(ret < 0);
+			BUG_ON(ret >= nr_pages);
+		}
+
+		if (!pages)
+			/* If it's a prefault don't insist harder */
+			return ret;
+
+		if (ret > 0) {
+			nr_pages -= ret;
+			pages_done += ret;
+			if (!nr_pages)
+				break;
+		}
+		if (*locked) {
+			/* VM_FAULT_RETRY didn't trigger */
+			if (!pages_done)
+				pages_done = ret;
+			break;
+		}
+		/* VM_FAULT_RETRY triggered, so seek to the faulting offset */
+		pages += ret;
+		start += ret << PAGE_SHIFT;
+
+		/*
+		 * Repeat on the address that fired VM_FAULT_RETRY
+		 * without FAULT_FLAG_ALLOW_RETRY but with
+		 * FAULT_FLAG_TRIED.
+		 */
+		*locked = 1;
+		lock_dropped = true;
+		down_read(&mm->mmap_sem);
+		ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
+				       pages, NULL, NULL);
+		if (ret != 1) {
+			BUG_ON(ret > 1);
+			if (!pages_done)
+				pages_done = ret;
+			break;
+		}
+		nr_pages--;
+		pages_done++;
+		if (!nr_pages)
+			break;
+		pages++;
+		start += PAGE_SIZE;
+	}
+	if (notify_drop && lock_dropped && *locked) {
+		/*
+		 * We must let the caller know we temporarily dropped the lock
+		 * and so the critical section protected by it was lost.
+		 */
+		up_read(&mm->mmap_sem);
+		*locked = 0;
+	}
+	return pages_done;
+}
+
+/*
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
/*
 * get_user_pages_locked() - get_user_pages() variant that reports through
 * @locked whether mmap_sem is still held on return.
 *
 * Forwards to __get_user_pages_locked() with no vmas array, with
 * notify_drop set to true and with FOLL_TOUCH as the base gup flags.
 * The caller is expected to check *@locked afterwards and only release
 * mmap_sem when it is still non-zero.
 */
long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   int write, int force, struct page **pages,
			   int *locked)
{
	return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
				       pages, NULL, locked, true, FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);
+ */ +__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags) +{ + long ret; + int locked = 1; + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, &locked, false, gup_flags); + if (locked) + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(__get_user_pages_unlocked); + +/* + * get_user_pages_unlocked() is suitable to replace the form: + * + * down_read(&mm->mmap_sem); + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * with: + * + * get_user_pages_unlocked(tsk, mm, ..., pages); + * + * It is functionally equivalent to get_user_pages_fast so + * get_user_pages_fast should be used instead, if the two parameters + * "tsk" and "mm" are respectively equal to current and current->mm, + * or if "force" shall be set to 1 (get_user_pages_fast misses the + * "force" parameter). + */ +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages, FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages_unlocked); + +/* + * get_user_pages() - pin user pages in memory + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force access even when user mapping is currently + * protected (but never forces write access to shared mapping). + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. 
+ * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. 
/*
 * get_user_pages() - pin user pages with mmap_sem held by the caller
 * throughout.
 *
 * Forwards to __get_user_pages_locked() with locked == NULL and
 * notify_drop == false, using FOLL_TOUCH as the base gup flags; @vmas is
 * passed through unchanged.
 */
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages, int write,
		int force, struct page **pages, struct vm_area_struct **vmas)
{
	return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
				       pages, vmas, NULL, false, FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
 * flags. VMAs must be already marked with the desired vm_flags, and
 * mmap_sem must not be held.
 *
 * @start:	page-aligned start of the range (checked with VM_BUG_ON).
 * @len:	page-aligned length of the range (checked with VM_BUG_ON).
 * @ignore_errors: when non-zero, a failure in one VMA is discarded and the
 *		walk continues with the next VMA.
 *
 * mmap_sem is taken for read by this function and released before return
 * if it is still held at that point.
 *
 * Returns 0, or the negative value returned by populate_vma_page_range()
 * for the VMA that stopped the walk.
 */
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
	struct mm_struct *mm = current->mm;
	unsigned long end, nstart, nend;
	struct vm_area_struct *vma = NULL;
	int locked = 0;
	long ret = 0;

	VM_BUG_ON(start & ~PAGE_MASK);
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;

	for (nstart = start; nstart < end; nstart = nend) {
		/*
		 * We want to fault in pages for [nstart; end) address range.
		 * Find first corresponding VMA.
		 *
		 * "locked" is passed by address to populate_vma_page_range()
		 * below; whenever it is found cleared here the semaphore is
		 * taken again and the VMA is looked up afresh instead of
		 * following the previously cached pointer.
		 */
		if (!locked) {
			locked = 1;
			down_read(&mm->mmap_sem);
			vma = find_vma(mm, nstart);
		} else if (nstart >= vma->vm_end)
			vma = vma->vm_next;
		if (!vma || vma->vm_start >= end)
			break;
		/*
		 * Set [nstart; nend) to intersection of desired address
		 * range with the first VMA. Also, skip undesirable VMA types.
		 */
		nend = min(end, vma->vm_end);
		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
			continue;
		if (nstart < vma->vm_start)
			nstart = vma->vm_start;
		/*
		 * Now fault in a range of pages. populate_vma_page_range()
		 * double checks the vma flags, so that it won't mlock pages
		 * if the vma was already munlocked.
		 */
		ret = populate_vma_page_range(vma, nstart, nend, &locked);
		if (ret < 0) {
			if (ignore_errors) {
				ret = 0;
				continue;	/* continue at next VMA */
			}
			break;
		}
		/*
		 * A non-negative result is used as a page count: resume right
		 * after the pages that were handled, which may be short of
		 * the end of this VMA.
		 */
		nend = nstart + ret * PAGE_SIZE;
		ret = 0;
	}
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;	/* 0 or negative error code */
}
/*
 * gup_pte_range() - lockless pin of the pages mapped by the ptes covering
 * [@addr, @end) below @pmd.
 *
 * Each pinned page is stored at @pages[*@nr] and *@nr is incremented.
 *
 * Returns 1 when every pte in the range was pinned, 0 as soon as one pte
 * cannot be handled here.  On a 0 return, entries already recorded by this
 * call are left in @pages and counted in *@nr; nothing is undone.
 */
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
			 int write, struct page **pages, int *nr)
{
	pte_t *ptep, *ptem;
	int ret = 0;

	/* ptem keeps the initially mapped pointer for the final pte_unmap(). */
	ptem = ptep = pte_offset_map(&pmd, addr);
	do {
		/*
		 * In the line below we are assuming that the pte can be read
		 * atomically. If this is not the case for your architecture,
		 * please wrap this in a helper function!
		 *
		 * for an example see gup_get_pte in arch/x86/mm/gup.c
		 */
		pte_t pte = READ_ONCE(*ptep);
		struct page *page;

		/*
		 * Similar to the PMD case below, NUMA hinting must take slow
		 * path using the pte_protnone check.
		 */
		if (!pte_present(pte) || pte_special(pte) ||
			pte_protnone(pte) || (write && !pte_write(pte)))
			goto pte_unmap;

		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
		page = pte_page(pte);

		/* Take the reference first ... */
		if (!page_cache_get_speculative(page))
			goto pte_unmap;

		/*
		 * ... then re-read the pte: if it changed since the snapshot
		 * above, drop the reference again and give up.
		 */
		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
			put_page(page);
			goto pte_unmap;
		}

		pages[*nr] = page;
		(*nr)++;

	} while (ptep++, addr += PAGE_SIZE, addr != end);

	ret = 1;

pte_unmap:
	pte_unmap(ptem);
	return ret;
}
+ */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} + +static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct page *head, *page, *tail; + int refs; + + if (write && !pud_write(orig)) + return 0; + + refs = 0; + head = pud_page(orig); + page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + tail = page; + do { + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pud_val(orig) != pud_val(*pudp))) { + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} + +static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, + unsigned long end, int write, + struct page **pages, int *nr) +{ + int refs; + struct page *head, *page, *tail; + + if (write && !pgd_write(orig)) + return 0; + + refs = 0; + head = pgd_page(orig); + page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); + tail = page; + do { + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = READ_ONCE(*pmdp); + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd) || 
pmd_trans_splitting(pmd)) + return 0; + + if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. + */ + if (pmd_protnone(pmd)) + return 0; + + if (!gup_huge_pmd(pmd, pmdp, addr, next, write, + pages, nr)) + return 0; + + } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) { + /* + * architecture have different format for hugetlbfs + * pmd format and THP pmd format + */ + if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, + PMD_SHIFT, next, write, pages, nr)) + return 0; + } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(&pgd, addr); + do { + pud_t pud = READ_ONCE(*pudp); + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (unlikely(pud_huge(pud))) { + if (!gup_huge_pud(pud, pudp, addr, next, write, + pages, nr)) + return 0; + } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { + if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, + PUD_SHIFT, next, write, pages, nr)) + return 0; + } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } while (pudp++, addr = next, addr != end); + + return 1; +} + +/* + * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to + * the regular GUP. It will only return non-negative values. 
+ */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next, flags; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + start, len))) + return 0; + + /* + * Disable interrupts. We use the nested form as we can already have + * interrupts disabled by get_futex_key. + * + * With interrupts disabled, we block page table pages from being + * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h + * for more details. + * + * We do not adopt an rcu_read_lock(.) here as we also want to + * block IPIs that come from THPs splitting. + */ + + local_irq_save(flags); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = READ_ONCE(*pgdp); + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + break; + if (unlikely(pgd_huge(pgd))) { + if (!gup_huge_pgd(pgd, pgdp, addr, next, write, + pages, &nr)) + break; + } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { + if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, + PGDIR_SHIFT, next, write, pages, &nr)) + break; + } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + local_irq_restore(flags); + + return nr; +} + +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number + * requested. 
If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. + */ +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + int nr, ret; + + start &= PAGE_MASK; + nr = __get_user_pages_fast(start, nr_pages, write, pages); + ret = nr; + + if (nr < nr_pages) { + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + ret = get_user_pages_unlocked(current, mm, start, + nr_pages - nr, write, 0, pages); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + } + + return ret; +} + +#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ diff --git a/kernel/mm/highmem.c b/kernel/mm/highmem.c new file mode 100644 index 000000000..16e8cf26d --- /dev/null +++ b/kernel/mm/highmem.c @@ -0,0 +1,490 @@ +/* + * High memory handling common code and variables. + * + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de + * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * 64-bit physical space. With current x86 CPUs this + * means up to 64 Gigabytes physical RAM. + * + * Rewrote high memory support to move the page cache into + * high memory. Implemented permanent (schedulable) kmaps + * based on Linus' idea. + * + * Copyright (C) 1999 Ingo Molnar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_PREEMPT_RT_FULL +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +DEFINE_PER_CPU(int, __kmap_atomic_idx); +#endif +#endif + +/* + * Virtual_count is not a pure "count". + * 0 means that it is not mapped, and has not been mapped + * since a TLB flush - it is usable. + * 1 means that there are no users, but it has been mapped + * since the last TLB flush - so we can't use it. 
/*
 * Tell whether the PKMAP index just handed out (pkmap_nr) means the scan has
 * wrapped around the end of the PKMAP region for the given color.  In this
 * default implementation the color is not looked at and a wrap is reported
 * exactly when the index is back at zero.
 */
static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color)
{
	return !pkmap_nr;
}
+ */ +static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) +{ + static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + + return &pkmap_map_wait; +} +#endif + +unsigned long totalhigh_pages __read_mostly; +EXPORT_SYMBOL(totalhigh_pages); + +#ifndef CONFIG_PREEMPT_RT_FULL +EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); +#endif + +unsigned int nr_free_highpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_online_pgdat(pgdat) { + pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], + NR_FREE_PAGES); + if (zone_movable_is_highmem()) + pages += zone_page_state( + &pgdat->node_zones[ZONE_MOVABLE], + NR_FREE_PAGES); + } + + return pages; +} + +static int pkmap_count[LAST_PKMAP]; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); + +pte_t * pkmap_page_table; + +/* + * Most architectures have no use for kmap_high_get(), so let's abstract + * the disabling of IRQ out of the locking in that case to save on a + * potential useless overhead. + */ +#ifdef ARCH_NEEDS_KMAP_HIGH_GET +#define lock_kmap() spin_lock_irq(&kmap_lock) +#define unlock_kmap() spin_unlock_irq(&kmap_lock) +#define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags) +#define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags) +#else +#define lock_kmap() spin_lock(&kmap_lock) +#define unlock_kmap() spin_unlock(&kmap_lock) +#define lock_kmap_any(flags) \ + do { spin_lock(&kmap_lock); (void)(flags); } while (0) +#define unlock_kmap_any(flags) \ + do { spin_unlock(&kmap_lock); (void)(flags); } while (0) +#endif + +struct page *kmap_to_page(void *vaddr) +{ + unsigned long addr = (unsigned long)vaddr; + + if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { + int i = PKMAP_NR(addr); + return pte_page(pkmap_page_table[i]); + } + + return virt_to_page(addr); +} +EXPORT_SYMBOL(kmap_to_page); + +static void flush_all_zero_pkmaps(void) +{ + int i; + int need_flush = 0; + + flush_cache_kmaps(); + + for (i = 0; i < LAST_PKMAP; i++) { + 
/*
 * map_new_virtual() - find a free PKMAP slot and map @page into it.
 *
 * The function drops the kmap lock with unlock_kmap() before sleeping and
 * re-takes it with lock_kmap() afterwards, so it has to be entered with that
 * lock held and returns with it held.
 *
 * Returns the virtual address of the mapping.  The slot's pkmap_count is
 * set to 1 here; the caller still has to account for its own use.  If the
 * page got an address from somebody else while this task slept, that
 * address is returned instead and no new slot is set up.
 */
static inline unsigned long map_new_virtual(struct page *page)
{
	unsigned long vaddr;
	int count;
	unsigned int last_pkmap_nr;
	unsigned int color = get_pkmap_color(page);

start:
	/* Upper bound on the number of slots examined before sleeping. */
	count = get_pkmap_entries_count(color);
	/* Find an empty entry */
	for (;;) {
		last_pkmap_nr = get_next_pkmap_nr(color);
		if (no_more_pkmaps(last_pkmap_nr, color)) {
			/*
			 * Wrapped around: reclaim slots nobody uses any more
			 * and start counting a fresh pass.
			 */
			flush_all_zero_pkmaps();
			count = get_pkmap_entries_count(color);
		}
		if (!pkmap_count[last_pkmap_nr])
			break;	/* Found a usable entry */
		if (--count)
			continue;

		/*
		 * Sleep for somebody else to unmap their entries
		 */
		{
			DECLARE_WAITQUEUE(wait, current);
			wait_queue_head_t *pkmap_map_wait =
				get_pkmap_wait_queue_head(color);

			/*
			 * Queue ourselves before dropping the lock, then
			 * schedule() with the lock released.
			 */
			__set_current_state(TASK_UNINTERRUPTIBLE);
			add_wait_queue(pkmap_map_wait, &wait);
			unlock_kmap();
			schedule();
			remove_wait_queue(pkmap_map_wait, &wait);
			lock_kmap();

			/* Somebody else might have mapped it while we slept */
			if (page_address(page))
				return (unsigned long)page_address(page);

			/* Re-start */
			goto start;
		}
	}
	/* Install the pte for the chosen slot and publish the address. */
	vaddr = PKMAP_ADDR(last_pkmap_nr);
	set_pte_at(&init_mm, vaddr,
		   &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));

	pkmap_count[last_pkmap_nr] = 1;
	set_page_address(page, (void *)vaddr);

	return vaddr;
}
+ */ +void kunmap_high(struct page *page) +{ + unsigned long vaddr; + unsigned long nr; + unsigned long flags; + int need_wakeup; + unsigned int color = get_pkmap_color(page); + wait_queue_head_t *pkmap_map_wait; + + lock_kmap_any(flags); + vaddr = (unsigned long)page_address(page); + BUG_ON(!vaddr); + nr = PKMAP_NR(vaddr); + + /* + * A count must never go down to zero + * without a TLB flush! + */ + need_wakeup = 0; + switch (--pkmap_count[nr]) { + case 0: + BUG(); + case 1: + /* + * Avoid an unnecessary wake_up() function call. + * The common case is pkmap_count[] == 1, but + * no waiters. + * The tasks queued in the wait-queue are guarded + * by both the lock in the wait-queue-head and by + * the kmap_lock. As the kmap_lock is held here, + * no need for the wait-queue-head's lock. Simply + * test if the queue is empty. + */ + pkmap_map_wait = get_pkmap_wait_queue_head(color); + need_wakeup = waitqueue_active(pkmap_map_wait); + } + unlock_kmap_any(flags); + + /* do wake-up, if needed, race-free outside of the spin lock */ + if (need_wakeup) + wake_up(pkmap_map_wait); +} + +EXPORT_SYMBOL(kunmap_high); +#endif + +#if defined(HASHED_PAGE_VIRTUAL) + +#define PA_HASH_ORDER 7 + +/* + * Describes one page->virtual association + */ +struct page_address_map { + struct page *page; + void *virtual; + struct list_head list; +}; + +static struct page_address_map page_address_maps[LAST_PKMAP]; + +/* + * Hash table bucket + */ +static struct page_address_slot { + struct list_head lh; /* List of page_address_maps */ + spinlock_t lock; /* Protect this bucket's list */ +} ____cacheline_aligned_in_smp page_address_htable[1<lock, flags); + if (!list_empty(&pas->lh)) { + struct page_address_map *pam; + + list_for_each_entry(pam, &pas->lh, list) { + if (pam->page == page) { + ret = pam->virtual; + goto done; + } + } + } +done: + spin_unlock_irqrestore(&pas->lock, flags); + return ret; +} + +EXPORT_SYMBOL(page_address); + +/** + * set_page_address - set a page's virtual address + * 
@page: &struct page to set + * @virtual: virtual address to use + */ +void set_page_address(struct page *page, void *virtual) +{ + unsigned long flags; + struct page_address_slot *pas; + struct page_address_map *pam; + + BUG_ON(!PageHighMem(page)); + + pas = page_slot(page); + if (virtual) { /* Add */ + pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)]; + pam->page = page; + pam->virtual = virtual; + + spin_lock_irqsave(&pas->lock, flags); + list_add_tail(&pam->list, &pas->lh); + spin_unlock_irqrestore(&pas->lock, flags); + } else { /* Remove */ + spin_lock_irqsave(&pas->lock, flags); + list_for_each_entry(pam, &pas->lh, list) { + if (pam->page == page) { + list_del(&pam->list); + spin_unlock_irqrestore(&pas->lock, flags); + goto done; + } + } + spin_unlock_irqrestore(&pas->lock, flags); + } +done: + return; +} + +void __init page_address_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { + INIT_LIST_HEAD(&page_address_htable[i].lh); + spin_lock_init(&page_address_htable[i].lock); + } +} + +#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ diff --git a/kernel/mm/huge_memory.c b/kernel/mm/huge_memory.c new file mode 100644 index 000000000..078832cf3 --- /dev/null +++ b/kernel/mm/huge_memory.c @@ -0,0 +1,3011 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "internal.h" + +/* + * By default transparent hugepage support is disabled in order that avoid + * to risk increase the memory footprint of applications without a guaranteed + * benefit. When transparent hugepage support is enabled, is for all mappings, + * and khugepaged scans all mappings. 
+ * Defrag is invoked by khugepaged hugepage allocations and by page faults + * for all hugepage allocations. + */ +unsigned long transparent_hugepage_flags __read_mostly = +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS + (1< min_free_kbytes) { + if (user_min_free_kbytes >= 0) + pr_info("raising min_free_kbytes from %d to %lu " + "to help transparent hugepage allocations\n", + min_free_kbytes, recommended_min); + + min_free_kbytes = recommended_min; + } + setup_per_zone_wmarks(); + return 0; +} + +static int start_stop_khugepaged(void) +{ + int err = 0; + if (khugepaged_enabled()) { + if (!khugepaged_thread) + khugepaged_thread = kthread_run(khugepaged, NULL, + "khugepaged"); + if (unlikely(IS_ERR(khugepaged_thread))) { + pr_err("khugepaged: kthread_run(khugepaged) failed\n"); + err = PTR_ERR(khugepaged_thread); + khugepaged_thread = NULL; + goto fail; + } + + if (!list_empty(&khugepaged_scan.mm_head)) + wake_up_interruptible(&khugepaged_wait); + + set_recommended_min_free_kbytes(); + } else if (khugepaged_thread) { + kthread_stop(khugepaged_thread); + khugepaged_thread = NULL; + } +fail: + return err; +} + +static atomic_t huge_zero_refcount; +struct page *huge_zero_page __read_mostly; + +static inline bool is_huge_zero_pmd(pmd_t pmd) +{ + return is_huge_zero_page(pmd_page(pmd)); +} + +static struct page *get_huge_zero_page(void) +{ + struct page *zero_page; +retry: + if (likely(atomic_inc_not_zero(&huge_zero_refcount))) + return READ_ONCE(huge_zero_page); + + zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + HPAGE_PMD_ORDER); + if (!zero_page) { + count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); + return NULL; + } + count_vm_event(THP_ZERO_PAGE_ALLOC); + preempt_disable(); + if (cmpxchg(&huge_zero_page, NULL, zero_page)) { + preempt_enable(); + __free_pages(zero_page, compound_order(zero_page)); + goto retry; + } + + /* We take additional reference here. 
It will be put back by shrinker */ + atomic_set(&huge_zero_refcount, 2); + preempt_enable(); + return READ_ONCE(huge_zero_page); +} + +static void put_huge_zero_page(void) +{ + /* + * Counter should never go to zero here. Only shrinker can put + * last reference. + */ + BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); +} + +static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + /* we can free zero page only if last reference remains */ + return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; +} + +static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { + struct page *zero_page = xchg(&huge_zero_page, NULL); + BUG_ON(zero_page == NULL); + __free_pages(zero_page, compound_order(zero_page)); + return HPAGE_PMD_NR; + } + + return 0; +} + +static struct shrinker huge_zero_page_shrinker = { + .count_objects = shrink_huge_zero_page_count, + .scan_objects = shrink_huge_zero_page_scan, + .seeks = DEFAULT_SEEKS, +}; + +#ifdef CONFIG_SYSFS + +static ssize_t double_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag req_madv) +{ + if (test_bit(enabled, &transparent_hugepage_flags)) { + VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); + return sprintf(buf, "[always] madvise never\n"); + } else if (test_bit(req_madv, &transparent_hugepage_flags)) + return sprintf(buf, "always [madvise] never\n"); + else + return sprintf(buf, "always madvise [never]\n"); +} +static ssize_t double_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag req_madv) +{ + if (!memcmp("always", buf, + min(sizeof("always")-1, count))) { + set_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, 
&transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(enabled, &transparent_hugepage_flags); + set_bit(req_madv, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return double_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); +} +static ssize_t enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + ret = double_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + + if (ret > 0) { + int err; + + mutex_lock(&khugepaged_mutex); + err = start_stop_khugepaged(); + mutex_unlock(&khugepaged_mutex); + + if (err) + ret = err; + } + + return ret; +} +static struct kobj_attribute enabled_attr = + __ATTR(enabled, 0644, enabled_show, enabled_store); + +static ssize_t single_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag flag) +{ + return sprintf(buf, "%d\n", + !!test_bit(flag, &transparent_hugepage_flags)); +} + +static ssize_t single_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag flag) +{ + unsigned long value; + int ret; + + ret = kstrtoul(buf, 10, &value); + if (ret < 0) + return ret; + if (value > 1) + return -EINVAL; + + if (value) + set_bit(flag, &transparent_hugepage_flags); + else + clear_bit(flag, &transparent_hugepage_flags); + + return count; +} + +/* + * Currently defrag only disables __GFP_NOWAIT for allocation. 
A blind + * __GFP_REPEAT is too aggressive, it's never worth swapping tons of + * memory just to allocate one more hugepage. + */ +static ssize_t defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return double_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); +} +static ssize_t defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return double_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); +} +static struct kobj_attribute defrag_attr = + __ATTR(defrag, 0644, defrag_show, defrag_store); + +static ssize_t use_zero_page_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); +} +static ssize_t use_zero_page_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); +} +static struct kobj_attribute use_zero_page_attr = + __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); +#ifdef CONFIG_DEBUG_VM +static ssize_t debug_cow_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); +} +static ssize_t debug_cow_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); +} +static struct kobj_attribute debug_cow_attr = + __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); +#endif /* CONFIG_DEBUG_VM */ + +static struct attribute *hugepage_attr[] = { + &enabled_attr.attr, + &defrag_attr.attr, + &use_zero_page_attr.attr, +#ifdef CONFIG_DEBUG_VM + &debug_cow_attr.attr, +#endif + NULL, +}; + +static 
struct attribute_group hugepage_attr_group = { + .attrs = hugepage_attr, +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = kstrtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute 
pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. 
+ */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = kstrtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + NULL, +}; + +static struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", +}; + +static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) +{ + int err; + + *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!*hugepage_kobj)) { + pr_err("failed to create transparent hugepage kobject\n"); + return -ENOMEM; + } + + err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); + if (err) { + pr_err("failed to register transparent hugepage group\n"); + goto delete_obj; + } + + err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); + if (err) { + pr_err("failed to register transparent hugepage group\n"); + goto remove_hp_group; + } + + return 0; + +remove_hp_group: + sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); +delete_obj: + kobject_put(*hugepage_kobj); + return err; +} + +static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ + sysfs_remove_group(hugepage_kobj, 
&khugepaged_attr_group); + sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); + kobject_put(hugepage_kobj); +} +#else +static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) +{ + return 0; +} + +static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ +} +#endif /* CONFIG_SYSFS */ + +static int __init hugepage_init(void) +{ + int err; + struct kobject *hugepage_kobj; + + if (!has_transparent_hugepage()) { + transparent_hugepage_flags = 0; + return -EINVAL; + } + + err = hugepage_init_sysfs(&hugepage_kobj); + if (err) + goto err_sysfs; + + err = khugepaged_slab_init(); + if (err) + goto err_slab; + + err = register_shrinker(&huge_zero_page_shrinker); + if (err) + goto err_hzp_shrinker; + + /* + * By default disable transparent hugepages on smaller systems, + * where the extra memory used could hurt more than TLB overhead + * is likely to save. The admin can still enable it through /sys. + */ + if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { + transparent_hugepage_flags = 0; + return 0; + } + + err = start_stop_khugepaged(); + if (err) + goto err_khugepaged; + + return 0; +err_khugepaged: + unregister_shrinker(&huge_zero_page_shrinker); +err_hzp_shrinker: + khugepaged_slab_exit(); +err_slab: + hugepage_exit_sysfs(hugepage_kobj); +err_sysfs: + return err; +} +subsys_initcall(hugepage_init); + +static int __init setup_transparent_hugepage(char *str) +{ + int ret = 0; + if (!str) + goto out; + if (!strcmp(str, "always")) { + set_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "madvise")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "never")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 
+ &transparent_hugepage_flags); + ret = 1; + } +out: + if (!ret) + pr_warn("transparent_hugepage= cannot parse, ignored\n"); + return ret; +} +__setup("transparent_hugepage=", setup_transparent_hugepage); + +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pmd = pmd_mkwrite(pmd); + return pmd; +} + +static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) +{ + pmd_t entry; + entry = mk_pmd(page, prot); + entry = pmd_mkhuge(entry); + return entry; +} + +static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + struct page *page, gfp_t gfp) +{ + struct mem_cgroup *memcg; + pgtable_t pgtable; + spinlock_t *ptl; + + VM_BUG_ON_PAGE(!PageCompound(page), page); + + if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) + return VM_FAULT_OOM; + + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) { + mem_cgroup_cancel_charge(page, memcg); + return VM_FAULT_OOM; + } + + clear_huge_page(page, haddr, HPAGE_PMD_NR); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. 
+ */ + __SetPageUptodate(page); + + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_none(*pmd))) { + spin_unlock(ptl); + mem_cgroup_cancel_charge(page, memcg); + put_page(page); + pte_free(mm, pgtable); + } else { + pmd_t entry; + entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + page_add_new_anon_rmap(page, vma, haddr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, haddr, pmd, entry); + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + atomic_long_inc(&mm->nr_ptes); + spin_unlock(ptl); + } + + return 0; +} + +static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) +{ + return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; +} + +/* Caller must hold page table lock. */ +static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, + struct page *zero_page) +{ + pmd_t entry; + if (!pmd_none(*pmd)) + return false; + entry = mk_pmd(zero_page, vma->vm_page_prot); + entry = pmd_mkhuge(entry); + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, haddr, pmd, entry); + atomic_long_inc(&mm->nr_ptes); + return true; +} + +int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags) +{ + gfp_t gfp; + struct page *page; + unsigned long haddr = address & HPAGE_PMD_MASK; + + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + return VM_FAULT_FALLBACK; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) + return VM_FAULT_OOM; + if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) && + transparent_hugepage_use_zero_page()) { + spinlock_t *ptl; + pgtable_t pgtable; + struct page *zero_page; + bool set; + pgtable = pte_alloc_one(mm, haddr); + if 
(unlikely(!pgtable)) + return VM_FAULT_OOM; + zero_page = get_huge_zero_page(); + if (unlikely(!zero_page)) { + pte_free(mm, pgtable); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + ptl = pmd_lock(mm, pmd); + set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, + zero_page); + spin_unlock(ptl); + if (!set) { + pte_free(mm, pgtable); + put_huge_zero_page(); + } + return 0; + } + gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + if (unlikely(!page)) { + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + + count_vm_event(THP_FAULT_ALLOC); + return 0; +} + +int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma) +{ + spinlock_t *dst_ptl, *src_ptl; + struct page *src_page; + pmd_t pmd; + pgtable_t pgtable; + int ret; + + ret = -ENOMEM; + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; + + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pmd = *src_pmd; + if (unlikely(!pmd_trans_huge(pmd))) { + pte_free(dst_mm, pgtable); + goto out_unlock; + } + /* + * When page table lock is held, the huge zero pmd should not be + * under splitting since we don't split the page itself, only pmd to + * a page table. + */ + if (is_huge_zero_pmd(pmd)) { + struct page *zero_page; + bool set; + /* + * get_huge_zero_page() will never allocate a new page here, + * since we already have a zero page to copy. It just takes a + * reference. 
+ */ + zero_page = get_huge_zero_page(); + set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + zero_page); + BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ + ret = 0; + goto out_unlock; + } + + if (unlikely(pmd_trans_splitting(pmd))) { + /* split huge page running from under us */ + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + pte_free(dst_mm, pgtable); + + wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ + goto out; + } + src_page = pmd_page(pmd); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + get_page(src_page); + page_dup_rmap(src_page); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + + pmdp_set_wrprotect(src_mm, addr, src_pmd); + pmd = pmd_mkold(pmd_wrprotect(pmd)); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + atomic_long_inc(&dst_mm->nr_ptes); + + ret = 0; +out_unlock: + spin_unlock(src_ptl); + spin_unlock(dst_ptl); +out: + return ret; +} + +void huge_pmd_set_accessed(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + int dirty) +{ + spinlock_t *ptl; + pmd_t entry; + unsigned long haddr; + + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto unlock; + + entry = pmd_mkyoung(orig_pmd); + haddr = address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) + update_mmu_cache_pmd(vma, address, pmd); + +unlock: + spin_unlock(ptl); +} + +/* + * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages + * during copy_user_huge_page()'s copy_page_rep(): in the case when + * the source page gets split and a tail freed before copy completes. + * Called under pmd_lock of checked pmd, so safe from splitting itself. 
+ */ +static void get_user_huge_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; + + atomic_add(HPAGE_PMD_NR, &page->_count); + while (++page < endpage) + get_huge_page_tail(page); + } else { + get_page(page); + } +} + +static void put_user_huge_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { + struct page *endpage = page + HPAGE_PMD_NR; + + while (page < endpage) + put_page(page++); + } else { + put_page(page); + } +} + +static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + struct page *page, + unsigned long haddr) +{ + struct mem_cgroup *memcg; + spinlock_t *ptl; + pgtable_t pgtable; + pmd_t _pmd; + int ret = 0, i; + struct page **pages; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, + GFP_KERNEL); + if (unlikely(!pages)) { + ret |= VM_FAULT_OOM; + goto out; + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | + __GFP_OTHER_NODE, + vma, address, page_to_nid(page)); + if (unlikely(!pages[i] || + mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, + &memcg))) { + if (pages[i]) + put_page(pages[i]); + while (--i >= 0) { + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); + mem_cgroup_cancel_charge(pages[i], memcg); + put_page(pages[i]); + } + kfree(pages); + ret |= VM_FAULT_OOM; + goto out; + } + set_page_private(pages[i], (unsigned long)memcg); + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + copy_user_highpage(pages[i], page + i, + haddr + PAGE_SIZE * i, vma); + __SetPageUptodate(pages[i]); + cond_resched(); + } + + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_same(*pmd, orig_pmd))) 
/*
 * do_huge_pmd_wp_page - handle a write fault on a write-protected huge pmd
 * @mm:       address space the fault occurred in
 * @vma:      VMA covering @address (must have an anon_vma)
 * @address:  faulting address
 * @pmd:      pmd entry being faulted on
 * @orig_pmd: value of *@pmd sampled by the caller
 *
 * If the mapped huge page has a mapcount of 1 the pmd is simply made
 * young/dirty/writable in place.  Otherwise a new huge page is allocated,
 * filled (zeroed when @orig_pmd is the huge zero pmd, copied from the old
 * page otherwise) and installed in place of the old mapping.  When no new
 * huge page can be obtained or charged, the function falls back to
 * splitting or to do_huge_pmd_wp_page_fallback().
 *
 * Returns a mask of VM_FAULT_* flags (0 if *@pmd changed under us).
 */
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
	spinlock_t *ptl;
	int ret = 0;
	struct page *page = NULL, *new_page;
	struct mem_cgroup *memcg;
	unsigned long haddr;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */
	gfp_t huge_gfp;			/* for allocation and charge */

	ptl = pmd_lockptr(mm, pmd);
	VM_BUG_ON_VMA(!vma->anon_vma, vma);
	haddr = address & HPAGE_PMD_MASK;
	/* Huge zero pmd: nothing to copy from, "page" stays NULL. */
	if (is_huge_zero_pmd(orig_pmd))
		goto alloc;
	spin_lock(ptl);
	if (unlikely(!pmd_same(*pmd, orig_pmd)))
		goto out_unlock;

	page = pmd_page(orig_pmd);
	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
	/* Single mapping: upgrade the existing pmd in place, no copy. */
	if (page_mapcount(page) == 1) {
		pmd_t entry;
		entry = pmd_mkyoung(orig_pmd);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		if (pmdp_set_access_flags(vma, haddr, pmd, entry,  1))
			update_mmu_cache_pmd(vma, address, pmd);
		ret |= VM_FAULT_WRITE;
		goto out_unlock;
	}
	/* Hold the old page across the unlocked allocation and copy. */
	get_user_huge_page(page);
	spin_unlock(ptl);
alloc:
	if (transparent_hugepage_enabled(vma) &&
	    !transparent_hugepage_debug_cow()) {
		huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
	} else
		new_page = NULL;

	/* No new huge page available: fall back to small pages. */
	if (unlikely(!new_page)) {
		if (!page) {
			split_huge_page_pmd(vma, address, pmd);
			ret |= VM_FAULT_FALLBACK;
		} else {
			ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
					pmd, orig_pmd, page, haddr);
			if (ret & VM_FAULT_OOM) {
				split_huge_page(page);
				ret |= VM_FAULT_FALLBACK;
			}
			put_user_huge_page(page);
		}
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}

	/* Charging the new page failed: release it and split instead. */
	if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
		put_page(new_page);
		if (page) {
			split_huge_page(page);
			put_user_huge_page(page);
		} else
			split_huge_page_pmd(vma, address, pmd);
		ret |= VM_FAULT_FALLBACK;
		count_vm_event(THP_FAULT_FALLBACK);
		goto out;
	}

	count_vm_event(THP_FAULT_ALLOC);

	/* Fill the new page while no lock is held. */
	if (!page)
		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
	else
		copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
	__SetPageUptodate(new_page);

	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	spin_lock(ptl);
	/* The copy is done; drop the reference(s) taken before unlocking. */
	if (page)
		put_user_huge_page(page);
	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
		/* pmd changed while unlocked: discard the new page. */
		spin_unlock(ptl);
		mem_cgroup_cancel_charge(new_page, memcg);
		put_page(new_page);
		goto out_mn;
	} else {
		pmd_t entry;
		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		pmdp_clear_flush_notify(vma, haddr, pmd);
		page_add_new_anon_rmap(new_page, vma, haddr);
		mem_cgroup_commit_charge(new_page, memcg, false);
		lru_cache_add_active_or_unevictable(new_page, vma);
		set_pmd_at(mm, haddr, pmd, entry);
		update_mmu_cache_pmd(vma, address, pmd);
		if (!page) {
			/* Replaced the huge zero page: account the new anon pages. */
			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
			put_huge_zero_page();
		} else {
			/* Replaced a real page: drop its rmap and reference. */
			VM_BUG_ON_PAGE(!PageHead(page), page);
			page_remove_rmap(page);
			put_page(page);
		}
		ret |= VM_FAULT_WRITE;
	}
	spin_unlock(ptl);
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return ret;
out_unlock:
	spin_unlock(ptl);
	return ret;
}
+ */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); + } + if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { + if (page->mapping && trylock_page(page)) { + lru_add_drain(); + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + VM_BUG_ON_PAGE(!PageCompound(page), page); + if (flags & FOLL_GET) + get_page_foll(page); + +out: + return page; +} + +/* NUMA hinting page fault entry point for trans huge pmds */ +int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) +{ + spinlock_t *ptl; + struct anon_vma *anon_vma = NULL; + struct page *page; + unsigned long haddr = addr & HPAGE_PMD_MASK; + int page_nid = -1, this_nid = numa_node_id(); + int target_nid, last_cpupid = -1; + bool page_locked; + bool migrated = false; + bool was_writable; + int flags = 0; + + /* A PROT_NONE fault should not end up here */ + BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + + /* + * If there are potential migrations, wait for completion and retry + * without disrupting NUMA hinting information. Do not relock and + * check_same as the page may no longer be mapped. 
+ */ + if (unlikely(pmd_trans_migrating(*pmdp))) { + page = pmd_page(*pmdp); + spin_unlock(ptl); + wait_on_page_locked(page); + goto out; + } + + page = pmd_page(pmd); + BUG_ON(is_huge_zero_page(page)); + page_nid = page_to_nid(page); + last_cpupid = page_cpupid_last(page); + count_vm_numa_event(NUMA_HINT_FAULTS); + if (page_nid == this_nid) { + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + flags |= TNF_FAULT_LOCAL; + } + + /* See similar comment in do_numa_page for explanation */ + if (!(vma->vm_flags & VM_WRITE)) + flags |= TNF_NO_GROUP; + + /* + * Acquire the page lock to serialise THP migrations but avoid dropping + * page_table_lock if at all possible + */ + page_locked = trylock_page(page); + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + /* If the page was locked, there are no parallel migrations */ + if (page_locked) + goto clear_pmdnuma; + } + + /* Migration could have started since the pmd_trans_migrating check */ + if (!page_locked) { + spin_unlock(ptl); + wait_on_page_locked(page); + page_nid = -1; /* -1 suppresses the task_numa_fault() call at out: */ + goto out; + } + + /* + * Page is misplaced. Page lock serialises migrations. Acquire anon_vma + * to serialise splits. + */ + get_page(page); + spin_unlock(ptl); + anon_vma = page_lock_anon_vma_read(page); + + /* Confirm the PMD did not change while page_table_lock was released */ + spin_lock(ptl); + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + put_page(page); + page_nid = -1; + goto out_unlock; + } + + /* Bail if we fail to protect against THP splits for any reason */ + if (unlikely(!anon_vma)) { + put_page(page); + page_nid = -1; + goto clear_pmdnuma; + } + + /* + * Migrate the THP to the requested node, returns with page unlocked + * and access rights restored. 
+ */ + spin_unlock(ptl); + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, page, target_nid); + if (migrated) { + flags |= TNF_MIGRATED; + page_nid = target_nid; + } else + flags |= TNF_MIGRATE_FAIL; + + goto out; +clear_pmdnuma: /* rebuild the pmd from vma->vm_page_prot, marked young, keeping write access if it had it */ + BUG_ON(!PageLocked(page)); + was_writable = pmd_write(pmd); + pmd = pmd_modify(pmd, vma->vm_page_prot); + pmd = pmd_mkyoung(pmd); + if (was_writable) + pmd = pmd_mkwrite(pmd); + set_pmd_at(mm, haddr, pmdp, pmd); + update_mmu_cache_pmd(vma, addr, pmdp); + unlock_page(page); +out_unlock: + spin_unlock(ptl); + +out: + if (anon_vma) + page_unlock_anon_vma_read(anon_vma); + + if (page_nid != -1) + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); + + return 0; +} +/* + * Clear the huge pmd at @addr on behalf of the mmu_gather @tlb and free the + * page table that was deposited for it. + * + * Returns 1 if __pmd_trans_huge_lock() reported a stable huge pmd and it + * was cleared, 0 otherwise. + */ +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr) +{ + spinlock_t *ptl; + int ret = 0; + + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + struct page *page; + pgtable_t pgtable; + pmd_t orig_pmd; + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations. 
+ */ + orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); + if (is_huge_zero_pmd(orig_pmd)) { + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); + put_huge_zero_page(); + } else { + page = pmd_page(orig_pmd); + page_remove_rmap(page); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON_PAGE(!PageHead(page), page); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); + tlb_remove_page(tlb, page); + } + pte_free(tlb->mm, pgtable); + ret = 1; + } + return ret; +} +/* + * Move the huge pmd that maps @old_addr in @vma to @new_pmd / @new_addr. + * + * Returns 0 without touching anything if either address is not huge page + * aligned, fewer than HPAGE_PMD_SIZE bytes lie between @old_addr and + * @old_end, @new_vma has VM_NOHUGEPAGE set, or @new_pmd is not empty. + * Otherwise returns the result of __pmd_trans_huge_lock() on @old_pmd; + * the entry is moved only when that result is 1. + */ +int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, + unsigned long old_addr, + unsigned long new_addr, unsigned long old_end, + pmd_t *old_pmd, pmd_t *new_pmd) +{ + spinlock_t *old_ptl, *new_ptl; + int ret = 0; + pmd_t pmd; + + struct mm_struct *mm = vma->vm_mm; + + if ((old_addr & ~HPAGE_PMD_MASK) || + (new_addr & ~HPAGE_PMD_MASK) || + old_end - old_addr < HPAGE_PMD_SIZE || + (new_vma->vm_flags & VM_NOHUGEPAGE)) + goto out; + + /* + * The destination pmd shouldn't be established, free_pgtables() + * should have released it. + */ + if (WARN_ON(!pmd_none(*new_pmd))) { + VM_BUG_ON(pmd_trans_huge(*new_pmd)); + goto out; + } + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. 
+ */ + ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); + if (ret == 1) { + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); + VM_BUG_ON(!pmd_none(*new_pmd)); + + if (pmd_move_must_withdraw(new_ptl, old_ptl)) { + pgtable_t pgtable; + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); + } + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + } +out: + return ret; +} + +/* + * Apply @newprot to the huge pmd at @addr. When @prot_numa is set, a pmd + * that maps the huge zero page or is already protnone is left untouched, + * and a pmd that was writable stays writable. + * + * Returns + * - 0 if PMD could not be locked + * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary + * - HPAGE_PMD_NR if protections changed and TLB flush necessary + */ +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot, int prot_numa) +{ + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + int ret = 0; + + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + pmd_t entry; + bool preserve_write = prot_numa && pmd_write(*pmd); /* only NUMA hinting updates keep an existing write bit */ + ret = 1; + + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) { + spin_unlock(ptl); + return ret; + } + + if (!prot_numa || !pmd_protnone(*pmd)) { + entry = pmdp_get_and_clear_notify(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mkwrite(entry); + ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); + BUG_ON(!preserve_write && pmd_write(entry)); + } + spin_unlock(ptl); + } + + return ret; +} + +/* + * Returns 1 if a given pmd maps a stable (not under splitting) thp. + * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. 
+ * + * Note that if it returns 1, this routine returns without unlocking page + * table locks. So callers must unlock them. + */ +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) +{ + *ptl = pmd_lock(vma->vm_mm, pmd); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(*ptl); + wait_split_huge_page(vma->anon_vma, pmd); + return -1; + } else { + /* Thp mapped by 'pmd' is stable, so we can + * handle it as it is. */ + return 1; + } + } + spin_unlock(*ptl); + return 0; +} + +/* + * Check whether @page is mapped by a huge pmd at @address in the virtual + * address space of @mm. @address must be huge page aligned, otherwise the + * answer is always "no". + * + * If it is mapped, return a pointer to that pmd with the page table lock + * held; the lock is passed back to the caller via @ptl. + * If it is not, return NULL without holding the page table lock. + * + * With PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG a pmd that is already + * marked splitting is treated as not mapped. + */ +pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag, + spinlock_t **ptl) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + if (address & ~HPAGE_PMD_MASK) + return NULL; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return NULL; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return NULL; + pmd = pmd_offset(pud, address); + + *ptl = pmd_lock(mm, pmd); + if (!pmd_present(*pmd)) + goto unlock; + if (pmd_page(*pmd) != page) + goto unlock; + /* + * split_vma() may create temporary aliased mappings. There is + * no risk as long as all huge pmd are found and have their + * splitting bit set before __split_huge_page_refcount + * runs. Finding the same huge pmd more than once during the + * same rmap walk is not a problem. 
+ */ + if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)) + goto unlock; + if (pmd_trans_huge(*pmd)) { + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && + !pmd_trans_splitting(*pmd)); + return pmd; + } +unlock: + spin_unlock(*ptl); + return NULL; +} +/* + * First pass of a split: find the huge pmd that maps @page at @address in + * @vma, provided it is not already splitting, and run + * pmdp_splitting_flush() on it. The whole huge page range is bracketed by + * mmu notifier invalidate_range_start/end calls. + * + * Returns 1 if such a pmd was found, 0 otherwise. + */ +static int __split_huge_page_splitting(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + pmd_t *pmd; + int ret = 0; + /* For mmu_notifiers */ + const unsigned long mmun_start = address; + const unsigned long mmun_end = address + HPAGE_PMD_SIZE; + + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); + if (pmd) { + /* + * We can't temporarily set the pmd to null in order + * to split it, the pmd must remain marked huge at all + * times or the VM won't take the pmd_trans_huge paths + * and it won't wait on the anon_vma->root->rwsem to + * serialize against split_huge_page*. 
+ */ + pmdp_splitting_flush(vma, address, pmd); + + ret = 1; + spin_unlock(ptl); + } + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + return ret; +} +/* + * Second pass of a split: turn the tail pages of the compound page @page + * into independent pages. Walking the tails from last to first, each one + * gets its _count raised, inherits flags, _mapcount, mapping, index and + * cpupid from the head, and is passed to lru_add_page_tail() together + * with @list. The loop runs under zone->lru_lock (irqs off) and the + * compound lock; the extra reference on each tail is dropped after both + * have been released. + */ +static void __split_huge_page_refcount(struct page *page, + struct list_head *list) +{ + int i; + struct zone *zone = page_zone(page); + struct lruvec *lruvec; + int tail_count = 0; + + /* prevent PageLRU from going away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, zone); + + compound_lock(page); + /* complete the memcg work before adding pages to the LRU */ + mem_cgroup_split_huge_fixup(page); + + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { + struct page *page_tail = page + i; + + /* tail_page->_mapcount cannot change */ + BUG_ON(page_mapcount(page_tail) < 0); + tail_count += page_mapcount(page_tail); + /* check for overflow */ + BUG_ON(tail_count < 0); + BUG_ON(atomic_read(&page_tail->_count) != 0); + /* + * tail_page->_count is zero and not changing from + * under us. But get_page_unless_zero() may be running + * from under us on the tail_page. If we used + * atomic_set() below instead of atomic_add(), we + * would then run atomic_set() concurrently with + * get_page_unless_zero(), and atomic_set() is + * implemented in C not using locked ops. spin_unlock + * on x86 sometime uses locked ops because of PPro + * errata 66, 92, so unless somebody can guarantee + * atomic_set() here would be safe on all archs (and + * not only on x86), it's safer to use atomic_add(). + */ + atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, + &page_tail->_count); + + /* after clearing PageTail the gup refcount can be released */ + smp_mb__after_atomic(); + + /* + * retain hwpoison flag of the poisoned tail page: + * fix for the unsuitable process killed on Guest Machine(KVM) + * by the memory-failure. 
+ */ + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; /* parses as (~PAGE_FLAGS_CHECK_AT_PREP) | __PG_HWPOISON, so the hwpoison bit is kept */ + page_tail->flags |= (page->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate) | + (1L << PG_active) | + (1L << PG_unevictable))); + page_tail->flags |= (1L << PG_dirty); /* every tail is marked dirty regardless of the head's flag */ + + /* clear PageTail before overwriting first_page */ + smp_wmb(); + + /* + * __split_huge_page_splitting() already set the + * splitting bit in all pmd that could map this + * hugepage, that will ensure no CPU can alter the + * mapcount on the head page. The mapcount is only + * accounted in the head page and it has to be + * transferred to all tail pages in the below code. So + * for this code to be safe, during the split the mapcount + * can't change. But that doesn't mean userland can't + * keep changing and reading the page contents while + * we transfer the mapcount, so the pmd splitting + * status is achieved setting a reserved bit in the + * pmd, not by clearing the present bit. + */ + page_tail->_mapcount = page->_mapcount; + + BUG_ON(page_tail->mapping); + page_tail->mapping = page->mapping; + + page_tail->index = page->index + i; + page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); + + BUG_ON(!PageAnon(page_tail)); + BUG_ON(!PageUptodate(page_tail)); + BUG_ON(!PageDirty(page_tail)); + BUG_ON(!PageSwapBacked(page_tail)); + + lru_add_page_tail(page, page_tail, lruvec, list); + } + atomic_sub(tail_count, &page->_count); + BUG_ON(atomic_read(&page->_count) <= 0); + + __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); + + ClearPageCompound(page); + compound_unlock(page); + spin_unlock_irq(&zone->lru_lock); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + BUG_ON(page_count(page_tail) <= 0); + /* + * Tail pages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. 
And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(page_tail); + } + + /* + * Only the head page (which has now become a regular page) is + * required to be pinned by the caller. + */ + BUG_ON(page_count(page) <= 0); +} +/* + * Third pass of a split: find the huge pmd, already marked splitting, + * that maps @page at @address in @vma, fill the page table withdrawn for + * it with HPAGE_PMD_NR ptes (one per subpage, write-protected or old when + * the pmd was), and install that page table in place of the huge pmd. + * + * Returns 1 if such a pmd was found and replaced, 0 otherwise. + */ +static int __split_huge_page_map(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + pmd_t *pmd, _pmd; + int ret = 0, i; + pgtable_t pgtable; + unsigned long haddr; + + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); + if (pmd) { + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pmd_populate(mm, &_pmd, pgtable); /* _pmd is a local scratch entry; the real pmd is replaced only at the end */ + if (pmd_write(*pmd)) + BUG_ON(page_mapcount(page) != 1); + + haddr = address; + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + BUG_ON(PageCompound(page+i)); + /* + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. + */ + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (!pmd_write(*pmd)) + entry = pte_wrprotect(entry); + if (!pmd_young(*pmd)) + entry = pte_mkold(entry); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + + smp_wmb(); /* make pte visible before pmd */ + /* + * Up to this point the pmd is present and huge and + * userland has the whole access to the hugepage + * during the split (which happens in place). If we + * overwrite the pmd with the not-huge version + * pointing to the pte here (which of course we could + * if all CPUs were bug free), userland could trigger + * a small page size TLB miss on the small sized TLB + * while the hugepage TLB entry is still established + * in the huge TLB. Some CPU doesn't like that. 
See + * http://support.amd.com/us/Processor_TechDocs/41322.pdf, + * Erratum 383 on page 93. Intel should be safe but is + * also warns that it's only safe if the permission + * and cache attributes of the two entries loaded in + * the two TLB is identical (which should be the case + * here). But it is generally safer to never allow + * small and huge TLB entries for the same virtual + * address to be loaded simultaneously. So instead of + * doing "pmd_populate(); flush_tlb_range();" we first + * mark the current pmd notpresent (atomically because + * here the pmd_trans_huge and pmd_trans_splitting + * must remain set at all times on the pmd until the + * split is complete for this pmd), then we flush the + * SMP TLB and finally we write the non-huge version + * of the pmd entry with pmd_populate. + */ + pmdp_invalidate(vma, address, pmd); + pmd_populate(mm, pmd, pgtable); + ret = 1; + spin_unlock(ptl); + } + + return ret; +} + +/* + * Split the huge page @page in three passes: + * 1. __split_huge_page_splitting() on every vma found for the page in + * @anon_vma's interval tree; + * 2. __split_huge_page_refcount() on the page itself; + * 3. __split_huge_page_map() on every vma again. + * Pass 1 must find exactly page_mapcount(page) pmds and pass 3 must find + * the same number; any mismatch is reported with pr_err() and is a BUG(). + * + * Must be called with anon_vma->root->rwsem held. + */ +static void __split_huge_page(struct page *page, + struct anon_vma *anon_vma, + struct list_head *list) +{ + int mapcount, mapcount2; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct anon_vma_chain *avc; + + BUG_ON(!PageHead(page)); + BUG_ON(PageTail(page)); + + mapcount = 0; + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + mapcount += __split_huge_page_splitting(page, vma, addr); + } + /* + * It is critical that new vmas are added to the tail of the + * anon_vma list. 
This guarantes that if copy_huge_pmd() runs + * and establishes a child pmd before + * __split_huge_page_splitting() freezes the parent pmd (so if + * we fail to prevent copy_huge_pmd() from running until the + * whole __split_huge_page() is complete), we will still see + * the newly established pmd of the child later during the + * walk, to be able to set it as pmd_trans_splitting too. + */ + if (mapcount != page_mapcount(page)) { + pr_err("mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); + BUG(); + } + + __split_huge_page_refcount(page, list); + + mapcount2 = 0; + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + mapcount2 += __split_huge_page_map(page, vma, addr); + } + if (mapcount != mapcount2) { + pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); + BUG(); + } +} + +/* + * Split a hugepage into normal pages. This doesn't change the position of + * the head page. If @list is null, tail pages will be added to the LRU + * list, otherwise to @list. Both head page and tail pages will inherit + * mapping, flags, and so on from the hugepage. + * + * Returns 0 if the hugepage was split, or if @page turned out to be no + * longer compound once the anon_vma was locked. Returns 1 only if no + * anon_vma reference could be obtained for @page. + */ +int split_huge_page_to_list(struct page *page, struct list_head *list) +{ + struct anon_vma *anon_vma; + int ret = 1; + + BUG_ON(is_huge_zero_page(page)); + BUG_ON(!PageAnon(page)); + + /* + * The caller does not necessarily hold an mmap_sem that would prevent + * the anon_vma disappearing so we first we take a reference to it + * and then lock the anon_vma for write. This is similar to + * page_lock_anon_vma_read except the write lock is taken to serialise + * against parallel split or collapse operations. 
+ */ + anon_vma = page_get_anon_vma(page); + if (!anon_vma) + goto out; + anon_vma_lock_write(anon_vma); + + ret = 0; + if (!PageCompound(page)) + goto out_unlock; + + BUG_ON(!PageSwapBacked(page)); + __split_huge_page(page, anon_vma, list); + count_vm_event(THP_SPLIT); + + BUG_ON(PageCompound(page)); +out_unlock: + anon_vma_unlock_write(anon_vma); + put_anon_vma(anon_vma); +out: + return ret; +} +/* vma flags for which both madvise hints below are refused with -EINVAL */ +#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) +/* + * Apply MADV_HUGEPAGE or MADV_NOHUGEPAGE to *@vm_flags; any other @advice + * leaves the flags untouched. + * + * Returns 0 on success, -EINVAL if the requested hint is already set or + * any VM_NO_THP flag is set, and -ENOMEM if khugepaged_enter_vma_merge() + * fails. On s390 with pgste, MADV_HUGEPAGE is silently ignored. + */ +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) +{ + switch (advice) { + case MADV_HUGEPAGE: +#ifdef CONFIG_S390 + /* + * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 + * can't handle this properly after s390_enable_sie, so we simply + * ignore the madvise to prevent qemu from causing a SIGSEGV. + */ + if (mm_has_pgste(vma->vm_mm)) + return 0; +#endif + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) + return -EINVAL; + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + /* + * If the vma becomes eligible for khugepaged to scan, + * register it here without waiting for a page fault that + * may not happen any time soon. + */ + if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags))) + return -ENOMEM; + break; + case MADV_NOHUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) + return -EINVAL; + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. 
+ */ + break; + } + + return 0; +} +/* Create the slab cache that backs struct mm_slot; returns 0 or -ENOMEM. */ +static int __init khugepaged_slab_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + return 0; +} +/* Destroy the mm_slot slab cache. */ +static void __init khugepaged_slab_exit(void) +{ + kmem_cache_destroy(mm_slot_cache); +} +/* Allocate a zeroed mm_slot from the cache; NULL if the cache was never created. */ +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} +/* Give @mm_slot back to the mm_slot slab cache. */ +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} +/* Look up the mm_slot whose ->mm is @mm in mm_slots_hash; NULL if there is none. */ +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + + hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) + if (mm == mm_slot->mm) + return mm_slot; + + return NULL; +} +/* + * Bind @mm_slot to @mm and add it to mm_slots_hash, keyed by the mm pointer. + * NOTE(review): the key is cast to (long) here but to (unsigned long) in + * get_mm_slot(); confirm both forms always select the same bucket. + */ +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + mm_slot->mm = mm; + hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); +} +/* Nonzero once @mm->mm_users has dropped to zero. */ +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} +/* + * Register @mm with khugepaged: hash a new mm_slot for it, queue the slot + * on khugepaged_scan.mm_head, and wake khugepaged_wait if the queue was + * empty before. + * + * Returns -ENOMEM if no mm_slot could be allocated, 0 otherwise (including + * when MMF_VM_HUGEPAGE was already set on @mm, in which case the new slot + * is freed again and nothing else is done). + */ +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. 
+ */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); /* presumably the reference that mmdrop() in __khugepaged_exit() gives back -- confirm */ + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} +/* + * Call khugepaged_enter() for @vma if it already has an anon_vma, has no + * vm_ops, and contains at least one huge page aligned, huge page sized + * range. Returns what khugepaged_enter() returns, or 0 if the vma does + * not qualify. @vm_flags must not contain any VM_NO_THP flag. + */ +int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. + */ + return 0; + if (vma->vm_ops) + /* khugepaged not yet working on file or special mappings */ + return 0; + VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; /* vm_start rounded up to a huge page boundary */ + hend = vma->vm_end & HPAGE_PMD_MASK; /* vm_end rounded down to a huge page boundary */ + if (hstart < hend) + return khugepaged_enter(vma, vm_flags); + return 0; +} +/* + * Unregister @mm from khugepaged. If its mm_slot is not the one + * khugepaged_scan currently points at, the slot is unhashed, unlinked and + * freed here and mmdrop() is called on @mm. If it is the current one, + * take and release mmap_sem for write instead, as explained below. + */ +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hash_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + spin_unlock(&khugepaged_mm_lock); + + if (free) { + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. 
+ */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } +} +/* + * Undo what __collapse_huge_page_isolate() did to one page: decrement + * NR_ISOLATED_ANON, unlock the page and put it back on the LRU. + */ +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} +/* + * Call release_pte_page() for every pte in [@pte, @_pte), walking + * backwards from @_pte - 1. None ptes and zero-pfn ptes are skipped. + */ +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) + release_pte_page(pte_page(pteval)); + } +} +/* + * Lock and isolate from the LRU every page mapped by the HPAGE_PMD_NR ptes + * starting at @pte (@address is the virtual address of the first one). + * At most khugepaged_max_ptes_none ptes may be none or map the zero page. + * + * Returns 1 if every pte passed the checks and, among them, at least one + * was writable and at least one was referenced. Otherwise releases the + * pages handled so far via release_pte_pages() and returns 0. + */ +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page; + pte_t *_pte; + int none_or_zero = 0; + bool referenced = false, writable = false; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (++none_or_zero <= khugepaged_max_ptes_none) + continue; + else + goto out; + } + if (!pte_present(pteval)) + goto out; + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) + goto out; + + VM_BUG_ON_PAGE(PageCompound(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) + goto out; + + /* + * cannot use mapcount: can't collapse if there's a gup pin. + * The page must only be referenced by the scanned process + * and page swap cache. + */ + if (page_count(page) != 1 + !!PageSwapCache(page)) { + unlock_page(page); + goto out; + } + if (pte_write(pteval)) { + writable = true; + } else { + if (PageSwapCache(page) && !reuse_swap_page(page)) { + unlock_page(page); + goto out; + } + /* + * Page is not in the swap cache. It can be collapsed + * into a THP. 
+ */ + } + + /* + * Isolate the page to avoid collapsing a hugepage + * currently in use by the VM. + */ + if (isolate_lru_page(page)) { + unlock_page(page); + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + + /* If there is no mapped pte young don't collapse the page */ + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = true; + } + if (likely(referenced && writable)) + return 1; +out: + release_pte_pages(pte, _pte); /* _pte marks how far the loop got; everything before it is released */ + return 0; +} +/* + * Fill the subpages of the new huge page, starting at @page, from the + * HPAGE_PMD_NR ptes starting at @pte. A none or zero-pfn pte yields a + * cleared subpage (and one more MM_ANONPAGES); any other pte has its page + * copied, after which the source page is released, its pte cleared and its + * rmap removed under @ptl, and the page freed. + */ +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + if (is_zero_pfn(pte_pfn(pteval))) { + /* + * ptl mostly unnecessary. + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + } + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. 
+ */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} +/* + * Wait on khugepaged_wait with a condition that is constant false, so the + * wait lasts up to khugepaged_alloc_sleep_millisecs. + */ +static void khugepaged_alloc_sleep(void) +{ + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); +} +/* per-node hit counters for the range being scanned; zeroed by khugepaged_scan_pmd() */ +static int khugepaged_node_load[MAX_NUMNODES]; +/* + * Returns true when zone_reclaim_mode is enabled, node @nid has no hits + * recorded yet, and some node that already has hits is farther away from + * @nid than RECLAIM_DISTANCE. Returns false in every other case. + */ +static bool khugepaged_scan_abort(int nid) +{ + int i; + + /* + * If zone_reclaim_mode is disabled, then no extra effort is made to + * allocate memory locally. + */ + if (!zone_reclaim_mode) + return false; + + /* If there is a count for this node already, it must be acceptable */ + if (khugepaged_node_load[nid]) + return false; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!khugepaged_node_load[i]) + continue; + if (node_distance(nid, i) > RECLAIM_DISTANCE) + return true; + } + return false; +} + +#ifdef CONFIG_NUMA +static int khugepaged_find_target_node(void) +{ + static int last_khugepaged_target_node = NUMA_NO_NODE; + int nid, target_node = 0, max_value = 0; + + /* find the first node with the highest count in khugepaged_node_load[] */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + if (khugepaged_node_load[nid] > max_value) { + max_value = khugepaged_node_load[nid]; + target_node = nid; + } + + /* + * Balance between nodes with the same hit count: if the pick did not + * move past the previous target, take the first node after the + * previous target that has the same count, if there is one. + */ + if (target_node <= last_khugepaged_target_node) + for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == khugepaged_node_load[nid]) { + target_node = nid; + break; + } + + last_khugepaged_target_node = target_node; + return target_node; +} +/* + * NUMA variant: nothing is allocated ahead of time. If *@hpage holds an + * ERR_PTR from a failed allocation, sleep once and clear *@wait, or + * return false if *@wait was already false. If *@hpage holds a leftover + * page, release it. *@hpage is NULL whenever true is returned. + */ +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (IS_ERR(*hpage)) { + if (!*wait) + return false; + + *wait = false; + *hpage = NULL; + khugepaged_alloc_sleep(); + } else if (*hpage) { + put_page(*hpage); + *hpage = NULL; + } + + return true; +} +/* + * NUMA variant: drop the mmap_sem read lock, then allocate the huge page + * on @node. On failure *@hpage is set to ERR_PTR(-ENOMEM) and NULL is + * returned; on success the new page is stored in *@hpage and returned. + */ +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct 
*mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ + VM_BUG_ON_PAGE(*hpage, *hpage); + + /* + * Before allocating the hugepage, release the mmap_sem read lock. + * The allocation can take potentially a long time if it involves + * sync compaction, and we do not need to hold the mmap_sem during + * that. We will recheck the vma after taking it again in write mode. + */ + up_read(&mm->mmap_sem); + + *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER); + if (unlikely(!*hpage)) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + *hpage = ERR_PTR(-ENOMEM); /* recognised by the IS_ERR() test in khugepaged_prealloc_page() */ + return NULL; + } + + count_vm_event(THP_COLLAPSE_ALLOC); + return *hpage; +} +#else /* !CONFIG_NUMA */ +static int khugepaged_find_target_node(void) +{ + return 0; +} +/* Allocate one HPAGE_PMD_ORDER page using the gfp mask selected by @defrag. */ +static inline struct page *alloc_hugepage(int defrag) +{ + return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), + HPAGE_PMD_ORDER); +} +/* + * Allocate a huge page, retrying for as long as khugepaged stays enabled. + * The first failure sleeps once and clears *@wait; a failure with *@wait + * already false returns NULL straight away. + */ +static struct page *khugepaged_alloc_hugepage(bool *wait) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (!*wait) + return NULL; + + *wait = false; + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); + } while (unlikely(!hpage) && likely(khugepaged_enabled())); + + return hpage; +} +/* + * !NUMA variant: make sure *@hpage holds a huge page, allocating one if it + * is NULL. Returns false if no page could be obtained. + */ +static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +{ + if (!*hpage) + *hpage = khugepaged_alloc_hugepage(wait); + + if (unlikely(!*hpage)) + return false; + + return true; +} +/* + * !NUMA variant: the page must already be in *@hpage, so only drop the + * mmap_sem read lock and hand the page back. + */ +static struct page * +khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + int node) +{ + up_read(&mm->mmap_sem); + VM_BUG_ON(!*hpage); + + return *hpage; +} +#endif /* CONFIG_NUMA */ +/* + * Returns true if khugepaged may work on @vma: huge pages are wanted + * (VM_HUGEPAGE set or khugepaged_always()), VM_NOHUGEPAGE is clear, the + * vma has an anon_vma and no vm_ops, and it is not a temporary stack. + */ +static bool hugepage_vma_check(struct vm_area_struct *vma) +{ + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + return false; + + if (!vma->anon_vma || vma->vm_ops) + return false; + if (is_vma_temporary_stack(vma)) 
+ return false; + VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); + return true; +} +/* + * Try to replace the HPAGE_PMD_NR small pages mapped at the huge page + * aligned @address in @mm with a single huge page allocated on @node. + * + * The mmap_sem read lock is released by khugepaged_alloc_page(), so the + * caller is expected to hold it on entry; mmap_sem is not held on return. + * @vma is looked up again and re-validated once mmap_sem has been taken + * for write. On success *@hpage is set to NULL. + */ +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma, + int node) +{ + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *pmd_ptl, *pte_ptl; + int isolated; + unsigned long hstart, hend; + struct mem_cgroup *memcg; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + gfp_t gfp; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + /* Only allocate from the target node */ + gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | + __GFP_THISNODE; + + /* khugepaged_alloc_page() releases the mmap_sem read lock. */ + new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node); + if (!new_page) + return; + + if (unlikely(mem_cgroup_try_charge(new_page, mm, + gfp, &memcg))) + return; + + /* + * Prevent all access to pagetables with the exception of + * gup_fast later handled by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. + */ + down_write(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + vma = find_vma(mm, address); + if (!vma) + goto out; + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + goto out; + if (!hugepage_vma_check(vma)) + goto out; + pmd = mm_find_pmd(mm, address); + if (!pmd) + goto out; + + anon_vma_lock_write(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + pte_ptl = pte_lockptr(mm, pmd); + + mmun_start = address; + mmun_end = address + HPAGE_PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. 
This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. + */ + _pmd = pmdp_clear_flush(vma, address, pmd); + spin_unlock(pmd_ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + spin_lock(pte_ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(pte_ptl); + + if (unlikely(!isolated)) { + pte_unmap(pte); + spin_lock(pmd_ptl); + BUG_ON(!pmd_none(*pmd)); + /* + * We can only use set_pmd_at when establishing + * hugepmds and never for establishing regular pmds that + * points to regular pagetables. Use pmd_populate for that + */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); + spin_unlock(pmd_ptl); + anon_vma_unlock_write(vma->anon_vma); + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock_write(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); + pte_unmap(pte); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + + _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. 
+ */ + smp_wmb(); + + spin_lock(pmd_ptl); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache_pmd(vma, address, pmd); + spin_unlock(pmd_ptl); + + *hpage = NULL; + + khugepaged_pages_collapsed++; +out_up_write: + up_write(&mm->mmap_sem); + return; + +out: + mem_cgroup_cancel_charge(new_page, memcg); + goto out_up_write; +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, none_or_zero = 0; + struct page *page; + unsigned long _address; + spinlock_t *ptl; + int node = NUMA_NO_NODE; + bool writable = false, referenced = false; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pmd = mm_find_pmd(mm, address); + if (!pmd) + goto out; + + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (++none_or_zero <= khugepaged_max_ptes_none) + continue; + else + goto out_unmap; + } + if (!pte_present(pteval)) + goto out_unmap; + if (pte_write(pteval)) + writable = true; + + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + goto out_unmap; + /* + * Record which node the original page is from and save this + * information to khugepaged_node_load[]. + * Khupaged will allocate hugepage from the node has the max + * hit record. 
+ */ + node = page_to_nid(page); + if (khugepaged_scan_abort(node)) + goto out_unmap; + khugepaged_node_load[node]++; + VM_BUG_ON_PAGE(PageCompound(page), page); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + goto out_unmap; + /* + * cannot use mapcount: can't collapse if there's a gup pin. + * The page must only be referenced by the scanned process + * and page swap cache. + */ + if (page_count(page) != 1 + !!PageSwapCache(page)) + goto out_unmap; + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = true; + } + if (referenced && writable) + ret = 1; +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) { + node = khugepaged_find_target_node(); + /* collapse_huge_page will return with the mmap_sem released */ + collapse_huge_page(mm, address, hpage, vma, node); + } +out: + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hash_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. 
+ * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) + __releases(&khugepaged_mm_lock) + __acquires(&khugepaged_mm_lock) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + if (!hugepage_vma_check(vma)) { +skip: + progress++; + continue; + } + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) + goto skip; + if (khugepaged_scan.address > hend) + goto skip; + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto 
breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. + */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + kthread_should_stop(); +} + +static void khugepaged_do_scan(void) +{ + struct page *hpage = NULL; + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + bool wait = true; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + if (!khugepaged_prealloc_page(&hpage, &wait)) + break; + + cond_resched(); + + if (unlikely(kthread_should_stop() || freezing(current))) + break; + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + &hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } + + if (!IS_ERR_OR_NULL(hpage)) + put_page(hpage); +} + +static void khugepaged_wait_work(void) +{ + try_to_freeze(); + + if 
(khugepaged_has_work()) { + if (!khugepaged_scan_sleep_millisecs) + return; + + wait_event_freezable_timeout(khugepaged_wait, + kthread_should_stop(), + msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); + return; + } + + if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_freezable(); + set_user_nice(current, MAX_NICE); + + while (!kthread_should_stop()) { + khugepaged_do_scan(); + khugepaged_wait_work(); + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + return 0; +} + +static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd) +{ + struct mm_struct *mm = vma->vm_mm; + pgtable_t pgtable; + pmd_t _pmd; + int i; + + pmdp_clear_flush_notify(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); + entry = pte_mkspecial(entry); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + put_huge_zero_page(); +} + +void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd) +{ + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & HPAGE_PMD_MASK; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); + + mmun_start = haddr; + mmun_end = haddr + HPAGE_PMD_SIZE; +again: + 
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + return; + } + if (is_huge_zero_pmd(*pmd)) { + __split_huge_zero_page_pmd(vma, haddr, pmd); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + return; + } + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!page_count(page), page); + get_page(page); + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + split_huge_page(page); + + put_page(page); + + /* + * We don't always have down_write of mmap_sem here: a racing + * do_huge_pmd_wp_page() might have copied-on-write to another + * huge page before our split_huge_page() got the anon_vma lock. + */ + if (unlikely(pmd_trans_huge(*pmd))) + goto again; +} + +void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, + pmd_t *pmd) +{ + struct vm_area_struct *vma; + + vma = find_vma(mm, address); + BUG_ON(vma == NULL); + split_huge_page_pmd(vma, address, pmd); +} + +static void split_huge_page_address(struct mm_struct *mm, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return; + /* + * Caller holds the mmap_sem write mode, so a huge pmd cannot + * materialize from under us. + */ + split_huge_page_pmd_mm(mm, address, pmd); +} + +void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + /* + * If the new start address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. 
+ */ + if (start & ~HPAGE_PMD_MASK && + (start & HPAGE_PMD_MASK) >= vma->vm_start && + (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, start); + + /* + * If the new end address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (end & ~HPAGE_PMD_MASK && + (end & HPAGE_PMD_MASK) >= vma->vm_start && + (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, end); + + /* + * If we're also updating the vma->vm_next->vm_start, if the new + * vm_next->vm_start isn't page aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. + */ + if (adjust_next > 0) { + struct vm_area_struct *next = vma->vm_next; + unsigned long nstart = next->vm_start; + nstart += adjust_next << PAGE_SHIFT; + if (nstart & ~HPAGE_PMD_MASK && + (nstart & HPAGE_PMD_MASK) >= next->vm_start && + (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) + split_huge_page_address(next->vm_mm, nstart); + } +} diff --git a/kernel/mm/hugetlb.c b/kernel/mm/hugetlb.c new file mode 100644 index 000000000..271e44327 --- /dev/null +++ b/kernel/mm/hugetlb.c @@ -0,0 +1,3957 @@ +/* + * Generic hugetlb support. 
+ * (C) Nadia Yvette Chambers, April 2004 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include "internal.h" + +int hugepages_treat_as_movable; + +int hugetlb_max_hstate __read_mostly; +unsigned int default_hstate_idx; +struct hstate hstates[HUGE_MAX_HSTATE]; + +__initdata LIST_HEAD(huge_boot_pages); + +/* for command line parsing */ +static struct hstate * __initdata parsed_hstate; +static unsigned long __initdata default_hstate_max_huge_pages; +static unsigned long __initdata default_hstate_size; + +/* + * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, + * free_huge_pages, and surplus_huge_pages. + */ +DEFINE_SPINLOCK(hugetlb_lock); + +/* + * Serializes faults on the same logical page. This is used to + * prevent spurious OOMs when the hugepage pool is fully utilized. 
+ */ +static int num_fault_mutexes; +static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; + +/* Forward declaration */ +static int hugetlb_acct_memory(struct hstate *h, long delta); + +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +{ + bool free = (spool->count == 0) && (spool->used_hpages == 0); + + spin_unlock(&spool->lock); + + /* If no pages are used, and no other handles to the subpool + * remain, give up any reservations mased on minimum size and + * free the subpool */ + if (free) { + if (spool->min_hpages != -1) + hugetlb_acct_memory(spool->hstate, + -spool->min_hpages); + kfree(spool); + } +} + +struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, + long min_hpages) +{ + struct hugepage_subpool *spool; + + spool = kzalloc(sizeof(*spool), GFP_KERNEL); + if (!spool) + return NULL; + + spin_lock_init(&spool->lock); + spool->count = 1; + spool->max_hpages = max_hpages; + spool->hstate = h; + spool->min_hpages = min_hpages; + + if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { + kfree(spool); + return NULL; + } + spool->rsv_hpages = min_hpages; + + return spool; +} + +void hugepage_put_subpool(struct hugepage_subpool *spool) +{ + spin_lock(&spool->lock); + BUG_ON(!spool->count); + spool->count--; + unlock_or_release_subpool(spool); +} + +/* + * Subpool accounting for allocating and reserving pages. + * Return -ENOMEM if there are not enough resources to satisfy the + * the request. Otherwise, return the number of pages by which the + * global pools must be adjusted (upward). The returned value may + * only be different than the passed value (delta) in the case where + * a subpool minimum size must be manitained. 
+ */ +static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, + long delta) +{ + long ret = delta; + + if (!spool) + return ret; + + spin_lock(&spool->lock); + + if (spool->max_hpages != -1) { /* maximum size accounting */ + if ((spool->used_hpages + delta) <= spool->max_hpages) + spool->used_hpages += delta; + else { + ret = -ENOMEM; + goto unlock_ret; + } + } + + if (spool->min_hpages != -1) { /* minimum size accounting */ + if (delta > spool->rsv_hpages) { + /* + * Asking for more reserves than those already taken on + * behalf of subpool. Return difference. + */ + ret = delta - spool->rsv_hpages; + spool->rsv_hpages = 0; + } else { + ret = 0; /* reserves already accounted for */ + spool->rsv_hpages -= delta; + } + } + +unlock_ret: + spin_unlock(&spool->lock); + return ret; +} + +/* + * Subpool accounting for freeing and unreserving pages. + * Return the number of global page reservations that must be dropped. + * The return value may only be different than the passed value (delta) + * in the case where a subpool minimum size must be maintained. + */ +static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, + long delta) +{ + long ret = delta; + + if (!spool) + return delta; + + spin_lock(&spool->lock); + + if (spool->max_hpages != -1) /* maximum size accounting */ + spool->used_hpages -= delta; + + if (spool->min_hpages != -1) { /* minimum size accounting */ + if (spool->rsv_hpages + delta <= spool->min_hpages) + ret = 0; + else + ret = spool->rsv_hpages + delta - spool->min_hpages; + + spool->rsv_hpages += delta; + if (spool->rsv_hpages > spool->min_hpages) + spool->rsv_hpages = spool->min_hpages; + } + + /* + * If hugetlbfs_put_super couldn't free spool due to an outstanding + * quota reference, free it now. 
/*
 * Work out how many pages covering the range [f, t) would add to the
 * regions already recorded in @resv.
 *
 * Existing regions are not merged or resized here.  The only change made
 * to the list is the insertion of a zero-length placeholder region at @f
 * when the range does not touch any existing region.
 *
 * Returns the number of pages not yet covered by existing regions, or
 * -ENOMEM if the placeholder region cannot be allocated.
 */
static long region_chg(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *nrg = NULL;
	long chg = 0;

retry:
	spin_lock(&resv->lock);
	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		if (!nrg) {
			/*
			 * The spinlock is released around the GFP_KERNEL
			 * allocation, so the list may change meanwhile;
			 * redo the lookup from the top afterwards.
			 */
			spin_unlock(&resv->lock);
			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
			if (!nrg)
				return -ENOMEM;

			nrg->from = f;
			nrg->to   = f;
			INIT_LIST_HEAD(&nrg->link);
			goto retry;
		}

		/* Insert the placeholder just before rg. */
		list_add(&nrg->link, rg->link.prev);
		chg = t - f;
		goto out_nrg;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			goto out;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}

out:
	spin_unlock(&resv->lock);
	/*
	 * Either nrg is still NULL, or we raced: a region was allocated on
	 * an earlier pass but is no longer needed.  Free it in both cases.
	 */
	kfree(nrg);
	return chg;
out_nrg:
	/* nrg is now linked into the list and must not be freed. */
	spin_unlock(&resv->lock);
	return chg;
}
*/ + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + chg += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + } + +out: + spin_unlock(&resv->lock); + return chg; +} + +static long region_count(struct resv_map *resv, long f, long t) +{ + struct list_head *head = &resv->regions; + struct file_region *rg; + long chg = 0; + + spin_lock(&resv->lock); + /* Locate each segment we overlap with, and count that overlap. */ + list_for_each_entry(rg, head, link) { + long seg_from; + long seg_to; + + if (rg->to <= f) + continue; + if (rg->from >= t) + break; + + seg_from = max(rg->from, f); + seg_to = min(rg->to, t); + + chg += seg_to - seg_from; + } + spin_unlock(&resv->lock); + + return chg; +} + +/* + * Convert the address within this vma to the page offset within + * the mapping, in pagecache page units; huge pages here. + */ +static pgoff_t vma_hugecache_offset(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + return ((address - vma->vm_start) >> huge_page_shift(h)) + + (vma->vm_pgoff >> huge_page_order(h)); +} + +pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + unsigned long address) +{ + return vma_hugecache_offset(hstate_vma(vma), vma, address); +} + +/* + * Return the size of the pages allocated when backing a VMA. In the majority + * cases this will be same size as used by the page table entries. + */ +unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + struct hstate *hstate; + + if (!is_vm_hugetlb_page(vma)) + return PAGE_SIZE; + + hstate = hstate_vma(vma); + + return 1UL << huge_page_shift(hstate); +} +EXPORT_SYMBOL_GPL(vma_kernel_pagesize); + +/* + * Return the page size being used by the MMU to back a VMA. In the majority + * of cases, the page size used by the kernel matches the MMU size. On + * architectures where it differs, an architecture-specific version of this + * function is required. 
/*
 * Generic fallback for the page size the MMU uses to back @vma: it
 * reports the same value as vma_kernel_pagesize().  It is compiled only
 * when vma_mmu_pagesize has not already been defined as a macro.
 */
#ifndef vma_mmu_pagesize
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	unsigned long kernel_size = vma_kernel_pagesize(vma);

	return kernel_size;
}
#endif
+ */ +static unsigned long get_vma_private_data(struct vm_area_struct *vma) +{ + return (unsigned long)vma->vm_private_data; +} + +static void set_vma_private_data(struct vm_area_struct *vma, + unsigned long value) +{ + vma->vm_private_data = (void *)value; +} + +struct resv_map *resv_map_alloc(void) +{ + struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); + if (!resv_map) + return NULL; + + kref_init(&resv_map->refs); + spin_lock_init(&resv_map->lock); + INIT_LIST_HEAD(&resv_map->regions); + + return resv_map; +} + +void resv_map_release(struct kref *ref) +{ + struct resv_map *resv_map = container_of(ref, struct resv_map, refs); + + /* Clear out any active regions before we release the map. */ + region_truncate(resv_map, 0); + kfree(resv_map); +} + +static inline struct resv_map *inode_resv_map(struct inode *inode) +{ + return inode->i_mapping->private_data; +} + +static struct resv_map *vma_resv_map(struct vm_area_struct *vma) +{ + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + if (vma->vm_flags & VM_MAYSHARE) { + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + return inode_resv_map(inode); + + } else { + return (struct resv_map *)(get_vma_private_data(vma) & + ~HPAGE_RESV_MASK); + } +} + +static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) +{ + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + + set_vma_private_data(vma, (get_vma_private_data(vma) & + HPAGE_RESV_MASK) | (unsigned long)map); +} + +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +{ + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + + set_vma_private_data(vma, get_vma_private_data(vma) | flags); +} + +static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) +{ + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + + return (get_vma_private_data(vma) & flag) != 0; 
+} + +/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ +void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +{ + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + if (!(vma->vm_flags & VM_MAYSHARE)) + vma->vm_private_data = (void *)0; +} + +/* Returns true if the VMA has associated reserve pages */ +static int vma_has_reserves(struct vm_area_struct *vma, long chg) +{ + if (vma->vm_flags & VM_NORESERVE) { + /* + * This address is already reserved by other process(chg == 0), + * so, we should decrement reserved count. Without decrementing, + * reserve count remains after releasing inode, because this + * allocated page will go into page cache and is regarded as + * coming from reserved pool in releasing step. Currently, we + * don't have any other solution to deal with this situation + * properly, so add work-around here. + */ + if (vma->vm_flags & VM_MAYSHARE && chg == 0) + return 1; + else + return 0; + } + + /* Shared mappings always use reserves */ + if (vma->vm_flags & VM_MAYSHARE) + return 1; + + /* + * Only the process that called mmap() has reserves for + * private mappings. + */ + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return 1; + + return 0; +} + +static void enqueue_huge_page(struct hstate *h, struct page *page) +{ + int nid = page_to_nid(page); + list_move(&page->lru, &h->hugepage_freelists[nid]); + h->free_huge_pages++; + h->free_huge_pages_node[nid]++; +} + +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + list_for_each_entry(page, &h->hugepage_freelists[nid], lru) + if (!is_migrate_isolate_page(page)) + break; + /* + * if 'non-isolated free hugepage' not found on the list, + * the allocation fails. + */ + if (&h->hugepage_freelists[nid] == &page->lru) + return NULL; + list_move(&page->lru, &h->hugepage_activelist); + set_page_refcounted(page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + return page; +} + +/* Movability of hugepages depends on migration support. 
*/ +static inline gfp_t htlb_alloc_mask(struct hstate *h) +{ + if (hugepages_treat_as_movable || hugepage_migration_supported(h)) + return GFP_HIGHUSER_MOVABLE; + else + return GFP_HIGHUSER; +} + +static struct page *dequeue_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, + unsigned long address, int avoid_reserve, + long chg) +{ + struct page *page = NULL; + struct mempolicy *mpol; + nodemask_t *nodemask; + struct zonelist *zonelist; + struct zone *zone; + struct zoneref *z; + unsigned int cpuset_mems_cookie; + + /* + * A child process with MAP_PRIVATE mappings created by their parent + * have no page reserves. This check ensures that reservations are + * not "stolen". The child may still get SIGKILLed + */ + if (!vma_has_reserves(vma, chg) && + h->free_huge_pages - h->resv_huge_pages == 0) + goto err; + + /* If reserves cannot be used, ensure enough pages are in the pool */ + if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) + goto err; + +retry_cpuset: + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask(h), &mpol, &nodemask); + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + MAX_NR_ZONES - 1, nodemask) { + if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) { + page = dequeue_huge_page_node(h, zone_to_nid(zone)); + if (page) { + if (avoid_reserve) + break; + if (!vma_has_reserves(vma, chg)) + break; + + SetPagePrivate(page); + h->resv_huge_pages--; + break; + } + } + } + + mpol_cond_put(mpol); + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return page; + +err: + return NULL; +} + +/* + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. 
+ */ +static int next_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + nid = next_node(nid, *nodes_allowed); + if (nid == MAX_NUMNODES) + nid = first_node(*nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. + */ +static int hstate_next_node_to_alloc(struct hstate *h, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); + h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +/* + * helper for free_pool_huge_page() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. 
+ */
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
+{
+	int nid;
+
+	VM_BUG_ON(!nodes_allowed);
+
+	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+	return nid;
+}
+
+/*
+ * Iterate nodes_weight(*mask) times, each time storing in @node the node
+ * returned by hstate_next_node_to_{alloc|free}() (which also advances the
+ * hstate's saved position). @nr_nodes is the loop counter. The "|| 1"
+ * keeps the loop condition true when the returned node id is 0.
+ */
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
+	for ((nr_nodes) = nodes_weight(*(mask));			\
+		(nr_nodes) > 0 &&					\
+		(((node) = hstate_next_node_to_alloc(hs, mask)) || 1);	\
+		(nr_nodes)--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
+	for ((nr_nodes) = nodes_weight(*(mask));			\
+		(nr_nodes) > 0 &&					\
+		(((node) = hstate_next_node_to_free(hs, mask)) || 1);	\
+		(nr_nodes)--)
+
+#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+/*
+ * Undo prep_compound_gigantic_page(): turn every tail page back into an
+ * ordinary refcounted page and clear the head page's compound state.
+ */
+static void destroy_compound_gigantic_page(struct page *page,
+					unsigned long order)
+{
+	int i;
+	int nr_pages = 1 << order;
+	struct page *p = page + 1;
+
+	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+		__ClearPageTail(p);
+		set_page_refcounted(p);
+		p->first_page = NULL;
+	}
+
+	set_compound_order(page, 0);
+	__ClearPageHead(page);
+}
+
+/* Release the 1 << @order contiguous pages starting at @page. */
+static void free_gigantic_page(struct page *page, unsigned order)
+{
+	free_contig_range(page_to_pfn(page), 1 << order);
+}
+
+/* Returns 0 on success, otherwise alloc_contig_range()'s error code. */
+static int __alloc_gigantic_page(unsigned long start_pfn,
+				unsigned long nr_pages)
+{
+	unsigned long end_pfn = start_pfn + nr_pages;
+	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+}
+
+/*
+ * Check that every pfn in [start_pfn, start_pfn + nr_pages) is valid and
+ * that its page is not reserved, not referenced and not a hugetlb page.
+ */
+static bool pfn_range_valid_gigantic(unsigned long start_pfn,
+				unsigned long nr_pages)
+{
+	unsigned long i, end_pfn = start_pfn + nr_pages;
+	struct page *page;
+
+	for (i = start_pfn; i < end_pfn; i++) {
+		if (!pfn_valid(i))
+			return false;
+
+		page = pfn_to_page(i);
+
+		if (PageReserved(page))
+			return false;
+
+		if (page_count(page) > 0)
+			return false;
+
+		if (PageHuge(page))
+			return false;
+	}
+
+	return true;
+}
+
+/* True if the last pfn of the range still lies within @zone. */
+static bool zone_spans_last_pfn(const struct zone *zone,
+			unsigned long start_pfn, unsigned long nr_pages)
+{
+	unsigned long last_pfn = start_pfn + nr_pages - 1;
+	return zone_spans_pfn(zone, last_pfn);
+}
+
+/*
+ * Scan the zones of node @nid for a naturally aligned run of 1 << @order
+ * pages that passes pfn_range_valid_gigantic() and try to allocate it.
+ * Returns the first page of the range, or NULL if no range could be
+ * allocated.
+ */
+static struct page *alloc_gigantic_page(int nid, unsigned order)
+{
+	/* 1UL: do the shift in the width of nr_pages, not in signed int */
+	unsigned long nr_pages = 1UL << order;
+	unsigned long pfn, flags;
+	struct zone *z;
+	int ret;	/* __alloc_gigantic_page() returns an int errno */
+
+	z = NODE_DATA(nid)->node_zones;
+	for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
+		spin_lock_irqsave(&z->lock, flags);
+
+		pfn = ALIGN(z->zone_start_pfn, nr_pages);
+		while (zone_spans_last_pfn(z, pfn, nr_pages)) {
+			if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+				/*
+				 * We release the zone lock here because
+				 * alloc_contig_range() will also lock the zone
+				 * at some point. If there's an allocation
+				 * spinning on this lock, it may win the race
+				 * and cause alloc_contig_range() to fail...
+				 */
+				spin_unlock_irqrestore(&z->lock, flags);
+				ret = __alloc_gigantic_page(pfn, nr_pages);
+				if (!ret)
+					return pfn_to_page(pfn);
+				spin_lock_irqsave(&z->lock, flags);
+			}
+			pfn += nr_pages;
+		}
+
+		spin_unlock_irqrestore(&z->lock, flags);
+	}
+
+	return NULL;
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
+static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+
+/* Allocate one gigantic page on @nid and add it to the hstate pool. */
+static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
+{
+	struct page *page;
+
+	page = alloc_gigantic_page(nid, huge_page_order(h));
+	if (page) {
+		prep_compound_gigantic_page(page, huge_page_order(h));
+		prep_new_huge_page(h, page, nid);
+	}
+
+	return page;
+}
+
+/*
+ * Try each node in @nodes_allowed, round-robin, until one gigantic page
+ * has been allocated. Returns 1 on success, 0 if every node failed.
+ */
+static int alloc_fresh_gigantic_page(struct hstate *h,
+				nodemask_t *nodes_allowed)
+{
+	struct page *page = NULL;
+	int nr_nodes, node;
+
+	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+		page = alloc_fresh_gigantic_page_node(h, node);
+		if (page)
+			return 1;
+	}
+
+	return 0;
+}
+
+static inline bool gigantic_page_supported(void) { return true; }
+#else
+static inline bool gigantic_page_supported(void) { return false; }
+static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void destroy_compound_gigantic_page(struct page *page, + unsigned long order) { } +static inline int alloc_fresh_gigantic_page(struct hstate *h, + nodemask_t *nodes_allowed) { return 0; } +#endif + +static void update_and_free_page(struct hstate *h, struct page *page) +{ + /* Drop @page from the hstate's nr_huge_pages accounting, reset its per-subpage flags and compound destructor, and free it. Does nothing for gigantic hstates when gigantic pages are not supported. */ int i; + + if (hstate_is_gigantic(h) && !gigantic_page_supported()) + return; + + h->nr_huge_pages--; + h->nr_huge_pages_node[page_to_nid(page)]--; + for (i = 0; i < pages_per_huge_page(h); i++) { + page[i].flags &= ~(1 << PG_locked | 1 << PG_error | + 1 << PG_referenced | 1 << PG_dirty | + 1 << PG_active | 1 << PG_private | + 1 << PG_writeback); + } + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); + set_compound_page_dtor(page, NULL); + set_page_refcounted(page); + if (hstate_is_gigantic(h)) { + destroy_compound_gigantic_page(page, huge_page_order(h)); + free_gigantic_page(page, huge_page_order(h)); + } else { + arch_release_hugepage(page); + __free_pages(page, huge_page_order(h)); + } +} + +struct hstate *size_to_hstate(unsigned long size) +{ + /* Return the hstate whose huge page size equals @size, or NULL if there is none. */ struct hstate *h; + + for_each_hstate(h) { + if (huge_page_size(h) == size) + return h; + } + return NULL; +} + +/* + * Test to determine whether the hugepage is "active/in-use" (i.e. being linked + * to hstate->hugepage_activelist.) + * + * This function can be called for tail pages, but never returns true for them. + */ +bool page_huge_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHuge(page), page); + return PageHead(page) && PagePrivate(&page[1]); +} + +/* Mark a hugepage active by setting PG_private on its first tail page; never called for tail page */ +static void set_page_huge_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHeadHuge(page), page); + SetPagePrivate(&page[1]); +} + +static void clear_page_huge_active(struct page *page) +{ + /* Clear the active marker kept in PG_private of the first tail page */ VM_BUG_ON_PAGE(!PageHeadHuge(page), page); + ClearPagePrivate(&page[1]); +} + +void free_huge_page(struct page *page) +{ + /* + * Can't pass hstate in here because it is called from the + * compound page destructor. 
+ */ + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); + struct hugepage_subpool *spool = + (struct hugepage_subpool *)page_private(page); + bool restore_reserve; + + set_page_private(page, 0); + page->mapping = NULL; + BUG_ON(page_count(page)); + BUG_ON(page_mapcount(page)); + restore_reserve = PagePrivate(page); /* PG_private on the head page decides whether a reservation is handed back below */ + ClearPagePrivate(page); + + /* + * A return code of zero implies that the subpool will be under its + * minimum size if the reservation is not restored after page is free. + * Therefore, force restore_reserve operation. + */ + if (hugepage_subpool_put_pages(spool, 1) == 0) + restore_reserve = true; + + spin_lock(&hugetlb_lock); + clear_page_huge_active(page); + hugetlb_cgroup_uncharge_page(hstate_index(h), + pages_per_huge_page(h), page); + if (restore_reserve) + h->resv_huge_pages++; + + if (h->surplus_huge_pages_node[nid]) { + /* remove the page from active list */ + list_del(&page->lru); + update_and_free_page(h, page); + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; + } else { + arch_clear_hugepage_flags(page); + enqueue_huge_page(h, page); + } + spin_unlock(&hugetlb_lock); +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +{ + /* Install free_huge_page as the compound destructor and count the page in the hstate totals under hugetlb_lock. */ INIT_LIST_HEAD(&page->lru); + set_compound_page_dtor(page, free_huge_page); + spin_lock(&hugetlb_lock); + set_hugetlb_cgroup(page, NULL); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; + spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ +} + +static void prep_compound_gigantic_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + /* we rely on prep_new_huge_page to set the destructor */ + set_compound_order(page, order); + __SetPageHead(page); + __ClearPageReserved(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + /* + * For gigantic hugepages allocated through bootmem at + * boot, it's safer to be consistent with the 
not-gigantic + * hugepages and clear the PG_reserved bit from all tail pages + * too. Otherwise drivers using get_user_pages() to access tail + * pages may get the reference counting wrong if they see + * PG_reserved set on a tail page (despite the head page not + * having PG_reserved set). Enforcing this consistency between + * head and tail pages allows drivers to optimize away a check + * on the head page when they need to know if put_page() is needed + * after get_user_pages(). + */ + __ClearPageReserved(p); + set_page_count(p, 0); + p->first_page = page; + /* Make sure p->first_page is always valid for PageTail() */ + smp_wmb(); + __SetPageTail(p); + } +} + +/* + * PageHuge() only returns true for hugetlbfs pages, but not for normal or + * transparent huge pages. See the PageTransHuge() documentation for more + * details. + */ +int PageHuge(struct page *page) +{ + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + return get_compound_page_dtor(page) == free_huge_page; +} +EXPORT_SYMBOL_GPL(PageHuge); + +/* + * PageHeadHuge() only returns true for hugetlbfs head page, but not for + * normal or transparent huge pages. 
+ */ +int PageHeadHuge(struct page *page_head) +{ + if (!PageHead(page_head)) + return 0; + + return get_compound_page_dtor(page_head) == free_huge_page; +} + +pgoff_t __basepage_index(struct page *page) +{ + struct page *page_head = compound_head(page); + pgoff_t index = page_index(page_head); + unsigned long compound_idx; + + if (!PageHuge(page_head)) + return page_index(page); + + if (compound_order(page_head) >= MAX_ORDER) + compound_idx = page_to_pfn(page) - page_to_pfn(page_head); + else + compound_idx = page - page_head; + + return (index << compound_order(page_head)) + compound_idx; +} + +static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + page = alloc_pages_exact_node(nid, + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| + __GFP_REPEAT|__GFP_NOWARN, + huge_page_order(h)); + if (page) { + if (arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + return NULL; + } + prep_new_huge_page(h, page, nid); + } + + return page; +} + +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +{ + struct page *page; + int nr_nodes, node; + int ret = 0; + + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + page = alloc_fresh_huge_page_node(h, node); + if (page) { + ret = 1; + break; + } + } + + if (ret) + count_vm_event(HTLB_BUDDY_PGALLOC); + else + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + return ret; +} + +/* + * Free huge page from pool from next node to free. + * Attempt to keep persistent huge pages more or less + * balanced over allowed nodes. + * Called with hugetlb_lock locked. + */ +static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + bool acct_surplus) +{ + int nr_nodes, node; + int ret = 0; + + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + /* + * If we're returning unused surplus pages, only examine + * nodes with surplus pages. 
+ */ + if ((!acct_surplus || h->surplus_huge_pages_node[node]) && + !list_empty(&h->hugepage_freelists[node])) { + struct page *page = + list_entry(h->hugepage_freelists[node].next, + struct page, lru); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[node]--; + if (acct_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[node]--; + } + update_and_free_page(h, page); + ret = 1; + break; + } + } + + return ret; +} + +/* + * Dissolve a given free hugepage into free buddy pages. This function does + * nothing for in-use (including surplus) hugepages. + */ +static void dissolve_free_huge_page(struct page *page) +{ + spin_lock(&hugetlb_lock); + if (PageHuge(page) && !page_count(page)) { + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + update_and_free_page(h, page); + } + spin_unlock(&hugetlb_lock); +} + +/* + * Dissolve free hugepages in a given pfn range. Used by memory hotplug to + * make specified memory blocks removable from the system. + * Note that start_pfn should aligned with (minimum) hugepage size. 
+ */ +void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned int order = 8 * sizeof(void *); /* starts at the bit width of a pointer; lowered to the smallest hstate order below */ + unsigned long pfn; + struct hstate *h; + + if (!hugepages_supported()) + return; + + /* Set scan step to minimum hugepage size */ + for_each_hstate(h) + if (order > huge_page_order(h)) + order = huge_page_order(h); + VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) + dissolve_free_huge_page(pfn_to_page(pfn)); +} + +static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) +{ + /* Allocate a surplus huge page from the buddy allocator, on @nid or on any node for NUMA_NO_NODE. Returns NULL for gigantic hstates, when the overcommit limit is reached, or when the allocation fails. */ struct page *page; + unsigned int r_nid; + + if (hstate_is_gigantic(h)) + return NULL; + + /* + * Assume we will successfully allocate the surplus page to + * prevent racing processes from causing the surplus to exceed + * overcommit. + * + * This however introduces a different race, where a process B + * tries to grow the static hugepage pool while alloc_pages() is + * called by process A. B will only examine the per-node + * counters in determining if surplus huge pages can be + * converted to normal huge pages in adjust_pool_surplus(). A + * won't be able to increment the per-node counter, until the + * lock is dropped by B, but B doesn't drop hugetlb_lock until + * no more huge pages can be converted from surplus to normal + * state (and doesn't try to convert again). Thus, we have a + * case where a surplus huge page exists, the pool is grown, and + * the surplus huge page still exists after, even though it + * should just have been converted to a normal huge page. This + * does not leak memory, though, as the hugepage will be freed + * once it is out of use. It also does not allow the counters to + * go out of whack in adjust_pool_surplus() as we don't modify + * the node values until we've gotten the hugepage and only the + * per-node value is checked there. 
+ */ + spin_lock(&hugetlb_lock); + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { + spin_unlock(&hugetlb_lock); + return NULL; + } else { + h->nr_huge_pages++; + h->surplus_huge_pages++; + } + spin_unlock(&hugetlb_lock); + + if (nid == NUMA_NO_NODE) + page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| + __GFP_REPEAT|__GFP_NOWARN, + huge_page_order(h)); + else + page = alloc_pages_exact_node(nid, + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| + __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + + if (page && arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + page = NULL; + } + + spin_lock(&hugetlb_lock); + if (page) { + INIT_LIST_HEAD(&page->lru); + r_nid = page_to_nid(page); + set_compound_page_dtor(page, free_huge_page); + set_hugetlb_cgroup(page, NULL); + /* + * We incremented the global counters already; only the + * per-node counters of the node the page came from remain. + */ + h->nr_huge_pages_node[r_nid]++; + h->surplus_huge_pages_node[r_nid]++; + __count_vm_event(HTLB_BUDDY_PGALLOC); + } else { + /* Allocation failed: undo the optimistic global accounting */ h->nr_huge_pages--; + h->surplus_huge_pages--; + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + } + spin_unlock(&hugetlb_lock); + + return page; +} + +/* + * This allocation function is useful in the context where vma is irrelevant. + * E.g. soft-offlining uses this function because it only cares about the + * physical address of the error page. + * + * A free page is dequeued from @nid only if the pool has more free than + * reserved pages; otherwise a surplus page is requested from + * alloc_buddy_huge_page(). + */ +struct page *alloc_huge_page_node(struct hstate *h, int nid) +{ + struct page *page = NULL; + + spin_lock(&hugetlb_lock); + if (h->free_huge_pages - h->resv_huge_pages > 0) + page = dequeue_huge_page_node(h, nid); + spin_unlock(&hugetlb_lock); + + if (!page) + page = alloc_buddy_huge_page(h, nid); + + return page; +} + +/* + * Increase the hugetlb pool such that it can accommodate a reservation + * of size 'delta'. 
+ * + * Entered and left with hugetlb_lock held; the lock is dropped around the + * page allocations and around freeing of excess pages. Returns 0 once the + * reservation has been committed, -ENOMEM if not enough pages could be + * allocated. + */ +static int gather_surplus_pages(struct hstate *h, int delta) +{ + struct list_head surplus_list; + struct page *page, *tmp; + int ret, i; + int needed, allocated; + bool alloc_ok = true; + + needed = (h->resv_huge_pages + delta) - h->free_huge_pages; + if (needed <= 0) { + /* The free pages already cover the reservation: just commit it */ h->resv_huge_pages += delta; + return 0; + } + + allocated = 0; + INIT_LIST_HEAD(&surplus_list); + + ret = -ENOMEM; +retry: + spin_unlock(&hugetlb_lock); + for (i = 0; i < needed; i++) { + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + if (!page) { + alloc_ok = false; + break; + } + list_add(&page->lru, &surplus_list); + } + allocated += i; + + /* + * After retaking hugetlb_lock, we need to recalculate 'needed' + * because either resv_huge_pages or free_huge_pages may have changed. + */ + spin_lock(&hugetlb_lock); + needed = (h->resv_huge_pages + delta) - + (h->free_huge_pages + allocated); + if (needed > 0) { + if (alloc_ok) + goto retry; + /* + * We were not able to allocate enough pages to + * satisfy the entire reservation so we free what + * we've allocated so far. + */ + goto free; + } + /* + * The surplus_list now contains _at_least_ the number of extra pages + * needed to accommodate the reservation. Add the appropriate number + * of pages to the hugetlb pool and free the extras back to the buddy + * allocator. Commit the entire reservation here to prevent another + * process from stealing the pages as they are added to the pool but + * before they are reserved. + */ + needed += allocated; + h->resv_huge_pages += delta; + ret = 0; + + /* Free the needed pages to the hugetlb pool */ + list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + if ((--needed) < 0) + break; + /* + * This page is now managed by the hugetlb allocator and has + * no users -- drop the buddy allocator's reference. 
+ */ + put_page_testzero(page); + VM_BUG_ON_PAGE(page_count(page), page); + enqueue_huge_page(h, page); + } +free: + spin_unlock(&hugetlb_lock); + + /* Free unnecessary surplus pages to the buddy allocator */ + list_for_each_entry_safe(page, tmp, &surplus_list, lru) + put_page(page); + spin_lock(&hugetlb_lock); + + return ret; +} + +/* + * When releasing a hugetlb pool reservation, any surplus pages that were + * allocated to satisfy the reservation must be explicitly freed if they were + * never used. + * Called with hugetlb_lock held. + */ +static void return_unused_surplus_pages(struct hstate *h, + unsigned long unused_resv_pages) +{ + unsigned long nr_pages; + + /* Uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; + + /* Cannot return gigantic pages currently */ + if (hstate_is_gigantic(h)) + return; + + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); + + /* + * We want to release as many surplus pages as possible, spread + * evenly across all nodes with memory. Iterate across these nodes + * until we can no longer free unreserved surplus pages. This occurs + * when the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the freed pages across the + * on-line nodes with memory and will handle the hstate accounting. + */ + while (nr_pages--) { + if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) + break; + cond_resched_lock(&hugetlb_lock); + } +} + +/* + * Determine if the huge page at addr within the vma has an associated + * reservation. Where it does not we will need to logically increase + * reservation and actually increase subpool usage before an allocation + * can occur. Where any new reservation would be required the + * reservation change is prepared, but not committed. Once the page + * has been allocated from the subpool and instantiated the change should + * be committed via vma_commit_reservation. No action is required on + * failure. 
+ */ +static long vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + struct resv_map *resv; + pgoff_t idx; + long chg; + + resv = vma_resv_map(vma); + if (!resv) + return 1; + + idx = vma_hugecache_offset(h, vma, addr); + chg = region_chg(resv, idx, idx + 1); + + if (vma->vm_flags & VM_MAYSHARE) + return chg; + else + return chg < 0 ? chg : 0; +} +static void vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + struct resv_map *resv; + pgoff_t idx; + + resv = vma_resv_map(vma); + if (!resv) + return; + + idx = vma_hugecache_offset(h, vma, addr); + region_add(resv, idx, idx + 1); +} + +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) +{ + struct hugepage_subpool *spool = subpool_vma(vma); + struct hstate *h = hstate_vma(vma); + struct page *page; + long chg; + int ret, idx; + struct hugetlb_cgroup *h_cg; + + idx = hstate_index(h); + /* + * Processes that did not create the mapping will have no + * reserves and will not have accounted against subpool + * limit. Check that the subpool limit can be made before + * satisfying the allocation MAP_NORESERVE mappings may also + * need pages and subpool limit allocated allocated if no reserve + * mapping overlaps. 
+ */ + chg = vma_needs_reservation(h, vma, addr); + if (chg < 0) + return ERR_PTR(-ENOMEM); + if (chg || avoid_reserve) + if (hugepage_subpool_get_pages(spool, 1) < 0) + return ERR_PTR(-ENOSPC); + + ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); + if (ret) + goto out_subpool_put; + + spin_lock(&hugetlb_lock); + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); + if (!page) { + spin_unlock(&hugetlb_lock); + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + if (!page) + goto out_uncharge_cgroup; + + spin_lock(&hugetlb_lock); + list_move(&page->lru, &h->hugepage_activelist); + /* Fall through */ + } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + spin_unlock(&hugetlb_lock); + + set_page_private(page, (unsigned long)spool); + + vma_commit_reservation(h, vma, addr); + return page; + +out_uncharge_cgroup: + hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); +out_subpool_put: + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); + return ERR_PTR(-ENOSPC); +} + +/* + * alloc_huge_page()'s wrapper which simply returns the page if allocation + * succeeds, otherwise NULL. This function is called from new_vma_page(), + * where no ERR_VALUE is expected to be returned. + */ +struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) +{ + struct page *page = alloc_huge_page(vma, addr, avoid_reserve); + if (IS_ERR(page)) + page = NULL; + return page; +} + +int __weak alloc_bootmem_huge_page(struct hstate *h) +{ + struct huge_bootmem_page *m; + int nr_nodes, node; + + for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { + void *addr; + + addr = memblock_virt_alloc_try_nid_nopanic( + huge_page_size(h), huge_page_size(h), + 0, BOOTMEM_ALLOC_ACCESSIBLE, node); + if (addr) { + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). 
+ */ + m = addr; + goto found; + } + } + return 0; + +found: + BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); + /* Put them into a private list first because mem_map is not up yet */ + list_add(&m->list, &huge_boot_pages); + m->hstate = h; + return 1; +} + +static void __init prep_compound_huge_page(struct page *page, int order) +{ + if (unlikely(order > (MAX_ORDER - 1))) + prep_compound_gigantic_page(page, order); + else + prep_compound_page(page, order); +} + +/* Put bootmem huge pages into the standard lists after mem_map is up */ +static void __init gather_bootmem_prealloc(void) +{ + struct huge_bootmem_page *m; + + list_for_each_entry(m, &huge_boot_pages, list) { + struct hstate *h = m->hstate; + struct page *page; + +#ifdef CONFIG_HIGHMEM + page = pfn_to_page(m->phys >> PAGE_SHIFT); + memblock_free_late(__pa(m), + sizeof(struct huge_bootmem_page)); +#else + page = virt_to_page(m); +#endif + WARN_ON(page_count(page) != 1); + prep_compound_huge_page(page, h->order); + WARN_ON(PageReserved(page)); + prep_new_huge_page(h, page, page_to_nid(page)); + /* + * If we had gigantic hugepages allocated at boot time, we need + * to restore the 'stolen' pages to totalram_pages in order to + * fix confusing memory reports from free(1) and another + * side-effects, like CommitLimit going negative. 
+ */ + if (hstate_is_gigantic(h)) + adjust_managed_page_count(page, 1 << h->order); + } +} + +static void __init hugetlb_hstate_alloc_pages(struct hstate *h) +{ + unsigned long i; + + for (i = 0; i < h->max_huge_pages; ++i) { + if (hstate_is_gigantic(h)) { + if (!alloc_bootmem_huge_page(h)) + break; + } else if (!alloc_fresh_huge_page(h, + &node_states[N_MEMORY])) + break; + } + h->max_huge_pages = i; +} + +static void __init hugetlb_init_hstates(void) +{ + struct hstate *h; + + for_each_hstate(h) { + /* oversize hugepages were init'ed in early boot */ + if (!hstate_is_gigantic(h)) + hugetlb_hstate_alloc_pages(h); + } +} + +static char * __init memfmt(char *buf, unsigned long n) +{ + if (n >= (1UL << 30)) + sprintf(buf, "%lu GB", n >> 30); + else if (n >= (1UL << 20)) + sprintf(buf, "%lu MB", n >> 20); + else + sprintf(buf, "%lu KB", n >> 10); + return buf; +} + +static void __init report_hugepages(void) +{ + struct hstate *h; + + for_each_hstate(h) { + char buf[32]; + pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", + memfmt(buf, huge_page_size(h)), + h->free_huge_pages); + } +} + +#ifdef CONFIG_HIGHMEM +static void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) +{ + int i; + + if (hstate_is_gigantic(h)) + return; + + for_each_node_mask(i, *nodes_allowed) { + struct page *page, *next; + struct list_head *freel = &h->hugepage_freelists[i]; + list_for_each_entry_safe(page, next, freel, lru) { + if (count >= h->nr_huge_pages) + return; + if (PageHighMem(page)) + continue; + list_del(&page->lru); + update_and_free_page(h, page); + h->free_huge_pages--; + h->free_huge_pages_node[page_to_nid(page)]--; + } + } +} +#else +static inline void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) +{ +} +#endif + +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. 
+ */ +static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, + int delta) +{ + int nr_nodes, node; + + VM_BUG_ON(delta != -1 && delta != 1); + + if (delta < 0) { + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node]) + goto found; + } + } else { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node] < + h->nr_huge_pages_node[node]) + goto found; + } + } + return 0; + +found: + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[node] += delta; + return 1; +} + +#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) +static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) +{ + unsigned long min_count, ret; + + if (hstate_is_gigantic(h) && !gigantic_page_supported()) + return h->max_huge_pages; + + /* + * Increase the pool size + * First take pages out of surplus state. Then make up the + * remaining difference by allocating fresh huge pages. + * + * We might race with alloc_buddy_huge_page() here and be unable + * to convert a surplus huge page to a normal huge page. That is + * not critical, though, it just means the overall size of the + * pool might be one hugepage larger than it needs to be, but + * within all the constraints specified by the sysctls. + */ + spin_lock(&hugetlb_lock); + while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, nodes_allowed, -1)) + break; + } + + while (count > persistent_huge_pages(h)) { + /* + * If this allocation races such that we no longer need the + * page, free_huge_page will handle it by freeing the page + * and reducing the surplus. + */ + spin_unlock(&hugetlb_lock); + if (hstate_is_gigantic(h)) + ret = alloc_fresh_gigantic_page(h, nodes_allowed); + else + ret = alloc_fresh_huge_page(h, nodes_allowed); + spin_lock(&hugetlb_lock); + if (!ret) + goto out; + + /* Bail for signals. 
Probably ctrl-c from user */ + if (signal_pending(current)) + goto out; + } + + /* + * Decrease the pool size + * First return free pages to the buddy allocator (being careful + * to keep enough around to satisfy reservations). Then place + * pages into surplus state as needed so the pool will shrink + * to the desired size as pages become free. + * + * By placing pages into the surplus state independent of the + * overcommit value, we are allowing the surplus pool size to + * exceed overcommit. There are few sane options here. Since + * alloc_buddy_huge_page() is checking the global counter, + * though, we'll note that we're not allowed to exceed surplus + * and won't grow the pool anywhere else. Not until one of the + * sysctls are changed, or the surplus pages go out of use. + */ + min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; + min_count = max(count, min_count); + try_to_free_low(h, min_count, nodes_allowed); + while (min_count < persistent_huge_pages(h)) { + if (!free_pool_huge_page(h, nodes_allowed, 0)) + break; + cond_resched_lock(&hugetlb_lock); + } + while (count < persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, nodes_allowed, 1)) + break; + } +out: + ret = persistent_huge_pages(h); + spin_unlock(&hugetlb_lock); + return ret; +} + +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static struct kobject *hugepages_kobj; +static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; + +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); + +static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) +{ + int i; + + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = NUMA_NO_NODE; + return &hstates[i]; + } + + return kobj_to_node_hstate(kobj, nidp); +} + +static ssize_t 
nr_hugepages_show_common(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long nr_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + nr_huge_pages = h->nr_huge_pages; + else + nr_huge_pages = h->nr_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", nr_huge_pages); +} + +static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, + struct hstate *h, int nid, + unsigned long count, size_t len) +{ + int err; + NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); + + if (hstate_is_gigantic(h) && !gigantic_page_supported()) { + err = -EINVAL; + goto out; + } + + if (nid == NUMA_NO_NODE) { + /* + * global hstate attribute + */ + if (!(obey_mempolicy && + init_nodemask_of_mempolicy(nodes_allowed))) { + NODEMASK_FREE(nodes_allowed); + nodes_allowed = &node_states[N_MEMORY]; + } + } else if (nodes_allowed) { + /* + * per node hstate attribute: adjust count to global, + * but restrict alloc/free to the specified node. 
+ */ + count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; + init_nodemask_of_node(nodes_allowed, nid); + } else + nodes_allowed = &node_states[N_MEMORY]; + + h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); + + if (nodes_allowed != &node_states[N_MEMORY]) + NODEMASK_FREE(nodes_allowed); + + return len; +out: + NODEMASK_FREE(nodes_allowed); + return err; +} + +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, const char *buf, + size_t len) +{ + struct hstate *h; + unsigned long count; + int nid; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + + h = kobj_to_hstate(kobj, &nid); + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(false, kobj, buf, len); +} +HSTATE_ATTR(nr_hugepages); + +#ifdef CONFIG_NUMA + +/* + * hstate attribute for optionally mempolicy-based constraint on persistent + * huge page alloc/free. 
+ */ +static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(true, kobj, buf, len); +} +HSTATE_ATTR(nr_hugepages_mempolicy); +#endif + + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj, NULL); + return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); +} + +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj, NULL); + + if (hstate_is_gigantic(h)) + return -EINVAL; + + err = kstrtoul(buf, 10, &input); + if (err) + return err; + + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + spin_unlock(&hugetlb_lock); + + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long free_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + free_huge_pages = h->free_huge_pages; + else + free_huge_pages = h->free_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj, NULL); + return sprintf(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long surplus_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + 
if (nid == NUMA_NO_NODE) + surplus_huge_pages = h->surplus_huge_pages; + else + surplus_huge_pages = h->surplus_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, +#ifdef CONFIG_NUMA + &nr_hugepages_mempolicy_attr.attr, +#endif + NULL, +}; + +static struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, + struct kobject **hstate_kobjs, + struct attribute_group *hstate_attr_group) +{ + int retval; + int hi = hstate_index(h); + + hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); + if (!hstate_kobjs[hi]) + return -ENOMEM; + + retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); + if (retval) + kobject_put(hstate_kobjs[hi]); + + return retval; +} + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); + if (err) + pr_err("Hugetlb: Unable to add hstate %s", h->name); + } +} + +#ifdef CONFIG_NUMA + +/* + * node_hstate/s - associate per node hstate attributes, via their kobjects, + * with node devices in node_devices[] using a parallel array. The array + * index of a node device or _hstate == node id. + * This is here to avoid any static dependency of the node device driver, in + * the base kernel, on the hugetlb module. 
+ */ +struct node_hstate { + struct kobject *hugepages_kobj; + struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; +}; +struct node_hstate node_hstates[MAX_NUMNODES]; + +/* + * A subset of global hstate attributes for node devices + */ +static struct attribute *per_node_hstate_attrs[] = { + &nr_hugepages_attr.attr, + &free_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct attribute_group per_node_hstate_attr_group = { + .attrs = per_node_hstate_attrs, +}; + +/* + * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. + * Returns node id via non-NULL nidp. + */ +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) { + struct node_hstate *nhs = &node_hstates[nid]; + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (nhs->hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = nid; + return &hstates[i]; + } + } + + BUG(); + return NULL; +} + +/* + * Unregister hstate attributes from a single node device. + * No-op if no hstate attributes attached. + */ +static void hugetlb_unregister_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->dev.id]; + + if (!nhs->hugepages_kobj) + return; /* no hstate attributes */ + + for_each_hstate(h) { + int idx = hstate_index(h); + if (nhs->hstate_kobjs[idx]) { + kobject_put(nhs->hstate_kobjs[idx]); + nhs->hstate_kobjs[idx] = NULL; + } + } + + kobject_put(nhs->hugepages_kobj); + nhs->hugepages_kobj = NULL; +} + +/* + * hugetlb module exit: unregister hstate attributes from node devices + * that have them. + */ +static void hugetlb_unregister_all_nodes(void) +{ + int nid; + + /* + * disable node device registrations. + */ + register_hugetlbfs_with_node(NULL, NULL); + + /* + * remove hstate attributes from any nodes that have them. 
+ */ + for (nid = 0; nid < nr_node_ids; nid++) + hugetlb_unregister_node(node_devices[nid]); +} + +/* + * Register hstate attributes for a single node device. + * No-op if attributes already registered. + */ +static void hugetlb_register_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->dev.id]; + int err; + + if (nhs->hugepages_kobj) + return; /* already allocated */ + + nhs->hugepages_kobj = kobject_create_and_add("hugepages", + &node->dev.kobj); + if (!nhs->hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, + nhs->hstate_kobjs, + &per_node_hstate_attr_group); + if (err) { + pr_err("Hugetlb: Unable to add hstate %s for node %d\n", + h->name, node->dev.id); + hugetlb_unregister_node(node); + break; + } + } +} + +/* + * hugetlb init time: register hstate attributes for all registered node + * devices of nodes that have memory. All on-line nodes should have + * registered their associated device by this time. + */ +static void __init hugetlb_register_all_nodes(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + struct node *node = node_devices[nid]; + if (node->dev.id == nid) + hugetlb_register_node(node); + } + + /* + * Let the node device driver know we're here so it can + * [un]register hstate attributes on node hotplug. 
+ */ + register_hugetlbfs_with_node(hugetlb_register_node, + hugetlb_unregister_node); +} +#else /* !CONFIG_NUMA */ + +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + BUG(); + if (nidp) + *nidp = -1; + return NULL; +} + +static void hugetlb_unregister_all_nodes(void) { } + +static void hugetlb_register_all_nodes(void) { } + +#endif + +static void __exit hugetlb_exit(void) +{ + struct hstate *h; + + hugetlb_unregister_all_nodes(); + + for_each_hstate(h) { + kobject_put(hstate_kobjs[hstate_index(h)]); + } + + kobject_put(hugepages_kobj); + kfree(htlb_fault_mutex_table); +} +module_exit(hugetlb_exit); + +static int __init hugetlb_init(void) +{ + int i; + + if (!hugepages_supported()) + return 0; + + if (!size_to_hstate(default_hstate_size)) { + default_hstate_size = HPAGE_SIZE; + if (!size_to_hstate(default_hstate_size)) + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + } + default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); + if (default_hstate_max_huge_pages) + default_hstate.max_huge_pages = default_hstate_max_huge_pages; + + hugetlb_init_hstates(); + gather_bootmem_prealloc(); + report_hugepages(); + + hugetlb_sysfs_init(); + hugetlb_register_all_nodes(); + hugetlb_cgroup_file_init(); + +#ifdef CONFIG_SMP + num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); +#else + num_fault_mutexes = 1; +#endif + htlb_fault_mutex_table = + kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); + BUG_ON(!htlb_fault_mutex_table); + + for (i = 0; i < num_fault_mutexes; i++) + mutex_init(&htlb_fault_mutex_table[i]); + return 0; +} +module_init(hugetlb_init); + +/* Should be called on processing a hugepagesz=... 
option */ +void __init hugetlb_add_hstate(unsigned order) +{ + struct hstate *h; + unsigned long i; + + if (size_to_hstate(PAGE_SIZE << order)) { + pr_warning("hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order == 0); + h = &hstates[hugetlb_max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + h->nr_huge_pages = 0; + h->free_huge_pages = 0; + for (i = 0; i < MAX_NUMNODES; ++i) + INIT_LIST_HEAD(&h->hugepage_freelists[i]); + INIT_LIST_HEAD(&h->hugepage_activelist); + h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); + h->next_nid_to_free = first_node(node_states[N_MEMORY]); + snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", + huge_page_size(h)/1024); + + parsed_hstate = h; +} + +static int __init hugetlb_nrpages_setup(char *s) +{ + unsigned long *mhp; + static unsigned long *last_mhp; + + /* + * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, + * so this hugepages= parameter goes to the "default hstate". + */ + if (!hugetlb_max_hstate) + mhp = &default_hstate_max_huge_pages; + else + mhp = &parsed_hstate->max_huge_pages; + + if (mhp == last_mhp) { + pr_warning("hugepages= specified twice without " + "interleaving hugepagesz=, ignoring\n"); + return 1; + } + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + + /* + * Global state is always initialized later in hugetlb_init. + * But we need to allocate >= MAX_ORDER hstates here early to still + * use the bootmem allocator. 
+ */ + if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) + hugetlb_hstate_alloc_pages(parsed_hstate); + + last_mhp = mhp; + + return 1; +} +__setup("hugepages=", hugetlb_nrpages_setup); + +static int __init hugetlb_default_setup(char *s) +{ + default_hstate_size = memparse(s, &s); + return 1; +} +__setup("default_hugepagesz=", hugetlb_default_setup); + +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + +#ifdef CONFIG_SYSCTL +static int hugetlb_sysctl_handler_common(bool obey_mempolicy, + struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + struct hstate *h = &default_hstate; + unsigned long tmp = h->max_huge_pages; + int ret; + + if (!hugepages_supported()) + return -ENOTSUPP; + + table->data = &tmp; + table->maxlen = sizeof(unsigned long); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; + + if (write) + ret = __nr_hugepages_store_common(obey_mempolicy, h, + NUMA_NO_NODE, tmp, *length); +out: + return ret; +} + +int hugetlb_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + + return hugetlb_sysctl_handler_common(false, table, write, + buffer, length, ppos); +} + +#ifdef CONFIG_NUMA +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + return hugetlb_sysctl_handler_common(true, table, write, + buffer, length, ppos); +} +#endif /* CONFIG_NUMA */ + +int hugetlb_overcommit_handler(struct ctl_table *table, int write, + void __user *buffer, + size_t *length, loff_t *ppos) +{ + struct hstate *h = &default_hstate; + unsigned long tmp; + int ret; + + if (!hugepages_supported()) + return -ENOTSUPP; + + tmp = h->nr_overcommit_huge_pages; + + if (write && hstate_is_gigantic(h)) + return -EINVAL; + + 
table->data = &tmp; + table->maxlen = sizeof(unsigned long); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; + + if (write) { + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = tmp; + spin_unlock(&hugetlb_lock); + } +out: + return ret; +} + +#endif /* CONFIG_SYSCTL */ + +void hugetlb_report_meminfo(struct seq_file *m) +{ + struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return; + seq_printf(m, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", + h->nr_huge_pages, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); +} + +int hugetlb_report_node_meminfo(int nid, char *buf) +{ + struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return 0; + return sprintf(buf, + "Node %d HugePages_Total: %5u\n" + "Node %d HugePages_Free: %5u\n" + "Node %d HugePages_Surp: %5u\n", + nid, h->nr_huge_pages_node[nid], + nid, h->free_huge_pages_node[nid], + nid, h->surplus_huge_pages_node[nid]); +} + +void hugetlb_show_meminfo(void) +{ + struct hstate *h; + int nid; + + if (!hugepages_supported()) + return; + + for_each_node_state(nid, N_MEMORY) + for_each_hstate(h) + pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", + nid, + h->nr_huge_pages_node[nid], + h->free_huge_pages_node[nid], + h->surplus_huge_pages_node[nid], + 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); +} + +/* Return the number pages of memory we physically have, in PAGE_SIZE units. 
*/ +unsigned long hugetlb_total_pages(void) +{ + struct hstate *h; + unsigned long nr_total_pages = 0; + + for_each_hstate(h) + nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); + return nr_total_pages; +} + +static int hugetlb_acct_memory(struct hstate *h, long delta) +{ + int ret = -ENOMEM; + + spin_lock(&hugetlb_lock); + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + if (delta > 0) { + if (gather_surplus_pages(h, delta) < 0) + goto out; + + if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { + return_unused_surplus_pages(h, delta); + goto out; + } + } + + ret = 0; + if (delta < 0) + return_unused_surplus_pages(h, (unsigned long) -delta); + +out: + spin_unlock(&hugetlb_lock); + return ret; +} + +static void hugetlb_vm_op_open(struct vm_area_struct *vma) +{ + struct resv_map *resv = vma_resv_map(vma); + + /* + * This new VMA should share its siblings reservation map if present. + * The VMA will only ever have a valid reservation map pointer where + * it is being copied for another still existing VMA. 
As that VMA + * has a reference to the reservation map it cannot disappear until + * after this open call completes. It is therefore safe to take a + * new reference here without additional locking. + */ + if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + kref_get(&resv->refs); +} + +static void hugetlb_vm_op_close(struct vm_area_struct *vma) +{ + struct hstate *h = hstate_vma(vma); + struct resv_map *resv = vma_resv_map(vma); + struct hugepage_subpool *spool = subpool_vma(vma); + unsigned long reserve, start, end; + long gbl_reserve; + + if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return; + + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); + + reserve = (end - start) - region_count(resv, start, end); + + kref_put(&resv->refs, resv_map_release); + + if (reserve) { + /* + * Decrement reserve counts. The global reserve count may be + * adjusted if the subpool has a minimum size. + */ + gbl_reserve = hugepage_subpool_put_pages(spool, reserve); + hugetlb_acct_memory(h, -gbl_reserve); + } +} + +/* + * We cannot handle pagefaults against hugetlb pages at all. They cause + * handle_mm_fault() to try to instantiate regular-sized pages in the + * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get + * this far. 
+ */ +static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + BUG(); + return 0; +} + +const struct vm_operations_struct hugetlb_vm_ops = { + .fault = hugetlb_vm_op_fault, + .open = hugetlb_vm_op_open, + .close = hugetlb_vm_op_close, +}; + +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, + int writable) +{ + pte_t entry; + + if (writable) { + entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, + vma->vm_page_prot))); + } else { + entry = huge_pte_wrprotect(mk_huge_pte(page, + vma->vm_page_prot)); + } + entry = pte_mkyoung(entry); + entry = pte_mkhuge(entry); + entry = arch_make_huge_pte(entry, vma, page, writable); + + return entry; +} + +static void set_huge_ptep_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t entry; + + entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); + if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) + update_mmu_cache(vma, address, ptep); +} + +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) + return 1; + else + return 0; +} + +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) + return 1; + else + return 0; +} + +int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pte_t *src_pte, *dst_pte, entry; + struct page *ptepage; + unsigned long addr; + int cow; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + int ret = 0; + + cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + + mmun_start = 
vma->vm_start; + mmun_end = vma->vm_end; + if (cow) + mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); + + for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { + spinlock_t *src_ptl, *dst_ptl; + src_pte = huge_pte_offset(src, addr); + if (!src_pte) + continue; + dst_pte = huge_pte_alloc(dst, addr, sz); + if (!dst_pte) { + ret = -ENOMEM; + break; + } + + /* If the pagetables are shared don't copy or take references */ + if (dst_pte == src_pte) + continue; + + dst_ptl = huge_pte_lock(h, dst, dst_pte); + src_ptl = huge_pte_lockptr(h, src, src_pte); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + entry = huge_ptep_get(src_pte); + if (huge_pte_none(entry)) { /* skip none entry */ + ; + } else if (unlikely(is_hugetlb_entry_migration(entry) || + is_hugetlb_entry_hwpoisoned(entry))) { + swp_entry_t swp_entry = pte_to_swp_entry(entry); + + if (is_write_migration_entry(swp_entry) && cow) { + /* + * COW mappings require pages in both + * parent and child to be set to read. 
+ */ + make_migration_entry_read(&swp_entry); + entry = swp_entry_to_pte(swp_entry); + set_huge_pte_at(src, addr, src_pte, entry); + } + set_huge_pte_at(dst, addr, dst_pte, entry); + } else { + if (cow) { + huge_ptep_set_wrprotect(src, addr, src_pte); + mmu_notifier_invalidate_range(src, mmun_start, + mmun_end); + } + entry = huge_ptep_get(src_pte); + ptepage = pte_page(entry); + get_page(ptepage); + page_dup_rmap(ptepage); + set_huge_pte_at(dst, addr, dst_pte, entry); + } + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + } + + if (cow) + mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); + + return ret; +} + +void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page) +{ + int force_flush = 0; + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *ptep; + pte_t pte; + spinlock_t *ptl; + struct page *page; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + const unsigned long mmun_start = start; /* For mmu_notifiers */ + const unsigned long mmun_end = end; /* For mmu_notifiers */ + + WARN_ON(!is_vm_hugetlb_page(vma)); + BUG_ON(start & ~huge_page_mask(h)); + BUG_ON(end & ~huge_page_mask(h)); + + tlb_start_vma(tlb, vma); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + address = start; +again: + for (; address < end; address += sz) { + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + + ptl = huge_pte_lock(h, mm, ptep); + if (huge_pmd_unshare(mm, &address, ptep)) + goto unlock; + + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte)) + goto unlock; + + /* + * Migrating hugepage or HWPoisoned hugepage is already + * unmapped and its refcount is dropped, so just clear pte here. 
+ */ + if (unlikely(!pte_present(pte))) { + huge_pte_clear(mm, address, ptep); + goto unlock; + } + + page = pte_page(pte); + /* + * If a reference page is supplied, it is because a specific + * page is being unmapped, not a range. Ensure the page we + * are about to unmap is the actual page of interest. + */ + if (ref_page) { + if (page != ref_page) + goto unlock; + + /* + * Mark the VMA as having unmapped its page so that + * future faults in this VMA will fail rather than + * looking like data was lost + */ + set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); + } + + pte = huge_ptep_get_and_clear(mm, address, ptep); + tlb_remove_tlb_entry(tlb, ptep, address); + if (huge_pte_dirty(pte)) + set_page_dirty(page); + + page_remove_rmap(page); + force_flush = !__tlb_remove_page(tlb, page); + if (force_flush) { + address += sz; + spin_unlock(ptl); + break; + } + /* Bail out after unmapping reference page if supplied */ + if (ref_page) { + spin_unlock(ptl); + break; + } +unlock: + spin_unlock(ptl); + } + /* + * mmu_gather ran out of room to batch pages, we break out of + * the PTE lock to avoid doing the potential expensive TLB invalidate + * and page-free while holding it. + */ + if (force_flush) { + force_flush = 0; + tlb_flush_mmu(tlb); + if (address < end && !ref_page) + goto again; + } + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + tlb_end_vma(tlb, vma); +} + +void __unmap_hugepage_range_final(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page) +{ + __unmap_hugepage_range(tlb, vma, start, end, ref_page); + + /* + * Clear this flag so that x86's huge_pmd_share page_table_shareable + * test will fail on a vma being torn down, and not grab a page table + * on its way out. We're lucky that the flag has such an appropriate + * name, and can in fact be safely cleared here. 
We could clear it + * before the __unmap_hugepage_range above, but all that's necessary + * is to clear it before releasing the i_mmap_rwsem. This works + * because in the context this is called, the VMA is about to be + * destroyed and the i_mmap_rwsem is held. + */ + vma->vm_flags &= ~VM_MAYSHARE; +} + +void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page) +{ + struct mm_struct *mm; + struct mmu_gather tlb; + + mm = vma->vm_mm; + + tlb_gather_mmu(&tlb, mm, start, end); + __unmap_hugepage_range(&tlb, vma, start, end, ref_page); + tlb_finish_mmu(&tlb, start, end); +} + +/* + * This is called when the original mapper is failing to COW a MAP_PRIVATE + * mappping it owns the reserve page for. The intention is to unmap the page + * from other VMAs and let the children be SIGKILLed if they are faulting the + * same region. + */ +static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) +{ + struct hstate *h = hstate_vma(vma); + struct vm_area_struct *iter_vma; + struct address_space *mapping; + pgoff_t pgoff; + + /* + * vm_pgoff is in PAGE_SIZE units, hence the different calculation + * from page cache lookup which is in HPAGE_SIZE units. + */ + address = address & huge_page_mask(h); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + mapping = file_inode(vma->vm_file)->i_mapping; + + /* + * Take the mapping lock for the duration of the table walk. As + * this mapping should be shared between all the VMAs, + * __unmap_hugepage_range() is called as the lock is already held + */ + i_mmap_lock_write(mapping); + vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { + /* Do not unmap the current VMA */ + if (iter_vma == vma) + continue; + + /* + * Unmap the page from other VMAs without their own reserves. + * They get marked to be SIGKILLed if they fault in these + * areas. 
This is because a future no-page fault on this VMA + * could insert a zeroed page instead of the data existing + * from the time of fork. This would look like data corruption + */ + if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) + unmap_hugepage_range(iter_vma, address, + address + huge_page_size(h), page); + } + i_mmap_unlock_write(mapping); +} + +/* + * Hugetlb_cow() should be called with page lock of the original hugepage held. + * Called with hugetlb_instantiation_mutex held and pte_page locked so we + * cannot race with other handlers or page migration. + * Keep the pte_same checks anyway to make transition from the mutex easier. + */ +static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, pte_t pte, + struct page *pagecache_page, spinlock_t *ptl) +{ + struct hstate *h = hstate_vma(vma); + struct page *old_page, *new_page; + int ret = 0, outside_reserve = 0; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + old_page = pte_page(pte); + +retry_avoidcopy: + /* If no-one else is actually using this page, avoid the copy + * and just make the page writable */ + if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { + page_move_anon_rmap(old_page, vma, address); + set_huge_ptep_writable(vma, address, ptep); + return 0; + } + + /* + * If the process that created a MAP_PRIVATE mapping is about to + * perform a COW due to a shared page count, attempt to satisfy + * the allocation without using the existing reserves. The pagecache + * page is used to determine if the reserve at this address was + * consumed or not. If reserves were used, a partial faulted mapping + * at the time of fork() could consume its reserves on COW instead + * of the full address range. 
+ */ + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + old_page != pagecache_page) + outside_reserve = 1; + + page_cache_get(old_page); + + /* + * Drop page table lock as buddy allocator may be called. It will + * be acquired again before returning to the caller, as expected. + */ + spin_unlock(ptl); + new_page = alloc_huge_page(vma, address, outside_reserve); + + if (IS_ERR(new_page)) { + /* + * If a process owning a MAP_PRIVATE mapping fails to COW, + * it is due to references held by a child and an insufficient + * huge page pool. To guarantee the original mappers + * reliability, unmap the page from child processes. The child + * may get SIGKILLed if it later faults. + */ + if (outside_reserve) { + page_cache_release(old_page); + BUG_ON(huge_pte_none(pte)); + unmap_ref_private(mm, vma, old_page, address); + BUG_ON(huge_pte_none(pte)); + spin_lock(ptl); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page table + * lock, and our job is done. + */ + return 0; + } + + ret = (PTR_ERR(new_page) == -ENOMEM) ? + VM_FAULT_OOM : VM_FAULT_SIGBUS; + goto out_release_old; + } + + /* + * When the original hugepage is shared one, it does not have + * anon_vma prepared. 
+ */ + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto out_release_all; + } + + copy_user_huge_page(new_page, old_page, address, vma, + pages_per_huge_page(h)); + __SetPageUptodate(new_page); + set_page_huge_active(new_page); + + mmun_start = address & huge_page_mask(h); + mmun_end = mmun_start + huge_page_size(h); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + /* + * Retake the page table lock to check for racing updates + * before the page tables are altered + */ + spin_lock(ptl); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { + ClearPagePrivate(new_page); + + /* Break COW */ + huge_ptep_clear_flush(vma, address, ptep); + mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); + set_huge_pte_at(mm, address, ptep, + make_huge_pte(vma, new_page, 1)); + page_remove_rmap(old_page); + hugepage_add_new_anon_rmap(new_page, vma, address); + /* Make the old page be freed below */ + new_page = old_page; + } + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); +out_release_all: + page_cache_release(new_page); +out_release_old: + page_cache_release(old_page); + + spin_lock(ptl); /* Caller expects lock to be held */ + return ret; +} + +/* Return the pagecache page at a given address within a VMA */ +static struct page *hugetlbfs_pagecache_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct address_space *mapping; + pgoff_t idx; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + return find_lock_page(mapping, idx); +} + +/* + * Return whether there is a pagecache page to back given address within VMA. + * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 
+ */ +static bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct address_space *mapping; + pgoff_t idx; + struct page *page; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + page = find_get_page(mapping, idx); + if (page) + put_page(page); + return page != NULL; +} + +static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) +{ + struct hstate *h = hstate_vma(vma); + int ret = VM_FAULT_SIGBUS; + int anon_rmap = 0; + unsigned long size; + struct page *page; + pte_t new_pte; + spinlock_t *ptl; + + /* + * Currently, we are forced to kill the process in the event the + * original mapper has unmapped pages from the child due to a failed + * COW. Warn that such a situation has occurred as it may not be obvious + */ + if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { + pr_warning("PID %d killed due to inadequate hugepage pool\n", + current->pid); + return ret; + } + + /* + * Use page lock to guard against racing truncation + * before we get page_table_lock. 
+ */ +retry: + page = find_lock_page(mapping, idx); + if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; + page = alloc_huge_page(vma, address, 0); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; + goto out; + } + clear_huge_page(page, address, pages_per_huge_page(h)); + __SetPageUptodate(page); + set_page_huge_active(page); + + if (vma->vm_flags & VM_MAYSHARE) { + int err; + struct inode *inode = mapping->host; + + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + if (err) { + put_page(page); + if (err == -EEXIST) + goto retry; + goto out; + } + ClearPagePrivate(page); + + spin_lock(&inode->i_lock); + inode->i_blocks += blocks_per_huge_page(h); + spin_unlock(&inode->i_lock); + } else { + lock_page(page); + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + anon_rmap = 1; + } + } else { + /* + * If memory error occurs between mmap() and fault, some process + * don't have hwpoisoned swap entry for errored virtual address. + * So we need to block hugepage fault by PG_hwpoison bit check. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON | + VM_FAULT_SET_HINDEX(hstate_index(h)); + goto backout_unlocked; + } + } + + /* + * If we are going to COW a private mapping later, we examine the + * pending reservations for this page now. This will ensure that + * any allocations necessary to record that reservation occur outside + * the spinlock. 
+ */ + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto backout; + + ret = 0; + if (!huge_pte_none(huge_ptep_get(ptep))) + goto backout; + + if (anon_rmap) { + ClearPagePrivate(page); + hugepage_add_new_anon_rmap(page, vma, address); + } else + page_dup_rmap(page); + new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, address, ptep, new_pte); + + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + /* Optimization, do the COW without a second fault */ + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); + } + + spin_unlock(ptl); + unlock_page(page); +out: + return ret; + +backout: + spin_unlock(ptl); +backout_unlocked: + unlock_page(page); + put_page(page); + goto out; +} + +#ifdef CONFIG_SMP +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + unsigned long key[2]; + u32 hash; + + if (vma->vm_flags & VM_SHARED) { + key[0] = (unsigned long) mapping; + key[1] = idx; + } else { + key[0] = (unsigned long) mm; + key[1] = address >> huge_page_shift(h); + } + + hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); + + return hash & (num_fault_mutexes - 1); +} +#else +/* + * For uniprocesor systems we always use a single mutex, so just + * return 0 and avoid the hashing overhead. 
+ */ +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + return 0; +} +#endif + +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + pte_t *ptep, entry; + spinlock_t *ptl; + int ret; + u32 hash; + pgoff_t idx; + struct page *page = NULL; + struct page *pagecache_page = NULL; + struct hstate *h = hstate_vma(vma); + struct address_space *mapping; + int need_wait_lock = 0; + + address &= huge_page_mask(h); + + ptep = huge_pte_offset(mm, address); + if (ptep) { + entry = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_migration(entry))) { + migration_entry_wait_huge(vma, mm, ptep); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); + } + + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + /* + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate + * the same page in the page cache. + */ + hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); + mutex_lock(&htlb_fault_mutex_table[hash]); + + entry = huge_ptep_get(ptep); + if (huge_pte_none(entry)) { + ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); + goto out_mutex; + } + + ret = 0; + + /* + * entry could be a migration/hwpoison entry at this point, so this + * check prevents the kernel from going below assuming that we have + * a active hugepage in pagecache. This goto expects the 2nd page fault, + * and is_hugetlb_entry_(migration|hwpoisoned) check will properly + * handle it. 
+ */ + if (!pte_present(entry)) + goto out_mutex; + + /* + * If we are going to COW the mapping later, we examine the pending + * reservations for this page now. This will ensure that any + * allocations necessary to record that reservation occur outside the + * spinlock. For private mappings, we also lookup the pagecache + * page now as it is used to determine if a reservation has been + * consumed. + */ + if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto out_mutex; + } + + if (!(vma->vm_flags & VM_MAYSHARE)) + pagecache_page = hugetlbfs_pagecache_page(h, + vma, address); + } + + ptl = huge_pte_lock(h, mm, ptep); + + /* Check for a racing update before calling hugetlb_cow */ + if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + goto out_ptl; + + /* + * hugetlb_cow() requires page locks of pte_page(entry) and + * pagecache_page, so here we need take the former one + * when page != pagecache_page or !pagecache_page. + */ + page = pte_page(entry); + if (page != pagecache_page) + if (!trylock_page(page)) { + need_wait_lock = 1; + goto out_ptl; + } + + get_page(page); + + if (flags & FAULT_FLAG_WRITE) { + if (!huge_pte_write(entry)) { + ret = hugetlb_cow(mm, vma, address, ptep, entry, + pagecache_page, ptl); + goto out_put_page; + } + entry = huge_pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + if (huge_ptep_set_access_flags(vma, address, ptep, entry, + flags & FAULT_FLAG_WRITE)) + update_mmu_cache(vma, address, ptep); +out_put_page: + if (page != pagecache_page) + unlock_page(page); + put_page(page); +out_ptl: + spin_unlock(ptl); + + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } +out_mutex: + mutex_unlock(&htlb_fault_mutex_table[hash]); + /* + * Generally it's safe to hold refcount during waiting page lock. 
But + * here we just wait to defer the next page fault to avoid busy loop and + * the page is not used after unlocked before returning from the current + * page fault. So we are safe from accessing freed page, even if we wait + * here without taking refcount. + */ + if (need_wait_lock) + wait_on_page_locked(page); + return ret; +} + +long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page **pages, struct vm_area_struct **vmas, + unsigned long *position, unsigned long *nr_pages, + long i, unsigned int flags) +{ + unsigned long pfn_offset; + unsigned long vaddr = *position; + unsigned long remainder = *nr_pages; + struct hstate *h = hstate_vma(vma); + + while (vaddr < vma->vm_end && remainder) { + pte_t *pte; + spinlock_t *ptl = NULL; + int absent; + struct page *page; + + /* + * If we have a pending SIGKILL, don't keep faulting pages and + * potentially allocating memory. + */ + if (unlikely(fatal_signal_pending(current))) { + remainder = 0; + break; + } + + /* + * Some archs (sparc64, sh*) have multiple pte_ts to + * each hugepage. We have to make sure we get the + * first, for the page indexing below to work. + * + * Note that page table lock is not held when pte is null. + */ + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + if (pte) + ptl = huge_pte_lock(h, mm, pte); + absent = !pte || huge_pte_none(huge_ptep_get(pte)); + + /* + * When coredumping, it suits get_dump_page if we just return + * an error where there's an empty slot with no huge pagecache + * to back it. This way, we avoid allocating a hugepage, and + * the sparse dumpfile avoids allocating disk blocks, but its + * huge holes still show up with zeroes where they need to be. 
+ */ + if (absent && (flags & FOLL_DUMP) && + !hugetlbfs_pagecache_present(h, vma, vaddr)) { + if (pte) + spin_unlock(ptl); + remainder = 0; + break; + } + + /* + * We need call hugetlb_fault for both hugepages under migration + * (in which case hugetlb_fault waits for the migration,) and + * hwpoisoned hugepages (in which case we need to prevent the + * caller from accessing to them.) In order to do this, we use + * here is_swap_pte instead of is_hugetlb_entry_migration and + * is_hugetlb_entry_hwpoisoned. This is because it simply covers + * both cases, and because we can't follow correct pages + * directly from any kind of swap entries. + */ + if (absent || is_swap_pte(huge_ptep_get(pte)) || + ((flags & FOLL_WRITE) && + !huge_pte_write(huge_ptep_get(pte)))) { + int ret; + + if (pte) + spin_unlock(ptl); + ret = hugetlb_fault(mm, vma, vaddr, + (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); + if (!(ret & VM_FAULT_ERROR)) + continue; + + remainder = 0; + break; + } + + pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; + page = pte_page(huge_ptep_get(pte)); +same_page: + if (pages) { + pages[i] = mem_map_offset(page, pfn_offset); + get_page_foll(pages[i]); + } + + if (vmas) + vmas[i] = vma; + + vaddr += PAGE_SIZE; + ++pfn_offset; + --remainder; + ++i; + if (vaddr < vma->vm_end && remainder && + pfn_offset < pages_per_huge_page(h)) { + /* + * We use pfn_offset to avoid touching the pageframes + * of this compound page. + */ + goto same_page; + } + spin_unlock(ptl); + } + *nr_pages = remainder; + *position = vaddr; + + return i ? 
i : -EFAULT; +} + +unsigned long hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long start = address; + pte_t *ptep; + pte_t pte; + struct hstate *h = hstate_vma(vma); + unsigned long pages = 0; + + BUG_ON(address >= end); + flush_cache_range(vma, address, end); + + mmu_notifier_invalidate_range_start(mm, start, end); + i_mmap_lock_write(vma->vm_file->f_mapping); + for (; address < end; address += huge_page_size(h)) { + spinlock_t *ptl; + ptep = huge_pte_offset(mm, address); + if (!ptep) + continue; + ptl = huge_pte_lock(h, mm, ptep); + if (huge_pmd_unshare(mm, &address, ptep)) { + pages++; + spin_unlock(ptl); + continue; + } + pte = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { + spin_unlock(ptl); + continue; + } + if (unlikely(is_hugetlb_entry_migration(pte))) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if (is_write_migration_entry(entry)) { + pte_t newpte; + + make_migration_entry_read(&entry); + newpte = swp_entry_to_pte(entry); + set_huge_pte_at(mm, address, ptep, newpte); + pages++; + } + spin_unlock(ptl); + continue; + } + if (!huge_pte_none(pte)) { + pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = pte_mkhuge(huge_pte_modify(pte, newprot)); + pte = arch_make_huge_pte(pte, vma, NULL, 0); + set_huge_pte_at(mm, address, ptep, pte); + pages++; + } + spin_unlock(ptl); + } + /* + * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare + * may have cleared our pud entry and done put_page on the page table: + * once we release i_mmap_rwsem, another task can do the final put_page + * and that page table be reused and filled with junk. 
+ */ + flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range(mm, start, end); + i_mmap_unlock_write(vma->vm_file->f_mapping); + mmu_notifier_invalidate_range_end(mm, start, end); + + return pages << h->order; +} + +int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + long ret, chg; + struct hstate *h = hstate_inode(inode); + struct hugepage_subpool *spool = subpool_inode(inode); + struct resv_map *resv_map; + long gbl_reserve; + + /* + * Only apply hugepage reservation if asked. At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page + * without using reserves + */ + if (vm_flags & VM_NORESERVE) + return 0; + + /* + * Shared mappings base their reservation on the number of pages that + * are already allocated on behalf of the file. Private mappings need + * to reserve the full area even if read-only as mprotect() may be + * called to make the mapping read-write. Assume !vma is a shm mapping + */ + if (!vma || vma->vm_flags & VM_MAYSHARE) { + resv_map = inode_resv_map(inode); + + chg = region_chg(resv_map, from, to); + + } else { + resv_map = resv_map_alloc(); + if (!resv_map) + return -ENOMEM; + + chg = to - from; + + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + + if (chg < 0) { + ret = chg; + goto out_err; + } + + /* + * There must be enough pages in the subpool for the mapping. If + * the subpool has a minimum size, there may be some global + * reservations already in place (gbl_reserve). + */ + gbl_reserve = hugepage_subpool_get_pages(spool, chg); + if (gbl_reserve < 0) { + ret = -ENOSPC; + goto out_err; + } + + /* + * Check enough hugepages are available for the reservation. 
+ * Hand the pages back to the subpool if there are not + */ + ret = hugetlb_acct_memory(h, gbl_reserve); + if (ret < 0) { + /* put back original number of pages, chg */ + (void)hugepage_subpool_put_pages(spool, chg); + goto out_err; + } + + /* + * Account for the reservations made. Shared mappings record regions + * that have reservations as they are shared by multiple VMAs. + * When the last VMA disappears, the region map says how much + * the reservation was and the page cache tells how much of + * the reservation was consumed. Private mappings are per-VMA and + * only the consumed reservations are tracked. When the VMA + * disappears, the original reservation is the VMA size and the + * consumed reservations are stored in the map. Hence, nothing + * else has to be done for private mappings here + */ + if (!vma || vma->vm_flags & VM_MAYSHARE) + region_add(resv_map, from, to); + return 0; +out_err: + if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + kref_put(&resv_map->refs, resv_map_release); + return ret; +} + +void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) +{ + struct hstate *h = hstate_inode(inode); + struct resv_map *resv_map = inode_resv_map(inode); + long chg = 0; + struct hugepage_subpool *spool = subpool_inode(inode); + long gbl_reserve; + + if (resv_map) + chg = region_truncate(resv_map, offset); + spin_lock(&inode->i_lock); + inode->i_blocks -= (blocks_per_huge_page(h) * freed); + spin_unlock(&inode->i_lock); + + /* + * If the subpool has a minimum size, the number of global + * reservations to be released may be adjusted. 
+ */ + gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); + hugetlb_acct_memory(h, -gbl_reserve); +} + +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +static unsigned long page_table_shareable(struct vm_area_struct *svma, + struct vm_area_struct *vma, + unsigned long addr, pgoff_t idx) +{ + unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + + svma->vm_start; + unsigned long sbase = saddr & PUD_MASK; + unsigned long s_end = sbase + PUD_SIZE; + + /* Allow segments to share if only one is marked locked */ + unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; + unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; + + /* + * match the virtual addresses, permission and the alignment of the + * page table page. + */ + if (pmd_index(addr) != pmd_index(saddr) || + vm_flags != svm_flags || + sbase < svma->vm_start || svma->vm_end < s_end) + return 0; + + return saddr; +} + +static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long base = addr & PUD_MASK; + unsigned long end = base + PUD_SIZE; + + /* + * check on proper vm_flags and page table alignment + */ + if (vma->vm_flags & VM_MAYSHARE && + vma->vm_start <= base && end <= vma->vm_end) + return 1; + return 0; +} + +/* + * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() + * and returns the corresponding pte. While this is not necessary for the + * !shared pmd case because we can allocate the pmd later as well, it makes the + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_rwsem section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. 
+ *
+ * The page reference and the nr_pmds increment taken here are the ones that
+ * huge_pmd_unshare() later drops with put_page()/mm_dec_nr_pmds(), so nr_pmds
+ * is bumped exactly once, and only when the shared pmd page really gets
+ * installed in @pud.
+ *
+ * Returns whatever pmd_alloc() yields for @addr, cast to pte_t *.
+ */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+	struct vm_area_struct *vma = find_vma(mm, addr);
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+			vma->vm_pgoff;
+	struct vm_area_struct *svma;
+	unsigned long saddr;
+	pte_t *spte = NULL;
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	/* Not a candidate for sharing: just allocate a private pmd. */
+	if (!vma_shareable(vma, addr))
+		return (pte_t *)pmd_alloc(mm, pud, addr);
+
+	i_mmap_lock_write(mapping);
+	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+		if (svma == vma)
+			continue;
+
+		saddr = page_table_shareable(svma, vma, addr, idx);
+		if (saddr) {
+			spte = huge_pte_offset(svma->vm_mm, saddr);
+			if (spte) {
+				/*
+				 * Pin the candidate page table.  nr_pmds is
+				 * deliberately NOT touched yet: we do not
+				 * know whether we will end up using it.
+				 */
+				get_page(virt_to_page(spte));
+				break;
+			}
+		}
+	}
+
+	if (!spte)
+		goto out;
+
+	ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
+	spin_lock(ptl);
+	if (pud_none(*pud)) {
+		/* We won: install the shared pmd page and account for it. */
+		pud_populate(mm, pud,
+				(pmd_t *)((unsigned long)spte & PAGE_MASK));
+		mm_inc_nr_pmds(mm);
+	} else {
+		/*
+		 * Somebody populated the pud under us.  Drop the reference
+		 * taken above; the shared page is not used by this mm, so
+		 * nr_pmds must stay as it is (the old code incremented it a
+		 * second time here, leaving nr_pmds non-zero at mm teardown).
+		 */
+		put_page(virt_to_page(spte));
+	}
+	spin_unlock(ptl);
+out:
+	pte = (pte_t *)pmd_alloc(mm, pud, addr);
+	i_mmap_unlock_write(mapping);
+	return pte;
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with page table lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ *	    0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	/* Page holding the pmd table that @ptep points into. */
+	struct page *table_page = virt_to_page(ptep);
+	pgd_t *pgd = pgd_offset(mm, *addr);
+	pud_t *pud = pud_offset(pgd, *addr);
+
+	BUG_ON(page_count(table_page) == 0);
+
+	/* A single reference means nobody else shares this table. */
+	if (page_count(table_page) == 1)
+		return 0;
+
+	/* Detach the shared table from this mm and drop our accounting. */
+	pud_clear(pud);
+	put_page(table_page);
+	mm_dec_nr_pmds(mm);
+
+	/*
+	 * Move *addr to the last huge page covered by the cleared pud so a
+	 * caller stepping by huge page size continues after this range.
+	 */
+	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+	return 1;
+}
+#define want_pmd_share()	(1)
+#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+/* Without pmd sharing support there is never a shared table to hand out. */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+	return NULL;
+}
+#define want_pmd_share()	(0)
+#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+/*
+ * Allocate (or look up) the page table slot that will hold the huge pte for
+ * @addr.  @sz must be PUD_SIZE or PMD_SIZE.  Returns NULL when pud_alloc()
+ * returns NULL, otherwise the pud itself (PUD_SIZE) or the result of
+ * huge_pmd_share()/pmd_alloc() (PMD_SIZE).
+ */
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+			unsigned long addr, unsigned long sz)
+{
+	pgd_t *pgd = pgd_offset(mm, addr);
+	pud_t *pud = pud_alloc(mm, pgd, addr);
+	pte_t *pte;
+
+	if (!pud)
+		return NULL;
+
+	if (sz == PUD_SIZE) {
+		pte = (pte_t *)pud;
+	} else {
+		BUG_ON(sz != PMD_SIZE);
+		/* Only try to share when the pud is still empty. */
+		if (want_pmd_share() && pud_none(*pud))
+			pte = huge_pmd_share(mm, addr, pud);
+		else
+			pte = (pte_t *)pmd_alloc(mm, pud, addr);
+	}
+
+	/* A populated slot must already be a huge mapping. */
+	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+	return pte;
+}
+
+/*
+ * Walk the existing page tables for @addr without allocating anything.
+ * Returns the pud (as pte_t *) when it maps a huge page, the pmd slot when
+ * the pud is present but not huge, and NULL when the pgd or pud is absent.
+ */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd = pgd_offset(mm, addr);
+	pud_t *pud;
+
+	if (!pgd_present(*pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		return NULL;
+
+	if (pud_huge(*pud))
+		return (pte_t *)pud;
+
+	return (pte_t *)pmd_offset(pud, addr);
+}
+
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+/*
+ * These functions are overwritable if your architecture needs its own
+ * behavior.
+ */ +struct page * __weak +follow_huge_addr(struct mm_struct *mm, unsigned long address, + int write) +{ + return ERR_PTR(-EINVAL); +} + +struct page * __weak +follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int flags) +{ + struct page *page = NULL; + spinlock_t *ptl; +retry: + ptl = pmd_lockptr(mm, pmd); + spin_lock(ptl); + /* + * make sure that the address range covered by this pmd is not + * unmapped from other threads. + */ + if (!pmd_huge(*pmd)) + goto out; + if (pmd_present(*pmd)) { + page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); + if (flags & FOLL_GET) + get_page(page); + } else { + if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { + spin_unlock(ptl); + __migration_entry_wait(mm, (pte_t *)pmd, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). + */ + } +out: + spin_unlock(ptl); + return page; +} + +struct page * __weak +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int flags) +{ + if (flags & FOLL_GET) + return NULL; + + return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); +} + +#ifdef CONFIG_MEMORY_FAILURE + +/* + * This function is called from memory failure code. + * Assume the caller holds page lock of the head page. + */ +int dequeue_hwpoisoned_huge_page(struct page *hpage) +{ + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + int ret = -EBUSY; + + spin_lock(&hugetlb_lock); + /* + * Just checking !page_huge_active is not enough, because that could be + * an isolated/hwpoisoned hugepage (which have >0 refcount). + */ + if (!page_huge_active(hpage) && !page_count(hpage)) { + /* + * Hwpoisoned hugepage isn't linked to activelist or freelist, + * but dangling hpage->lru can trigger list-debug warnings + * (this happens when we call unpoison_memory() on it), + * so let it point to itself with list_del_init(). 
+ */ + list_del_init(&hpage->lru); + set_page_refcounted(hpage); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + ret = 0; + } + spin_unlock(&hugetlb_lock); + return ret; +} +#endif + +bool isolate_huge_page(struct page *page, struct list_head *list) +{ + bool ret = true; + + VM_BUG_ON_PAGE(!PageHead(page), page); + spin_lock(&hugetlb_lock); + if (!page_huge_active(page) || !get_page_unless_zero(page)) { + ret = false; + goto unlock; + } + clear_page_huge_active(page); + list_move_tail(&page->lru, list); +unlock: + spin_unlock(&hugetlb_lock); + return ret; +} + +void putback_active_hugepage(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHead(page), page); + spin_lock(&hugetlb_lock); + set_page_huge_active(page); + list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + spin_unlock(&hugetlb_lock); + put_page(page); +} diff --git a/kernel/mm/hugetlb_cgroup.c b/kernel/mm/hugetlb_cgroup.c new file mode 100644 index 000000000..6e0057439 --- /dev/null +++ b/kernel/mm/hugetlb_cgroup.c @@ -0,0 +1,422 @@ +/* + * + * Copyright IBM Corporation, 2012 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include +#include +#include + +struct hugetlb_cgroup { + struct cgroup_subsys_state css; + /* + * the counter to account for hugepages from hugetlb. 
+ */ + struct page_counter hugepage[HUGE_MAX_HSTATE]; +}; + +#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) +#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) +#define MEMFILE_ATTR(val) ((val) & 0xffff) + +static struct hugetlb_cgroup *root_h_cgroup __read_mostly; + +static inline +struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) +{ + return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; +} + +static inline +struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) +{ + return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); +} + +static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) +{ + return (h_cg == root_h_cgroup); +} + +static inline struct hugetlb_cgroup * +parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) +{ + return hugetlb_cgroup_from_css(h_cg->css.parent); +} + +static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) +{ + int idx; + + for (idx = 0; idx < hugetlb_max_hstate; idx++) { + if (page_counter_read(&h_cg->hugepage[idx])) + return true; + } + return false; +} + +static struct cgroup_subsys_state * +hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); + struct hugetlb_cgroup *h_cgroup; + int idx; + + h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); + if (!h_cgroup) + return ERR_PTR(-ENOMEM); + + if (parent_h_cgroup) { + for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) + page_counter_init(&h_cgroup->hugepage[idx], + &parent_h_cgroup->hugepage[idx]); + } else { + root_h_cgroup = h_cgroup; + for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) + page_counter_init(&h_cgroup->hugepage[idx], NULL); + } + return &h_cgroup->css; +} + +static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) +{ + struct hugetlb_cgroup *h_cgroup; + + h_cgroup = hugetlb_cgroup_from_css(css); + kfree(h_cgroup); +} + + +/* + * Should be called with hugetlb_lock held. 
+ * Since we are holding hugetlb_lock, pages cannot get moved from
+ * active list or uncharged from the cgroup, So no need to get
+ * page reference and test for page active here. This function
+ * cannot fail.
+ */
+static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
+				       struct page *page)
+{
+	unsigned int nr_pages;
+	struct page_counter *counter;
+	struct hugetlb_cgroup *page_hcg;
+	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
+
+	page_hcg = hugetlb_cgroup_from_page(page);
+	/*
+	 * We can have pages in active list without any cgroup
+	 * ie, hugepage with less than 3 pages. We can safely
+	 * ignore those pages.
+	 */
+	if (!page_hcg || page_hcg != h_cg)
+		goto out;
+
+	nr_pages = 1 << compound_order(page);
+	if (!parent) {
+		parent = root_h_cgroup;
+		/* root has no limit */
+		page_counter_charge(&parent->hugepage[idx], nr_pages);
+	}
+	counter = &h_cg->hugepage[idx];
+	/* Take the pages off the local counter */
+	page_counter_cancel(counter, nr_pages);
+
+	set_hugetlb_cgroup(page, parent);
+out:
+	return;
+}
+
+/*
+ * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
+ * the parent cgroup.
+ *
+ * Every hstate's active list is walked under hugetlb_lock and each page is
+ * handed to hugetlb_cgroup_move_parent().  The walk is repeated for as long
+ * as hugetlb_cgroup_have_usage() still reports usage, so the per-hstate
+ * counter index has to restart from 0 on every pass; otherwise a second pass
+ * would index h_cg->hugepage[] past the last hstate.
+ */
+static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
+	struct hstate *h;
+	struct page *page;
+	int idx;
+
+	do {
+		/* idx tracks the hstate being walked; reset it per pass. */
+		idx = 0;
+		for_each_hstate(h) {
+			spin_lock(&hugetlb_lock);
+			list_for_each_entry(page, &h->hugepage_activelist, lru)
+				hugetlb_cgroup_move_parent(idx, h_cg, page);
+
+			spin_unlock(&hugetlb_lock);
+			idx++;
+		}
+		cond_resched();
+	} while (hugetlb_cgroup_have_usage(h_cg));
+}
+
+int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+				 struct hugetlb_cgroup **ptr)
+{
+	int ret = 0;
+	struct page_counter *counter;
+	struct hugetlb_cgroup *h_cg = NULL;
+
+	if (hugetlb_cgroup_disabled())
+		goto done;
+	/*
+	 * We don't charge any cgroup if the compound page have less
+	 * than 3 pages.
+ */ + if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) + goto done; +again: + rcu_read_lock(); + h_cg = hugetlb_cgroup_from_task(current); + if (!css_tryget_online(&h_cg->css)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); + css_put(&h_cg->css); +done: + *ptr = h_cg; + return ret; +} + +/* Should be called with hugetlb_lock held */ +void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page) +{ + if (hugetlb_cgroup_disabled() || !h_cg) + return; + + set_hugetlb_cgroup(page, h_cg); + return; +} + +/* + * Should be called with hugetlb_lock held + */ +void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, + struct page *page) +{ + struct hugetlb_cgroup *h_cg; + + if (hugetlb_cgroup_disabled()) + return; + lockdep_assert_held(&hugetlb_lock); + h_cg = hugetlb_cgroup_from_page(page); + if (unlikely(!h_cg)) + return; + set_hugetlb_cgroup(page, NULL); + page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); + return; +} + +void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg) +{ + if (hugetlb_cgroup_disabled() || !h_cg) + return; + + if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) + return; + + page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); + return; +} + +enum { + RES_USAGE, + RES_LIMIT, + RES_MAX_USAGE, + RES_FAILCNT, +}; + +static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct page_counter *counter; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + + counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; + + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + return (u64)page_counter_read(counter) * PAGE_SIZE; + case RES_LIMIT: + return (u64)counter->limit * PAGE_SIZE; + case RES_MAX_USAGE: + return (u64)counter->watermark * PAGE_SIZE; + case RES_FAILCNT: + 
return counter->failcnt; + default: + BUG(); + } +} + +static DEFINE_MUTEX(hugetlb_limit_mutex); + +static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + int ret, idx; + unsigned long nr_pages; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); + + if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ + return -EINVAL; + + buf = strstrip(buf); + ret = page_counter_memparse(buf, "-1", &nr_pages); + if (ret) + return ret; + + idx = MEMFILE_IDX(of_cft(of)->private); + + switch (MEMFILE_ATTR(of_cft(of)->private)) { + case RES_LIMIT: + mutex_lock(&hugetlb_limit_mutex); + ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); + mutex_unlock(&hugetlb_limit_mutex); + break; + default: + ret = -EINVAL; + break; + } + return ret ?: nbytes; +} + +static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + int ret = 0; + struct page_counter *counter; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); + + counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; + + switch (MEMFILE_ATTR(of_cft(of)->private)) { + case RES_MAX_USAGE: + page_counter_reset_watermark(counter); + break; + case RES_FAILCNT: + counter->failcnt = 0; + break; + default: + ret = -EINVAL; + break; + } + return ret ?: nbytes; +} + +static char *mem_fmt(char *buf, int size, unsigned long hsize) +{ + if (hsize >= (1UL << 30)) + snprintf(buf, size, "%luGB", hsize >> 30); + else if (hsize >= (1UL << 20)) + snprintf(buf, size, "%luMB", hsize >> 20); + else + snprintf(buf, size, "%luKB", hsize >> 10); + return buf; +} + +static void __init __hugetlb_cgroup_file_init(int idx) +{ + char buf[32]; + struct cftype *cft; + struct hstate *h = &hstates[idx]; + + /* format the size */ + mem_fmt(buf, 32, huge_page_size(h)); + + /* Add the limit file */ + cft = &h->cgroup_files[0]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); + cft->private = 
MEMFILE_PRIVATE(idx, RES_LIMIT); + cft->read_u64 = hugetlb_cgroup_read_u64; + cft->write = hugetlb_cgroup_write; + + /* Add the usage file */ + cft = &h->cgroup_files[1]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* Add the MAX usage file */ + cft = &h->cgroup_files[2]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); + cft->write = hugetlb_cgroup_reset; + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* Add the failcntfile */ + cft = &h->cgroup_files[3]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); + cft->write = hugetlb_cgroup_reset; + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files[4]; + memset(cft, 0, sizeof(*cft)); + + WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, + h->cgroup_files)); +} + +void __init hugetlb_cgroup_file_init(void) +{ + struct hstate *h; + + for_each_hstate(h) { + /* + * Add cgroup control files only if the huge page consists + * of more than two normal pages. This is because we use + * page[2].lru.next for storing cgroup details. 
+ */ + if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) + __hugetlb_cgroup_file_init(hstate_index(h)); + } +} + +/* + * hugetlb_lock will make sure a parallel cgroup rmdir won't happen + * when we migrate hugepages + */ +void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) +{ + struct hugetlb_cgroup *h_cg; + struct hstate *h = page_hstate(oldhpage); + + if (hugetlb_cgroup_disabled()) + return; + + VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage); + spin_lock(&hugetlb_lock); + h_cg = hugetlb_cgroup_from_page(oldhpage); + set_hugetlb_cgroup(oldhpage, NULL); + + /* move the h_cg details to new cgroup */ + set_hugetlb_cgroup(newhpage, h_cg); + list_move(&newhpage->lru, &h->hugepage_activelist); + spin_unlock(&hugetlb_lock); + return; +} + +struct cgroup_subsys hugetlb_cgrp_subsys = { + .css_alloc = hugetlb_cgroup_css_alloc, + .css_offline = hugetlb_cgroup_css_offline, + .css_free = hugetlb_cgroup_css_free, +}; diff --git a/kernel/mm/hwpoison-inject.c b/kernel/mm/hwpoison-inject.c new file mode 100644 index 000000000..4ca5fe004 --- /dev/null +++ b/kernel/mm/hwpoison-inject.c @@ -0,0 +1,144 @@ +/* Inject a hwpoison memory failure on a arbitrary pfn */ +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct dentry *hwpoison_dir; + +static int hwpoison_inject(void *data, u64 val) +{ + unsigned long pfn = val; + struct page *p; + struct page *hpage; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + hpage = compound_head(p); + /* + * This implies unable to support free buddy pages. + */ + if (!get_page_unless_zero(hpage)) + return 0; + + if (!hwpoison_filter_enable) + goto inject; + + if (!PageLRU(hpage) && !PageHuge(p)) + shake_page(hpage, 0); + /* + * This implies unable to support non-LRU pages. 
+ */ + if (!PageLRU(hpage) && !PageHuge(p)) + goto put_out; + + /* + * do a racy check with elevated page count, to make sure PG_hwpoison + * will only be set for the targeted owner (or on a free page). + * We temporarily take page lock for try_get_mem_cgroup_from_page(). + * memory_failure() will redo the check reliably inside page lock. + */ + lock_page(hpage); + err = hwpoison_filter(hpage); + unlock_page(hpage); + if (err) + goto put_out; + +inject: + pr_info("Injecting memory failure at pfn %#lx\n", pfn); + return memory_failure(pfn, 18, MF_COUNT_INCREASED); +put_out: + put_page(hpage); + return 0; +} + +static int hwpoison_unpoison(void *data, u64 val) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return unpoison_memory(val); +} + +DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); +DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); + +static void pfn_inject_exit(void) +{ + debugfs_remove_recursive(hwpoison_dir); +} + +static int pfn_inject_init(void) +{ + struct dentry *dentry; + + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); + if (hwpoison_dir == NULL) + return -ENOMEM; + + /* + * Note that the below poison/unpoison interfaces do not involve + * hardware status change, hence do not require hardware support. + * They are mainly for testing hwpoison in software level. 
+ */ + dentry = debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir, + NULL, &hwpoison_fops); + if (!dentry) + goto fail; + + dentry = debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir, + NULL, &unpoison_fops); + if (!dentry) + goto fail; + + dentry = debugfs_create_u32("corrupt-filter-enable", 0600, + hwpoison_dir, &hwpoison_filter_enable); + if (!dentry) + goto fail; + + dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, + hwpoison_dir, &hwpoison_filter_dev_major); + if (!dentry) + goto fail; + + dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, + hwpoison_dir, &hwpoison_filter_dev_minor); + if (!dentry) + goto fail; + + dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, + hwpoison_dir, &hwpoison_filter_flags_mask); + if (!dentry) + goto fail; + + dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, + hwpoison_dir, &hwpoison_filter_flags_value); + if (!dentry) + goto fail; + +#ifdef CONFIG_MEMCG_SWAP + dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, + hwpoison_dir, &hwpoison_filter_memcg); + if (!dentry) + goto fail; +#endif + + return 0; +fail: + pfn_inject_exit(); + return -ENOMEM; +} + +module_init(pfn_inject_init); +module_exit(pfn_inject_exit); +MODULE_LICENSE("GPL"); diff --git a/kernel/mm/init-mm.c b/kernel/mm/init-mm.c new file mode 100644 index 000000000..a56a85190 --- /dev/null +++ b/kernel/mm/init-mm.c @@ -0,0 +1,25 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifndef INIT_MM_CONTEXT +#define INIT_MM_CONTEXT(name) +#endif + +struct mm_struct init_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + INIT_MM_CONTEXT(init_mm) +}; diff --git a/kernel/mm/internal.h b/kernel/mm/internal.h new file mode 100644 index 
000000000..a25e359a4 --- /dev/null +++ b/kernel/mm/internal.h @@ -0,0 +1,436 @@ +/* internal.h: mm/ internal definitions + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef __MM_INTERNAL_H +#define __MM_INTERNAL_H + +#include +#include + +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); + +static inline void set_page_count(struct page *page, int v) +{ + atomic_set(&page->_count, v); +} + +extern int __do_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size); + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static inline unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + return __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); +} + +/* + * Turn a non-refcounted page (->_count == 0) into refcounted with + * a count of one. + */ +static inline void set_page_refcounted(struct page *page) +{ + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(atomic_read(&page->_count), page); + set_page_count(page, 1); +} + +static inline void __get_page_tail_foll(struct page *page, + bool get_page_head) +{ + /* + * If we're getting a tail page, the elevated page->_count is + * required only in the head page and we will elevate the head + * page->_count and tail page->_mapcount. 
+ * + * We elevate page_tail->_mapcount for tail pages to force + * page_tail->_count to be zero at all times to avoid getting + * false positives from get_page_unless_zero() with + * speculative page access (like in + * page_cache_get_speculative()) on tail pages. + */ + VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page); + if (get_page_head) + atomic_inc(&page->first_page->_count); + get_huge_page_tail(page); +} + +/* + * This is meant to be called as the FOLL_GET operation of + * follow_page() and it must be called while holding the proper PT + * lock while the pte (or pmd_trans_huge) is still mapping the page. + */ +static inline void get_page_foll(struct page *page) +{ + if (unlikely(PageTail(page))) + /* + * This is safe only because + * __split_huge_page_refcount() can't run under + * get_page_foll() because we hold the proper PT lock. + */ + __get_page_tail_foll(page, true); + else { + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. + */ + VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); + atomic_inc(&page->_count); + } +} + +extern unsigned long highest_memmap_pfn; + +/* + * in mm/vmscan.c: + */ +extern int isolate_lru_page(struct page *page); +extern void putback_lru_page(struct page *page); +extern bool zone_reclaimable(struct zone *zone); + +/* + * in mm/rmap.c: + */ +extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); + +/* + * in mm/page_alloc.c + */ + +/* + * Structure for holding the mostly immutable allocation parameters passed + * between functions involved in allocations, including the alloc_pages* + * family of functions. + * + * nodemask, migratetype and high_zoneidx are initialized only once in + * __alloc_pages_nodemask() and then never change. + * + * zonelist, preferred_zone and classzone_idx are set first in + * __alloc_pages_nodemask() for the fast path, and might be later changed + * in __alloc_pages_slowpath(). 
All other functions pass the whole structure + * by a const pointer. + */ +struct alloc_context { + struct zonelist *zonelist; + nodemask_t *nodemask; + struct zone *preferred_zone; + int classzone_idx; + int migratetype; + enum zone_type high_zoneidx; +}; + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). + * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER + */ +static inline unsigned long +__find_buddy_index(unsigned long page_idx, unsigned int order) +{ + return page_idx ^ (1UL << order); /* shift in unsigned long, not int: no overflow/sign-extension at order >= 31 */ +} + +extern int __isolate_free_page(struct page *page, unsigned int order); +extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void prep_compound_page(struct page *page, unsigned long order); +#ifdef CONFIG_MEMORY_FAILURE +extern bool is_free_buddy_page(struct page *page); +#endif +extern int user_min_free_kbytes; + +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + +/* + * in mm/compaction.c + */ +/* + * compact_control is used to track pages being migrated and the free pages + * they are being migrated to during memory compaction. The free_pfn starts + * at the end of a zone and migrate_pfn begins at the start. 
Movable pages + * are moved to the end of a zone during a compaction run and the run + * completes when free_pfn <= migrate_pfn + */ +struct compact_control { + struct list_head freepages; /* List of free pages to migrate to */ + struct list_head migratepages; /* List of pages being migrated */ + unsigned long nr_freepages; /* Number of isolated free pages */ + unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ + enum migrate_mode mode; /* Async or sync migration mode */ + bool ignore_skip_hint; /* Scan blocks even if marked skip */ + int order; /* order a direct compactor needs */ + const gfp_t gfp_mask; /* gfp mask of a direct compactor */ + const int alloc_flags; /* alloc flags of a direct compactor */ + const int classzone_idx; /* zone index of a direct compactor */ + struct zone *zone; + int contended; /* Signal need_sched() or lock + * contention detected during + * compaction + */ +}; + +unsigned long +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn); +unsigned long +isolate_migratepages_range(struct compact_control *cc, + unsigned long low_pfn, unsigned long end_pfn); +int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool only_stealable, bool *can_steal); + +#endif + +/* + * This function returns the order of a free page in the buddy system. In + * general, page_zone(page)->lock must be held by the caller to prevent the + * page from being allocated in parallel and returning garbage as the order. + * If a caller does not hold page_zone(page)->lock, it must guarantee that the + * page cannot be allocated or merged in parallel. Alternatively, it must + * handle invalid values gracefully, and use page_order_unsafe() below. 
+ */ +static inline unsigned long page_order(struct page *page) +{ + /* PageBuddy() must be checked by the caller */ + return page_private(page); +} + +/* + * Like page_order(), but for callers who cannot afford to hold the zone lock. + * PageBuddy() should be checked first by the caller to minimize race window, + * and invalid values must be handled gracefully. + * + * READ_ONCE is used so that if the caller assigns the result into a local + * variable and e.g. tests it for valid range before using, the compiler cannot + * decide to remove the variable and inline the page_private(page) multiple + * times, potentially observing different values in the tests and the actual + * use of the result. + */ +#define page_order_unsafe(page) READ_ONCE(page_private(page)) + +static inline bool is_cow_mapping(vm_flags_t flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + +/* mm/util.c */ +void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent); + +#ifdef CONFIG_MMU +extern long populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking); +extern void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +static inline void munlock_vma_pages_all(struct vm_area_struct *vma) +{ + munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); +} + +/* + * must be called with vma's mmap_sem held for read or write, and page locked. + */ +extern void mlock_vma_page(struct page *page); +extern unsigned int munlock_vma_page(struct page *page); + +/* + * Clear the page's PageMlocked(). This can be useful in a situation where + * we want to unconditionally remove a page from the pagecache -- e.g., + * on truncation or freeing. + * + * It is legal to call this function for any page, mlocked or not. 
+ * If called for a page that is still mapped by mlocked vmas, all we do + * is revert to lazy LRU behaviour -- semantics are not broken. + */ +extern void clear_page_mlock(struct page *page); + +/* + * mlock_migrate_page - called only from migrate_page_copy() to + * migrate the Mlocked page flag; update statistics. + */ +static inline void mlock_migrate_page(struct page *newpage, struct page *page) +{ + if (TestClearPageMlocked(page)) { + unsigned long flags; + int nr_pages = hpage_nr_pages(page); + + local_irq_save(flags); + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + SetPageMlocked(newpage); + __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); + local_irq_restore(flags); + } +} + +extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern unsigned long vma_address(struct page *page, + struct vm_area_struct *vma); +#endif +#else /* !CONFIG_MMU */ +static inline void clear_page_mlock(struct page *page) { } +static inline void mlock_vma_page(struct page *page) { } +static inline void mlock_migrate_page(struct page *new, struct page *old) { } + +#endif /* !CONFIG_MMU */ + +/* + * Return the mem_map entry representing the 'offset' subpage within + * the maximally aligned gigantic page 'base'. Handle any discontiguity + * in the mem_map at MAX_ORDER_NR_PAGES boundaries. + */ +static inline struct page *mem_map_offset(struct page *base, int offset) +{ + if (unlikely(offset >= MAX_ORDER_NR_PAGES)) + return nth_page(base, offset); + return base + offset; +} + +/* + * Iterator over all subpages within the maximally aligned gigantic + * page 'base'. Handle any discontiguity in the mem_map. 
+ */ +static inline struct page *mem_map_next(struct page *iter, + struct page *base, int offset) +{ + if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { + unsigned long pfn = page_to_pfn(base) + offset; + if (!pfn_valid(pfn)) + return NULL; + return pfn_to_page(pfn); + } + return iter + 1; +} + +/* + * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, + * so all functions starting at paging_init should be marked __init + * in those cases. SPARSEMEM, however, allows for memory hotplug, + * and alloc_bootmem_node is not used. + */ +#ifdef CONFIG_SPARSEMEM +#define __paginginit __meminit +#else +#define __paginginit __init +#endif + +/* Memory initialisation debug and verification */ +enum mminit_level { + MMINIT_WARNING, + MMINIT_VERIFY, + MMINIT_TRACE +}; + +#ifdef CONFIG_DEBUG_MEMORY_INIT + +extern int mminit_loglevel; + +#define mminit_dprintk(level, prefix, fmt, arg...) \ +do { \ + if (level < mminit_loglevel) { \ + if (level <= MMINIT_WARNING) \ + printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \ + else \ + printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ + } \ +} while (0) + +extern void mminit_verify_pageflags_layout(void); +extern void mminit_verify_page_links(struct page *page, + enum zone_type zone, unsigned long nid, unsigned long pfn); +extern void mminit_verify_zonelist(void); + +#else + +static inline void mminit_dprintk(enum mminit_level level, + const char *prefix, const char *fmt, ...) 
+{ +} + +static inline void mminit_verify_pageflags_layout(void) +{ +} + +static inline void mminit_verify_page_links(struct page *page, + enum zone_type zone, unsigned long nid, unsigned long pfn) +{ +} + +static inline void mminit_verify_zonelist(void) +{ +} +#endif /* CONFIG_DEBUG_MEMORY_INIT */ + +/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ +#if defined(CONFIG_SPARSEMEM) +extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn); +#else +static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn) +{ +} +#endif /* CONFIG_SPARSEMEM */ + +#define ZONE_RECLAIM_NOSCAN -2 +#define ZONE_RECLAIM_FULL -1 +#define ZONE_RECLAIM_SOME 0 +#define ZONE_RECLAIM_SUCCESS 1 + +extern int hwpoison_filter(struct page *p); + +extern u32 hwpoison_filter_dev_major; +extern u32 hwpoison_filter_dev_minor; +extern u64 hwpoison_filter_flags_mask; +extern u64 hwpoison_filter_flags_value; +extern u64 hwpoison_filter_memcg; +extern u32 hwpoison_filter_enable; + +extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); + +extern void set_pageblock_order(void); +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list); +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#define ALLOC_FAIR 0x100 /* fair zone allocation */ + +#endif /* __MM_INTERNAL_H */ diff --git 
a/kernel/mm/interval_tree.c b/kernel/mm/interval_tree.c new file mode 100644 index 000000000..f2c249268 --- /dev/null +++ b/kernel/mm/interval_tree.c @@ -0,0 +1,112 @@ +/* + * mm/interval_tree.c - interval tree for mapping->i_mmap + * + * Copyright (C) 2012, Michel Lespinasse + * + * This file is released under the GPL v2. + */ + +#include +#include +#include +#include + +static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff; +} + +static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; +} + +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, + unsigned long, shared.rb_subtree_last, + vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) + +/* Insert node immediately after prev in the interval tree */ +void vma_interval_tree_insert_after(struct vm_area_struct *node, + struct vm_area_struct *prev, + struct rb_root *root) +{ + struct rb_node **link; + struct vm_area_struct *parent; + unsigned long last = vma_last_pgoff(node); + + VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); + + if (!prev->shared.rb.rb_right) { + parent = prev; + link = &prev->shared.rb.rb_right; + } else { + parent = rb_entry(prev->shared.rb.rb_right, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; + while (parent->shared.rb.rb_left) { + parent = rb_entry(parent->shared.rb.rb_left, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; + } + link = &parent->shared.rb.rb_left; + } + + node->shared.rb_subtree_last = last; + rb_link_node(&node->shared.rb, &parent->shared.rb, link); + rb_insert_augmented(&node->shared.rb, root, + &vma_interval_tree_augment); +} + +static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) +{ + return vma_start_pgoff(avc->vma); +} + +static inline unsigned long 
avc_last_pgoff(struct anon_vma_chain *avc) +{ + return vma_last_pgoff(avc->vma); +} + +INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, + avc_start_pgoff, avc_last_pgoff, + static inline, __anon_vma_interval_tree) + +void anon_vma_interval_tree_insert(struct anon_vma_chain *node, + struct rb_root *root) +{ +#ifdef CONFIG_DEBUG_VM_RB + node->cached_vma_start = avc_start_pgoff(node); + node->cached_vma_last = avc_last_pgoff(node); +#endif + __anon_vma_interval_tree_insert(node, root); +} + +void anon_vma_interval_tree_remove(struct anon_vma_chain *node, + struct rb_root *root) +{ + __anon_vma_interval_tree_remove(node, root); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_first(struct rb_root *root, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_first(root, first, last); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_next(node, first, last); +} + +#ifdef CONFIG_DEBUG_VM_RB +void anon_vma_interval_tree_verify(struct anon_vma_chain *node) +{ + WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); + WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); +} +#endif diff --git a/kernel/mm/kasan/Makefile b/kernel/mm/kasan/Makefile new file mode 100644 index 000000000..bd837b8c2 --- /dev/null +++ b/kernel/mm/kasan/Makefile @@ -0,0 +1,8 @@ +KASAN_SANITIZE := n + +CFLAGS_REMOVE_kasan.o = -pg +# Function splitter causes unnecessary splits in __asan_load1/__asan_store1 +# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 +CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) + +obj-y := kasan.o report.o diff --git a/kernel/mm/kasan/kasan.c b/kernel/mm/kasan/kasan.c new file mode 100644 index 000000000..6c513a63e --- /dev/null +++ b/kernel/mm/kasan/kasan.c @@ -0,0 +1,537 @@ +/* + * This file contains shadow memory manipulation 
code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin + * + * Some of code borrowed from https://github.com/xairy/linux by + * Andrey Konovalov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define DISABLE_BRANCH_PROFILING + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kasan.h" +#include "../slab.h" + +/* + * Poisons the shadow memory for 'size' bytes starting from 'address'. + * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. + */ +static void kasan_poison_shadow(const void *address, size_t size, u8 value) +{ + void *shadow_start, *shadow_end; + + shadow_start = kasan_mem_to_shadow(address); + shadow_end = kasan_mem_to_shadow(address + size); + + memset(shadow_start, value, shadow_end - shadow_start); +} + +void kasan_unpoison_shadow(const void *address, size_t size) +{ + kasan_poison_shadow(address, size, 0); + + if (size & KASAN_SHADOW_MASK) { + u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); + *shadow = size & KASAN_SHADOW_MASK; + } +} + + +/* + * All functions below are always inlined so the compiler can + * perform better optimizations in each of __asan_loadX/__asan_storeX + * depending on memory access size X. 
+ */ + +static __always_inline bool memory_is_poisoned_1(unsigned long addr) +{ + s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(shadow_value)) { + s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; + return unlikely(last_accessible_byte >= shadow_value); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_2(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + if (memory_is_poisoned_1(addr + 1)) + return true; + + if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) + return false; + + return unlikely(*(u8 *)shadow_addr); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_4(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + if (memory_is_poisoned_1(addr + 3)) + return true; + + if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) + return false; + + return unlikely(*(u8 *)shadow_addr); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_8(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + if (memory_is_poisoned_1(addr + 7)) + return true; + + if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7)) + return false; + + return unlikely(*(u8 *)shadow_addr); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_16(unsigned long addr) +{ + u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(*shadow_addr)) { + u16 shadow_first_bytes = *(u16 *)shadow_addr; + s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK; + + if (unlikely(shadow_first_bytes)) + return true; + + if (likely(!last_byte)) + return false; + + return memory_is_poisoned_1(addr + 15); + } + + return false; +} + +static __always_inline unsigned long bytes_is_zero(const u8 *start, + size_t size) +{ + while (size) { + if (unlikely(*start)) + return (unsigned long)start; + 
start++; + size--; + } + + return 0; +} + +static __always_inline unsigned long memory_is_zero(const void *start, + const void *end) +{ + unsigned int words; + unsigned long ret; + unsigned int prefix = (unsigned long)start % 8; + + if (end - start <= 16) + return bytes_is_zero(start, end - start); + + if (prefix) { + prefix = 8 - prefix; + ret = bytes_is_zero(start, prefix); + if (unlikely(ret)) + return ret; + start += prefix; + } + + words = (end - start) / 8; + while (words) { + if (unlikely(*(u64 *)start)) + return bytes_is_zero(start, 8); + start += 8; + words--; + } + + return bytes_is_zero(start, (end - start) % 8); +} + +static __always_inline bool memory_is_poisoned_n(unsigned long addr, + size_t size) +{ + unsigned long ret; + + ret = memory_is_zero(kasan_mem_to_shadow((void *)addr), + kasan_mem_to_shadow((void *)addr + size - 1) + 1); + + if (unlikely(ret)) { + unsigned long last_byte = addr + size - 1; + s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); + + if (unlikely(ret != (unsigned long)last_shadow || + ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) + return true; + } + return false; +} + +static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) +{ + if (__builtin_constant_p(size)) { + switch (size) { + case 1: + return memory_is_poisoned_1(addr); + case 2: + return memory_is_poisoned_2(addr); + case 4: + return memory_is_poisoned_4(addr); + case 8: + return memory_is_poisoned_8(addr); + case 16: + return memory_is_poisoned_16(addr); + default: + BUILD_BUG(); + } + } + + return memory_is_poisoned_n(addr, size); +} + + +static __always_inline void check_memory_region(unsigned long addr, + size_t size, bool write) +{ + struct kasan_access_info info; + + if (unlikely(size == 0)) + return; + + if (unlikely((void *)addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + info.access_addr = (void *)addr; + info.access_size = size; + info.is_write = write; + info.ip = _RET_IP_; + 
kasan_report_user_access(&info); + return; + } + + if (likely(!memory_is_poisoned(addr, size))) + return; + + kasan_report(addr, size, write, _RET_IP_); +} + +void __asan_loadN(unsigned long addr, size_t size); +void __asan_storeN(unsigned long addr, size_t size); + +#undef memset +void *memset(void *addr, int c, size_t len) +{ + __asan_storeN((unsigned long)addr, len); + + return __memset(addr, c, len); +} + +#undef memmove +void *memmove(void *dest, const void *src, size_t len) +{ + __asan_loadN((unsigned long)src, len); + __asan_storeN((unsigned long)dest, len); + + return __memmove(dest, src, len); +} + +#undef memcpy +void *memcpy(void *dest, const void *src, size_t len) +{ + __asan_loadN((unsigned long)src, len); + __asan_storeN((unsigned long)dest, len); + + return __memcpy(dest, src, len); +} + +void kasan_alloc_pages(struct page *page, unsigned int order) +{ + if (likely(!PageHighMem(page))) + kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); +} + +void kasan_free_pages(struct page *page, unsigned int order) +{ + if (likely(!PageHighMem(page))) + kasan_poison_shadow(page_address(page), + PAGE_SIZE << order, + KASAN_FREE_PAGE); +} + +void kasan_poison_slab(struct page *page) +{ + kasan_poison_shadow(page_address(page), + PAGE_SIZE << compound_order(page), + KASAN_KMALLOC_REDZONE); +} + +void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) +{ + kasan_unpoison_shadow(object, cache->object_size); +} + +void kasan_poison_object_data(struct kmem_cache *cache, void *object) +{ + kasan_poison_shadow(object, + round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), + KASAN_KMALLOC_REDZONE); +} + +void kasan_slab_alloc(struct kmem_cache *cache, void *object) +{ + kasan_kmalloc(cache, object, cache->object_size); +} + +void kasan_slab_free(struct kmem_cache *cache, void *object) +{ + unsigned long size = cache->object_size; + unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); + + /* RCU slabs could be legally used 
after free within the RCU period */ + if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + return; + + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); +} + +void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) +{ + unsigned long redzone_start; + unsigned long redzone_end; + + if (unlikely(object == NULL)) + return; + + redzone_start = round_up((unsigned long)(object + size), + KASAN_SHADOW_SCALE_SIZE); + redzone_end = round_up((unsigned long)object + cache->object_size, + KASAN_SHADOW_SCALE_SIZE); + + kasan_unpoison_shadow(object, size); + kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, + KASAN_KMALLOC_REDZONE); +} +EXPORT_SYMBOL(kasan_kmalloc); + +void kasan_kmalloc_large(const void *ptr, size_t size) +{ + struct page *page; + unsigned long redzone_start; + unsigned long redzone_end; + + if (unlikely(ptr == NULL)) + return; + + page = virt_to_page(ptr); + redzone_start = round_up((unsigned long)(ptr + size), + KASAN_SHADOW_SCALE_SIZE); + redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); + + kasan_unpoison_shadow(ptr, size); + kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, + KASAN_PAGE_REDZONE); +} + +void kasan_krealloc(const void *object, size_t size) +{ + struct page *page; + + if (unlikely(object == ZERO_SIZE_PTR)) + return; + + page = virt_to_head_page(object); + + if (unlikely(!PageSlab(page))) + kasan_kmalloc_large(object, size); + else + kasan_kmalloc(page->slab_cache, object, size); +} + +void kasan_kfree(void *ptr) +{ + struct page *page; + + page = virt_to_head_page(ptr); + + if (unlikely(!PageSlab(page))) + kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), + KASAN_FREE_PAGE); + else + kasan_slab_free(page->slab_cache, ptr); +} + +void kasan_kfree_large(const void *ptr) +{ + struct page *page = virt_to_page(ptr); + + kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), + KASAN_FREE_PAGE); +} + +int kasan_module_alloc(void 
*addr, size_t size) +{ + void *ret; + size_t shadow_size; + unsigned long shadow_start; + + shadow_start = (unsigned long)kasan_mem_to_shadow(addr); + shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT, + PAGE_SIZE); + + if (WARN_ON(!PAGE_ALIGNED(shadow_start))) + return -EINVAL; + + ret = __vmalloc_node_range(shadow_size, 1, shadow_start, + shadow_start + shadow_size, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, + __builtin_return_address(0)); + + if (ret) { + find_vm_area(addr)->flags |= VM_KASAN; + return 0; + } + + return -ENOMEM; +} + +void kasan_free_shadow(const struct vm_struct *vm) +{ + if (vm->flags & VM_KASAN) + vfree(kasan_mem_to_shadow(vm->addr)); +} + +static void register_global(struct kasan_global *global) +{ + size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); + + kasan_unpoison_shadow(global->beg, global->size); + + kasan_poison_shadow(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE); +} + +void __asan_register_globals(struct kasan_global *globals, size_t size) +{ + int i; + + for (i = 0; i < size; i++) + register_global(&globals[i]); +} +EXPORT_SYMBOL(__asan_register_globals); + +void __asan_unregister_globals(struct kasan_global *globals, size_t size) +{ +} +EXPORT_SYMBOL(__asan_unregister_globals); + +#define DEFINE_ASAN_LOAD_STORE(size) \ + void __asan_load##size(unsigned long addr) \ + { \ + check_memory_region(addr, size, false); \ + } \ + EXPORT_SYMBOL(__asan_load##size); \ + __alias(__asan_load##size) \ + void __asan_load##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_load##size##_noabort); \ + void __asan_store##size(unsigned long addr) \ + { \ + check_memory_region(addr, size, true); \ + } \ + EXPORT_SYMBOL(__asan_store##size); \ + __alias(__asan_store##size) \ + void __asan_store##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_store##size##_noabort) + +DEFINE_ASAN_LOAD_STORE(1); +DEFINE_ASAN_LOAD_STORE(2); 
+DEFINE_ASAN_LOAD_STORE(4); +DEFINE_ASAN_LOAD_STORE(8); +DEFINE_ASAN_LOAD_STORE(16); + +void __asan_loadN(unsigned long addr, size_t size) +{ + check_memory_region(addr, size, false); +} +EXPORT_SYMBOL(__asan_loadN); + +__alias(__asan_loadN) +void __asan_loadN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_loadN_noabort); + +void __asan_storeN(unsigned long addr, size_t size) +{ + check_memory_region(addr, size, true); +} +EXPORT_SYMBOL(__asan_storeN); + +__alias(__asan_storeN) +void __asan_storeN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_storeN_noabort); + +/* to shut up compiler complaints */ +void __asan_handle_no_return(void) {} +EXPORT_SYMBOL(__asan_handle_no_return); + +#ifdef CONFIG_MEMORY_HOTPLUG +static int kasan_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK; +} + +static int __init kasan_memhotplug_init(void) +{ + pr_err("WARNING: KASan doesn't support memory hot-add\n"); + pr_err("Memory hot-add will be disabled\n"); + + hotplug_memory_notifier(kasan_mem_notifier, 0); + + return 0; +} + +module_init(kasan_memhotplug_init); +#endif diff --git a/kernel/mm/kasan/kasan.h b/kernel/mm/kasan/kasan.h new file mode 100644 index 000000000..4986b0aca --- /dev/null +++ b/kernel/mm/kasan/kasan.h @@ -0,0 +1,75 @@ +#ifndef __MM_KASAN_KASAN_H +#define __MM_KASAN_KASAN_H + +#include + +#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) +#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) + +#define KASAN_FREE_PAGE 0xFF /* page was freed */ +#define KASAN_FREE_PAGE 0xFF /* page was freed */ +#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ +#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ +#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ +#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ + +/* + * Stack redzone shadow values + * (Those are 
compiler's ABI, don't change them)
+ */
+#define KASAN_STACK_LEFT 0xF1
+#define KASAN_STACK_MID 0xF2
+#define KASAN_STACK_RIGHT 0xF3
+#define KASAN_STACK_PARTIAL 0xF4
+
+/* Don't break randconfig/all*config builds */
+#ifndef KASAN_ABI_VERSION
+#define KASAN_ABI_VERSION 1
+#endif
+/* One reported access: filled in by kasan_report(), read by the report printers. */
+struct kasan_access_info {
+	const void *access_addr;	/* address handed to kasan_report() */
+	const void *first_bad_addr;	/* set by print_error_description() in report.c */
+	size_t access_size;		/* size handed to kasan_report() */
+	bool is_write;			/* printed as "Write" when true, "Read" otherwise */
+	unsigned long ip;		/* printed with %pS as the location of the access */
+};
+
+/* The layout of this struct is dictated by the compiler */
+struct kasan_source_location {
+	const char *filename;
+	int line_no;
+	int column_no;
+};
+
+/* The layout of this struct is dictated by the compiler */
+struct kasan_global {
+	const void *beg; /* Address of the beginning of the global variable. */
+	size_t size; /* Size of the global variable. */
+	size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */
+	const void *name;
+	const void *module_name; /* Name of the module where the global variable is declared. */
+	unsigned long has_dynamic_init; /* This is needed for C++ */
+#if KASAN_ABI_VERSION >= 4
+	struct kasan_source_location *location;	/* member exists only for ABI version >= 4 */
+#endif
+};
+
+void kasan_report_error(struct kasan_access_info *info);
+void kasan_report_user_access(struct kasan_access_info *info);
+/* Shadow address -> memory address: (shadow - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT. */
+static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
+{
+	return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
+		<< KASAN_SHADOW_SCALE_SHIFT);
+}
+/* True only while current->kasan_depth is zero. */
+static inline bool kasan_enabled(void)
+{
+	return !current->kasan_depth;
+}
+
+void kasan_report(unsigned long addr, size_t size,
+		bool is_write, unsigned long ip);
+
+#endif
diff --git a/kernel/mm/kasan/report.c b/kernel/mm/kasan/report.c
new file mode 100644
index 000000000..680ceedf8
--- /dev/null
+++ b/kernel/mm/kasan/report.c
@@ -0,0 +1,269 @@
+/*
+ * This file contains error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin + * + * Some of code borrowed from https://github.com/xairy/linux by + * Andrey Konovalov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kasan.h" +#include "../slab.h" + +/* Shadow layout customization. */ +#define SHADOW_BYTES_PER_BLOCK 1 +#define SHADOW_BLOCKS_PER_ROW 16 +#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) +#define SHADOW_ROWS_AROUND_ADDR 2 + +static const void *find_first_bad_addr(const void *addr, size_t size) +{ + u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr); + const void *first_bad_addr = addr; + + while (!shadow_val && first_bad_addr < addr + size) { + first_bad_addr += KASAN_SHADOW_SCALE_SIZE; + shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr); + } + return first_bad_addr; +} + +static void print_error_description(struct kasan_access_info *info) +{ + const char *bug_type = "unknown crash"; + u8 shadow_val; + + info->first_bad_addr = find_first_bad_addr(info->access_addr, + info->access_size); + + shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); + + switch (shadow_val) { + case KASAN_FREE_PAGE: + case KASAN_KMALLOC_FREE: + bug_type = "use after free"; + break; + case KASAN_PAGE_REDZONE: + case KASAN_KMALLOC_REDZONE: + case KASAN_GLOBAL_REDZONE: + case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + bug_type = "out of bounds access"; + break; + case KASAN_STACK_LEFT: + case KASAN_STACK_MID: + case KASAN_STACK_RIGHT: + case KASAN_STACK_PARTIAL: + bug_type = "out of bounds on stack"; + break; + } + + pr_err("BUG: KASan: %s in %pS at addr %p\n", + bug_type, (void *)info->ip, + info->access_addr); + pr_err("%s of size %zu by task %s/%d\n", + info->is_write ? 
"Write" : "Read", + info->access_size, current->comm, task_pid_nr(current)); +} + +static inline bool kernel_or_module_addr(const void *addr) +{ + return (addr >= (void *)_stext && addr < (void *)_end) + || (addr >= (void *)MODULES_VADDR + && addr < (void *)MODULES_END); +} + +static inline bool init_task_stack_addr(const void *addr) +{ + return addr >= (void *)&init_thread_union.stack && + (addr <= (void *)&init_thread_union.stack + + sizeof(init_thread_union.stack)); +} + +static void print_address_description(struct kasan_access_info *info) +{ + const void *addr = info->access_addr; + + if ((addr >= (void *)PAGE_OFFSET) && + (addr < high_memory)) { + struct page *page = virt_to_head_page(addr); + + if (PageSlab(page)) { + void *object; + struct kmem_cache *cache = page->slab_cache; + void *last_object; + + object = virt_to_obj(cache, page_address(page), addr); + last_object = page_address(page) + + page->objects * cache->size; + + if (unlikely(object > last_object)) + object = last_object; /* we hit into padding */ + + object_err(cache, page, object, + "kasan: bad access detected"); + return; + } + dump_page(page, "kasan: bad access detected"); + } + + if (kernel_or_module_addr(addr)) { + if (!init_task_stack_addr(addr)) + pr_err("Address belongs to variable %pS\n", addr); + } + + dump_stack(); +} + +static bool row_is_guilty(const void *row, const void *guilty) +{ + return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW); +} + +static int shadow_pointer_offset(const void *row, const void *shadow) +{ + /* The length of ">ff00ff00ff00ff00: " is + * 3 + (BITS_PER_LONG/8)*2 chars. 
+ */ + return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 + + (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1; +} + +static void print_shadow_for_address(const void *addr) +{ + int i; + const void *shadow = kasan_mem_to_shadow(addr); + const void *shadow_row; + + shadow_row = (void *)round_down((unsigned long)shadow, + SHADOW_BYTES_PER_ROW) + - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW; + + pr_err("Memory state around the buggy address:\n"); + + for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { + const void *kaddr = kasan_shadow_to_mem(shadow_row); + char buffer[4 + (BITS_PER_LONG/8)*2]; + + snprintf(buffer, sizeof(buffer), + (i == 0) ? ">%p: " : " %p: ", kaddr); + + kasan_disable_current(); + print_hex_dump(KERN_ERR, buffer, + DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, + shadow_row, SHADOW_BYTES_PER_ROW, 0); + kasan_enable_current(); + + if (row_is_guilty(shadow_row, shadow)) + pr_err("%*c\n", + shadow_pointer_offset(shadow_row, shadow), + '^'); + + shadow_row += SHADOW_BYTES_PER_ROW; + } +} + +static DEFINE_SPINLOCK(report_lock); + +void kasan_report_error(struct kasan_access_info *info) +{ + unsigned long flags; + + spin_lock_irqsave(&report_lock, flags); + pr_err("=================================" + "=================================\n"); + print_error_description(info); + print_address_description(info); + print_shadow_for_address(info->first_bad_addr); + pr_err("=================================" + "=================================\n"); + spin_unlock_irqrestore(&report_lock, flags); +} + +void kasan_report_user_access(struct kasan_access_info *info) +{ + unsigned long flags; + + spin_lock_irqsave(&report_lock, flags); + pr_err("=================================" + "=================================\n"); + pr_err("BUG: KASan: user-memory-access on address %p\n", + info->access_addr); + pr_err("%s of size %zu by task %s/%d\n", + info->is_write ? 
"Write" : "Read", + info->access_size, current->comm, task_pid_nr(current)); + dump_stack(); + pr_err("=================================" + "=================================\n"); + spin_unlock_irqrestore(&report_lock, flags); +} + +void kasan_report(unsigned long addr, size_t size, + bool is_write, unsigned long ip) +{ + struct kasan_access_info info; + + if (likely(!kasan_enabled())) + return; + + info.access_addr = (void *)addr; + info.access_size = size; + info.is_write = is_write; + info.ip = ip; + kasan_report_error(&info); +} + + +#define DEFINE_ASAN_REPORT_LOAD(size) \ +void __asan_report_load##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, false, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_load##size##_noabort) + +#define DEFINE_ASAN_REPORT_STORE(size) \ +void __asan_report_store##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, true, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_store##size##_noabort) + +DEFINE_ASAN_REPORT_LOAD(1); +DEFINE_ASAN_REPORT_LOAD(2); +DEFINE_ASAN_REPORT_LOAD(4); +DEFINE_ASAN_REPORT_LOAD(8); +DEFINE_ASAN_REPORT_LOAD(16); +DEFINE_ASAN_REPORT_STORE(1); +DEFINE_ASAN_REPORT_STORE(2); +DEFINE_ASAN_REPORT_STORE(4); +DEFINE_ASAN_REPORT_STORE(8); +DEFINE_ASAN_REPORT_STORE(16); + +void __asan_report_load_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_load_n_noabort); + +void __asan_report_store_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_store_n_noabort); diff --git a/kernel/mm/kmemcheck.c b/kernel/mm/kmemcheck.c new file mode 100644 index 000000000..cab58bb59 --- /dev/null +++ b/kernel/mm/kmemcheck.c @@ -0,0 +1,123 @@ +#include +#include +#include +#include +#include "slab.h" +#include + +void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) +{ + struct page *shadow; + int pages; + int i; + + pages = 1 << order; + 
+ /* + * With kmemcheck enabled, we need to allocate a memory area for the + * shadow bits as well. + */ + shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); + if (!shadow) { + if (printk_ratelimit()) + printk(KERN_ERR "kmemcheck: failed to allocate " + "shadow bitmap\n"); + return; + } + + for(i = 0; i < pages; ++i) + page[i].shadow = page_address(&shadow[i]); + + /* + * Mark it as non-present for the MMU so that our accesses to + * this memory will trigger a page fault and let us analyze + * the memory accesses. + */ + kmemcheck_hide_pages(page, pages); +} + +void kmemcheck_free_shadow(struct page *page, int order) +{ + struct page *shadow; + int pages; + int i; + + if (!kmemcheck_page_is_tracked(page)) + return; + + pages = 1 << order; + + kmemcheck_show_pages(page, pages); + + shadow = virt_to_page(page[0].shadow); + + for(i = 0; i < pages; ++i) + page[i].shadow = NULL; + + __free_pages(shadow, order); +} + +void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, + size_t size) +{ + /* + * Has already been memset(), which initializes the shadow for us + * as well. + */ + if (gfpflags & __GFP_ZERO) + return; + + /* No need to initialize the shadow of a non-tracked slab. */ + if (s->flags & SLAB_NOTRACK) + return; + + if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { + /* + * Allow notracked objects to be allocated from + * tracked caches. Note however that these objects + * will still get page faults on access, they just + * won't ever be flagged as uninitialized. If page + * faults are not acceptable, the slab cache itself + * should be marked NOTRACK. + */ + kmemcheck_mark_initialized(object, size); + } else if (!s->ctor) { + /* + * New objects should be marked uninitialized before + * they're returned to the called. + */ + kmemcheck_mark_uninitialized(object, size); + } +} + +void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) +{ + /* TODO: RCU freeing is unsupported for now; hide false positives. 
*/ + if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) + kmemcheck_mark_freed(object, size); +} + +void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, + gfp_t gfpflags) +{ + int pages; + + if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) + return; + + pages = 1 << order; + + /* + * NOTE: We choose to track GFP_ZERO pages too; in fact, they + * can become uninitialized by copying uninitialized memory + * into them. + */ + + /* XXX: Can use zone->node for node? */ + kmemcheck_alloc_shadow(page, order, gfpflags, -1); + + if (gfpflags & __GFP_ZERO) + kmemcheck_mark_initialized_pages(page, pages); + else + kmemcheck_mark_uninitialized_pages(page, pages); +} diff --git a/kernel/mm/kmemleak-test.c b/kernel/mm/kmemleak-test.c new file mode 100644 index 000000000..dcdcadb69 --- /dev/null +++ b/kernel/mm/kmemleak-test.c @@ -0,0 +1,111 @@ +/* + * mm/kmemleak-test.c + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define pr_fmt(fmt) "kmemleak: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct test_node { + long header[25]; + struct list_head list; + long footer[25]; +}; + +static LIST_HEAD(test_list); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); + +/* + * Some very simple testing. 
This function needs to be extended for + * proper testing. + */ +static int __init kmemleak_test_init(void) +{ + struct test_node *elem; + int i; + + printk(KERN_INFO "Kmemleak testing\n"); + + /* make some orphan objects */ + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); +#ifndef CONFIG_MODULES + pr_info("kmem_cache_alloc(files_cachep) = %p\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); + pr_info("kmem_cache_alloc(files_cachep) = %p\n", + kmem_cache_alloc(files_cachep, GFP_KERNEL)); +#endif + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = %p\n", vmalloc(64)); + + /* + * Add elements to a list. They should only appear as orphan + * after the module is removed. + */ + for (i = 0; i < 10; i++) { + elem = kzalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kzalloc(sizeof(*elem)) = %p\n", elem); + if (!elem) + return -ENOMEM; + INIT_LIST_HEAD(&elem->list); + list_add_tail(&elem->list, &test_list); + } + + for_each_possible_cpu(i) { + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); + pr_info("kmalloc(129) = %p\n", + per_cpu(kmemleak_test_pointer, i)); + } + + return 0; +} +module_init(kmemleak_test_init); + +static void __exit kmemleak_test_exit(void) +{ + struct test_node *elem, *tmp; + + /* + * Remove the list elements without actually freeing the + * memory. 
+ */ + list_for_each_entry_safe(elem, tmp, &test_list, list) + list_del(&elem->list); +} +module_exit(kmemleak_test_exit); + +MODULE_LICENSE("GPL"); diff --git a/kernel/mm/kmemleak.c b/kernel/mm/kmemleak.c new file mode 100644 index 000000000..3716cdb8b --- /dev/null +++ b/kernel/mm/kmemleak.c @@ -0,0 +1,1941 @@ +/* + * mm/kmemleak.c + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * For more information on the algorithm and kmemleak usage, please see + * Documentation/kmemleak.txt. + * + * Notes on locking + * ---------------- + * + * The following locks and mutexes are used by kmemleak: + * + * - kmemleak_lock (rwlock): protects the object_list modifications and + * accesses to the object_tree_root. The object_list is the main list + * holding the metadata (struct kmemleak_object) for the allocated memory + * blocks. The object_tree_root is a red black tree used to look-up + * metadata based on a pointer to the corresponding memory block. The + * kmemleak_object structures are added to the object_list and + * object_tree_root in the create_object() function called from the + * kmemleak_alloc() callback and removed in delete_object() called from the + * kmemleak_free() callback + * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to + * the metadata (e.g. 
count) are protected by this lock. Note that some + * members of this structure may be protected by other means (atomic or + * kmemleak_lock). This lock is also held when scanning the corresponding + * memory block to avoid the kernel freeing it via the kmemleak_free() + * callback. This is less heavyweight than holding a global lock like + * kmemleak_lock during scanning + * - scan_mutex (mutex): ensures that only one thread may scan the memory for + * unreferenced objects at a time. The gray_list contains the objects which + * are already referenced or marked as false positives and need to be + * scanned. This list is only modified during a scanning episode when the + * scan_mutex is held. At the end of a scan, the gray_list is always empty. + * Note that the kmemleak_object.use_count is incremented when an object is + * added to the gray_list and therefore cannot be freed. This mutex also + * prevents multiple users of the "kmemleak" debugfs file together with + * modifications to the memory scanning parameters including the scan_thread + * pointer + * + * The kmemleak_object structures have a use_count incremented or decremented + * using the get_object()/put_object() functions. When the use_count becomes + * 0, this count can no longer be incremented and put_object() schedules the + * kmemleak_object freeing via an RCU callback. All calls to the get_object() + * function must be protected by rcu_read_lock() to avoid accessing a freed + * structure. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/* + * Kmemleak configuration and common defines. 
+ */
+#define MAX_TRACE 16 /* stack trace length (entries in the trace[] arrays) */
+#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
+#define SECS_FIRST_SCAN 60 /* delay before the first scan */
+#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
+#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
+
+#define BYTES_PER_POINTER sizeof(void *)
+
+/* GFP bitmask for kmemleak internal allocations: caller's GFP_KERNEL/GFP_ATOMIC/__GFP_NOACCOUNT bits plus the three flags below */
+#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \
+				__GFP_NOACCOUNT)) | \
+				__GFP_NORETRY | __GFP_NOMEMALLOC | \
+				__GFP_NOWARN)
+
+/* scanning area inside a memory block */
+struct kmemleak_scan_area {
+	struct hlist_node node;	/* link in kmemleak_object.area_list */
+	unsigned long start;
+	size_t size;
+};
+
+#define KMEMLEAK_GREY 0	/* min_count value; see color_gray() */
+#define KMEMLEAK_BLACK -1	/* min_count value that color_gray() rejects; also tested against count in color_white() */
+
+/*
+ * Structure holding the metadata for each allocated memory block.
+ * Modifications to such objects should be made while holding the
+ * object->lock. Insertions or deletions from object_list, gray_list or
+ * rb_node are already protected by the corresponding locks or mutex (see
+ * the notes on locking above). These objects are reference-counted
+ * (use_count) and freed using the RCU mechanism.
+ */
+struct kmemleak_object {
+	spinlock_t lock;	/* protects this object's metadata */
+	unsigned long flags; /* object status flags (OBJECT_* below) */
+	struct list_head object_list;	/* link in the global object_list */
+	struct list_head gray_list;	/* link in the global gray_list */
+	struct rb_node rb_node;	/* node in object_tree_root */
+	struct rcu_head rcu; /* object_list lockless traversal */
+	/* object usage count; object freed when use_count == 0 */
+	atomic_t use_count;
+	unsigned long pointer;	/* start address of the tracked memory block */
+	size_t size;	/* block covers [pointer, pointer + size) */
+	/* minimum number of pointers that must be found for the object not to be a leak */
+	int min_count;
+	/* the total number of pointers found pointing to this object */
+	int count;
+	/* checksum for detecting modified objects */
+	u32 checksum;
+	/* memory ranges to be scanned inside an object (empty for all) */
+	struct hlist_head area_list;
+	unsigned long trace[MAX_TRACE];	/* stack trace printed as the backtrace */
+	unsigned int trace_len;	/* number of valid entries in trace[] */
+	unsigned long jiffies; /* creation timestamp */
+	pid_t pid; /* pid of the current task */
+	char comm[TASK_COMM_LEN]; /* executable name */
+};
+
+/* flag representing the memory block allocation status */
+#define OBJECT_ALLOCATED (1 << 0)
+/* flag set after the first reporting of an unreferenced object */
+#define OBJECT_REPORTED (1 << 1)
+/* flag set to not scan the object */
+#define OBJECT_NO_SCAN (1 << 2)
+
+/* number of bytes to print per line; must be 16 or 32 */
+#define HEX_ROW_SIZE 16
+/* number of bytes to print at a time (1, 2, 4, 8) */
+#define HEX_GROUP_SIZE 1
+/* include ASCII after the hex output */
+#define HEX_ASCII 1
+/* max number of lines to be printed */
+#define HEX_MAX_LINES 2
+
+/* the list of all allocated objects */
+static LIST_HEAD(object_list);
+/* the list of gray-colored objects (see color_gray comment below) */
+static LIST_HEAD(gray_list);
+/* search tree for object boundaries */
+static struct rb_root object_tree_root = RB_ROOT;
+/* rw_lock protecting the access to object_list and object_tree_root */
+static DEFINE_RWLOCK(kmemleak_lock);
+
+/* allocation caches for kmemleak internal data */
+static struct kmem_cache *object_cache;	/* struct kmemleak_object allocations */
+static struct kmem_cache *scan_area_cache;	/* struct kmemleak_scan_area allocations */
+
+/*
set if tracing memory operations is enabled */
+static int kmemleak_enabled;
+/* same as above but only for the kmemleak_free() callback */
+static int kmemleak_free_enabled;
+/* set in the late_initcall if there were no errors */
+static int kmemleak_initialized;
+/* enables or disables early logging of the memory operations */
+static int kmemleak_early_log = 1;
+/* set if a kmemleak warning was issued (see kmemleak_warn()) */
+static int kmemleak_warning;
+/* set if a fatal kmemleak error has occurred */
+static int kmemleak_error;
+
+/* minimum and maximum address that may be valid pointers (range check in find_and_get_object()) */
+static unsigned long min_addr = ULONG_MAX;
+static unsigned long max_addr;
+
+static struct task_struct *scan_thread;	/* changes to this pointer are serialized by scan_mutex */
+/* used to avoid reporting of recently allocated objects */
+static unsigned long jiffies_min_age;
+static unsigned long jiffies_last_scan;	/* compared with the object age in unreferenced_object() */
+/* delay between automatic memory scans */
+static signed long jiffies_scan_wait;
+/* enables or disables the task stacks scanning */
+static int kmemleak_stack_scan = 1;
+/* protects the memory scanning, parameters and debug/kmemleak file access */
+static DEFINE_MUTEX(scan_mutex);
+/* setting kmemleak=on will set this variable, skipping the disable */
+static int kmemleak_skip_disable;
+/* set if there are leaks that can be reported */
+static bool kmemleak_found_leaks;
+
+/*
+ * Early object allocation/freeing logging. Kmemleak is initialized after the
+ * kernel allocator. However, both the kernel allocator and kmemleak may
+ * allocate memory blocks which need to be tracked. Kmemleak defines an
+ * arbitrary buffer to hold the allocation/freeing information before it is
+ * fully initialized.
+ */
+
+/* kmemleak operation type for early logging (stored in early_log.op_type) */
+enum {
+	KMEMLEAK_ALLOC,
+	KMEMLEAK_ALLOC_PERCPU,
+	KMEMLEAK_FREE,
+	KMEMLEAK_FREE_PART,
+	KMEMLEAK_FREE_PERCPU,
+	KMEMLEAK_NOT_LEAK,
+	KMEMLEAK_IGNORE,
+	KMEMLEAK_SCAN_AREA,
+	KMEMLEAK_NO_SCAN
+};
+
+/*
+ * Structure holding the information passed to kmemleak callbacks during the
+ * early logging.
+ */
+struct early_log {
+	int op_type; /* kmemleak operation type */
+	const void *ptr; /* allocated/freed memory block */
+	size_t size; /* memory block size */
+	int min_count; /* minimum reference count */
+	unsigned long trace[MAX_TRACE]; /* stack trace */
+	unsigned int trace_len; /* stack trace length */
+};
+
+/* early logging buffer and current position; both are __initdata */
+static struct early_log
+	early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
+static int crt_early_log __initdata;
+
+static void kmemleak_disable(void);	/* forward declaration: used by kmemleak_stop() below */
+
+/*
+ * Print a warning, dump the stack trace and set the kmemleak_warning flag.
+ */
+#define kmemleak_warn(x...) do { \
+	pr_warning(x); \
+	dump_stack(); \
+	kmemleak_warning = 1; \
+} while (0)
+
+/*
+ * Macro invoked when a serious kmemleak condition occurred and cannot be
+ * recovered from. Kmemleak will be disabled and further allocation/freeing
+ * tracing will no longer be available. Expands to kmemleak_warn() + kmemleak_disable().
+ */
+#define kmemleak_stop(x...) do { \
+	kmemleak_warn(x); \
+	kmemleak_disable(); \
+} while (0)
+
+/*
+ * Printing of the objects hex dump to the seq file. The number of lines to be
+ * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
+ * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
+ * with the object->lock held.
+ */
+static void hex_dump_object(struct seq_file *seq,
+			struct kmemleak_object *object)
+{
+	const u8 *ptr = (const u8 *)object->pointer;
+	int i, len, remaining;
+	unsigned char linebuf[HEX_ROW_SIZE * 5];	/* NOTE(review): presumably room for one hex + ASCII row - confirm */
+
+	/* limit the number of lines to HEX_MAX_LINES */
+	remaining = len =
+		min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
+
+	seq_printf(seq, " hex dump (first %d bytes):\n", len);
+	for (i = 0; i < len; i += HEX_ROW_SIZE) {
+		int linelen = min(remaining, HEX_ROW_SIZE);	/* the last row may be shorter than a full row */
+
+		remaining -= HEX_ROW_SIZE;	/* may drop below zero after the last row; the loop stops on i >= len */
+		hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
+				HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
+				HEX_ASCII);
+		seq_printf(seq, " %s\n", linebuf);
+	}
+}
+
+/*
+ * Object colors, encoded with count and min_count:
+ * - white - orphan object, not enough references to it (count < min_count)
+ * - gray - not orphan, not marked as false positive (min_count == 0) or
+ * sufficient references to it (count >= min_count)
+ * - black - ignore, it doesn't contain references (e.g. text section)
+ * (min_count == -1). No function defined for this color.
+ * Newly created objects don't have any color assigned (object->count == -1)
+ * before the next memory scan when they become white.
+ */
+static bool color_white(const struct kmemleak_object *object)
+{
+	return object->count != KMEMLEAK_BLACK &&	/* count == -1 is the "no color assigned" case above */
+		object->count < object->min_count;
+}
+/* Gray: min_count is not KMEMLEAK_BLACK and count has reached min_count. */
+static bool color_gray(const struct kmemleak_object *object)
+{
+	return object->min_count != KMEMLEAK_BLACK &&
+		object->count >= object->min_count;
+}
+
+/*
+ * Objects are considered unreferenced only if their color is white, they have
+ * not be deleted and have a minimum age to avoid false positives caused by
+ * pointers temporarily stored in CPU registers.
+ */
+static bool unreferenced_object(struct kmemleak_object *object)
+{
+	return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
+		time_before_eq(object->jiffies + jiffies_min_age,	/* creation time + minimum age ... */
+			jiffies_last_scan);	/* ... must not be after the last scan */
+}
+
+/*
+ * Printing of the unreferenced objects information to the seq file. The
+ * print_unreferenced function must be called with the object->lock held.
+ */
+static void print_unreferenced(struct seq_file *seq,
+			struct kmemleak_object *object)
+{
+	int i;
+	unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);	/* time since creation, in ms */
+
+	seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
+		object->pointer, object->size);
+	seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
+		object->comm, object->pid, object->jiffies,
+		msecs_age / 1000, msecs_age % 1000);	/* printed as seconds.milliseconds */
+	hex_dump_object(seq, object);
+	seq_printf(seq, " backtrace:\n");
+
+	for (i = 0; i < object->trace_len; i++) {
+		void *ptr = (void *)object->trace[i];
+		seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);	/* raw address, then its %pS rendering */
+	}
+}
+
+/*
+ * Print the kmemleak_object information. This function is used mainly for
+ * debugging special cases of kmemleak operations. It must be called with
+ * the object->lock held.
+ */
+static void dump_object_info(struct kmemleak_object *object)
+{
+	struct stack_trace trace;	/* only nr_entries and entries are filled in below */
+
+	trace.nr_entries = object->trace_len;
+	trace.entries = object->trace;
+
+	pr_notice("Object 0x%08lx (size %zu):\n",
+		object->pointer, object->size);
+	pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
+		object->comm, object->pid, object->jiffies);
+	pr_notice(" min_count = %d\n", object->min_count);
+	pr_notice(" count = %d\n", object->count);
+	pr_notice(" flags = 0x%lx\n", object->flags);
+	pr_notice(" checksum = %u\n", object->checksum);
+	pr_notice(" backtrace:\n");
+	print_stack_trace(&trace, 4);	/* NOTE(review): 4 is presumably the indentation width - confirm */
+}
+
+/*
+ * Look-up a memory block metadata (kmemleak_object) in the object search
+ * tree based on a pointer value.
If alias is 0, only values pointing to the
+ * beginning of the memory block are allowed. The kmemleak_lock must be held
+ * when calling this function.
+ */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+	struct rb_node *rb = object_tree_root.rb_node;
+
+	while (rb) {
+		struct kmemleak_object *object =
+			rb_entry(rb, struct kmemleak_object, rb_node);
+		if (ptr < object->pointer)
+			rb = object->rb_node.rb_left;	/* ptr lies below this block */
+		else if (object->pointer + object->size <= ptr)
+			rb = object->rb_node.rb_right;	/* ptr lies at or past the end of this block */
+		else if (object->pointer == ptr || alias)
+			return object;	/* exact start, or an interior pointer with alias set */
+		else {
+			/* interior pointer but alias == 0: warn, dump the object and return NULL */
+			kmemleak_warn("Found object by alias at 0x%08lx\n",
+				ptr);
+			dump_object_info(object);
+			break;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
+ * that once an object's use_count reached 0, the RCU freeing was already
+ * registered and the object should no longer be used. This function must be
+ * called under the protection of rcu_read_lock().
+ */
+static int get_object(struct kmemleak_object *object)
+{
+	return atomic_inc_not_zero(&object->use_count);
+}
+
+/*
+ * RCU callback to free a kmemleak_object together with its scan areas.
+ */
+static void free_object_rcu(struct rcu_head *rcu)
+{
+	struct hlist_node *tmp;
+	struct kmemleak_scan_area *area;
+	struct kmemleak_object *object =
+		container_of(rcu, struct kmemleak_object, rcu);
+
+	/*
+	 * Once use_count is 0 (guaranteed by put_object), there is no other
+	 * code accessing this object, hence no need for locking.
+	 */
+	hlist_for_each_entry_safe(area, tmp, &object->area_list, node) {
+		hlist_del(&area->node);
+		kmem_cache_free(scan_area_cache, area);
+	}
+	kmem_cache_free(object_cache, object);	/* the object itself goes last, after its areas */
+}
+
+/*
+ * Decrement the object use_count. Once the count is 0, free the object using
+ * an RCU callback. Since put_object() may be called via the kmemleak_free() ->
+ * delete_object() path, the delayed RCU freeing ensures that there is no
+ * recursive call to the kernel allocator.
Lock-less RCU object_list traversal + * is also possible. + */ +static void put_object(struct kmemleak_object *object) +{ + if (!atomic_dec_and_test(&object->use_count)) + return; + + /* should only get here after delete_object was called */ + WARN_ON(object->flags & OBJECT_ALLOCATED); + + call_rcu(&object->rcu, free_object_rcu); +} + +/* + * Look up an object in the object search tree and increase its use_count. + */ +static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) +{ + unsigned long flags; + struct kmemleak_object *object = NULL; + + rcu_read_lock(); + read_lock_irqsave(&kmemleak_lock, flags); + if (ptr >= min_addr && ptr < max_addr) + object = lookup_object(ptr, alias); + read_unlock_irqrestore(&kmemleak_lock, flags); + + /* check whether the object is still available */ + if (object && !get_object(object)) + object = NULL; + rcu_read_unlock(); + + return object; +} + +/* + * Save stack trace to the given array of MAX_TRACE size. + */ +static int __save_stack_trace(unsigned long *trace) +{ + struct stack_trace stack_trace; + + stack_trace.max_entries = MAX_TRACE; + stack_trace.nr_entries = 0; + stack_trace.entries = trace; + stack_trace.skip = 2; + save_stack_trace(&stack_trace); + + return stack_trace.nr_entries; +} + +/* + * Create the metadata (struct kmemleak_object) corresponding to an allocated + * memory block and add it to the object_list and object_tree_root. 
+ */ +static struct kmemleak_object *create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) +{ + unsigned long flags; + struct kmemleak_object *object, *parent; + struct rb_node **link, *rb_parent; + + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); + if (!object) { + pr_warning("Cannot allocate a kmemleak_object structure\n"); + kmemleak_disable(); + return NULL; + } + + INIT_LIST_HEAD(&object->object_list); + INIT_LIST_HEAD(&object->gray_list); + INIT_HLIST_HEAD(&object->area_list); + spin_lock_init(&object->lock); + atomic_set(&object->use_count, 1); + object->flags = OBJECT_ALLOCATED; + object->pointer = ptr; + object->size = size; + object->min_count = min_count; + object->count = 0; /* white color initially */ + object->jiffies = jiffies; + object->checksum = 0; + + /* task information */ + if (in_irq()) { + object->pid = 0; + strncpy(object->comm, "hardirq", sizeof(object->comm)); + } else if (in_softirq()) { + object->pid = 0; + strncpy(object->comm, "softirq", sizeof(object->comm)); + } else { + object->pid = current->pid; + /* + * There is a small chance of a race with set_task_comm(), + * however using get_task_comm() here may cause locking + * dependency issues with current->alloc_lock. In the worst + * case, the command line is not correct. 
+ */ + strncpy(object->comm, current->comm, sizeof(object->comm)); + } + + /* kernel backtrace */ + object->trace_len = __save_stack_trace(object->trace); + + write_lock_irqsave(&kmemleak_lock, flags); + + min_addr = min(min_addr, ptr); + max_addr = max(max_addr, ptr + size); + link = &object_tree_root.rb_node; + rb_parent = NULL; + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); + if (ptr + size <= parent->pointer) + link = &parent->rb_node.rb_left; + else if (parent->pointer + parent->size <= ptr) + link = &parent->rb_node.rb_right; + else { + kmemleak_stop("Cannot insert 0x%lx into the object " + "search tree (overlaps existing)\n", + ptr); + kmem_cache_free(object_cache, object); + object = parent; + spin_lock(&object->lock); + dump_object_info(object); + spin_unlock(&object->lock); + goto out; + } + } + rb_link_node(&object->rb_node, rb_parent, link); + rb_insert_color(&object->rb_node, &object_tree_root); + + list_add_tail_rcu(&object->object_list, &object_list); +out: + write_unlock_irqrestore(&kmemleak_lock, flags); + return object; +} + +/* + * Remove the metadata (struct kmemleak_object) for a memory block from the + * object_list and object_tree_root and decrement its use_count. + */ +static void __delete_object(struct kmemleak_object *object) +{ + unsigned long flags; + + write_lock_irqsave(&kmemleak_lock, flags); + rb_erase(&object->rb_node, &object_tree_root); + list_del_rcu(&object->object_list); + write_unlock_irqrestore(&kmemleak_lock, flags); + + WARN_ON(!(object->flags & OBJECT_ALLOCATED)); + WARN_ON(atomic_read(&object->use_count) < 2); + + /* + * Locking here also ensures that the corresponding memory block + * cannot be freed when it is being scanned. 
+ */ + spin_lock_irqsave(&object->lock, flags); + object->flags &= ~OBJECT_ALLOCATED; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. + */ +static void delete_object_full(unsigned long ptr) +{ + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Freeing unknown object at 0x%08lx\n", + ptr); +#endif + return; + } + __delete_object(object); + put_object(object); +} + +/* + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. If the memory block is partially freed, the function may create + * additional metadata for the remaining parts of the block. + */ +static void delete_object_part(unsigned long ptr, size_t size) +{ + struct kmemleak_object *object; + unsigned long start, end; + + object = find_and_get_object(ptr, 1); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Partially freeing unknown object at 0x%08lx " + "(size %zu)\n", ptr, size); +#endif + return; + } + __delete_object(object); + + /* + * Create one or two objects that may result from the memory block + * split. Note that partial freeing is only done by free_bootmem() and + * this happens before kmemleak_init() is called. The path below is + * only executed during early log recording in kmemleak_init(), so + * GFP_KERNEL is enough. 
+ */ + start = object->pointer; + end = object->pointer + object->size; + if (ptr > start) + create_object(start, ptr - start, object->min_count, + GFP_KERNEL); + if (ptr + size < end) + create_object(ptr + size, end - ptr - size, object->min_count, + GFP_KERNEL); + + put_object(object); +} + +static void __paint_it(struct kmemleak_object *object, int color) +{ + object->min_count = color; + if (color == KMEMLEAK_BLACK) + object->flags |= OBJECT_NO_SCAN; +} + +static void paint_it(struct kmemleak_object *object, int color) +{ + unsigned long flags; + + spin_lock_irqsave(&object->lock, flags); + __paint_it(object, color); + spin_unlock_irqrestore(&object->lock, flags); +} + +static void paint_ptr(unsigned long ptr, int color) +{ + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Trying to color unknown object " + "at 0x%08lx as %s\n", ptr, + (color == KMEMLEAK_GREY) ? "Grey" : + (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); + return; + } + paint_it(object, color); + put_object(object); +} + +/* + * Mark an object permanently as gray-colored so that it can no longer be + * reported as a leak. This is used in general to mark a false positive. + */ +static void make_gray_object(unsigned long ptr) +{ + paint_ptr(ptr, KMEMLEAK_GREY); +} + +/* + * Mark the object as black-colored so that it is ignored from scans and + * reporting. + */ +static void make_black_object(unsigned long ptr) +{ + paint_ptr(ptr, KMEMLEAK_BLACK); +} + +/* + * Add a scanning area to the object. If at least one such area is added, + * kmemleak will only scan these ranges rather than the whole memory block. 
+ */ +static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) +{ + unsigned long flags; + struct kmemleak_object *object; + struct kmemleak_scan_area *area; + + object = find_and_get_object(ptr, 1); + if (!object) { + kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", + ptr); + return; + } + + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); + if (!area) { + pr_warning("Cannot allocate a scan area\n"); + goto out; + } + + spin_lock_irqsave(&object->lock, flags); + if (size == SIZE_MAX) { + size = object->pointer + object->size - ptr; + } else if (ptr + size > object->pointer + object->size) { + kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); + dump_object_info(object); + kmem_cache_free(scan_area_cache, area); + goto out_unlock; + } + + INIT_HLIST_NODE(&area->node); + area->start = ptr; + area->size = size; + + hlist_add_head(&area->node, &object->area_list); +out_unlock: + spin_unlock_irqrestore(&object->lock, flags); +out: + put_object(object); +} + +/* + * Set the OBJECT_NO_SCAN flag for the object corresponding to the give + * pointer. Such object will not be scanned by kmemleak but references to it + * are searched. + */ +static void object_no_scan(unsigned long ptr) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->flags |= OBJECT_NO_SCAN; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* + * Log an early kmemleak_* call to the early_log buffer. These calls will be + * processed later once kmemleak is fully initialized. 
+ */ +static void __init log_early(int op_type, const void *ptr, size_t size, + int min_count) +{ + unsigned long flags; + struct early_log *log; + + if (kmemleak_error) { + /* kmemleak stopped recording, just count the requests */ + crt_early_log++; + return; + } + + if (crt_early_log >= ARRAY_SIZE(early_log)) { + kmemleak_disable(); + return; + } + + /* + * There is no need for locking since the kernel is still in UP mode + * at this stage. Disabling the IRQs is enough. + */ + local_irq_save(flags); + log = &early_log[crt_early_log]; + log->op_type = op_type; + log->ptr = ptr; + log->size = size; + log->min_count = min_count; + log->trace_len = __save_stack_trace(log->trace); + crt_early_log++; + local_irq_restore(flags); +} + +/* + * Log an early allocated block and populate the stack trace. + */ +static void early_alloc(struct early_log *log) +{ + struct kmemleak_object *object; + unsigned long flags; + int i; + + if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr)) + return; + + /* + * RCU locking needed to ensure object is not freed via put_object(). + */ + rcu_read_lock(); + object = create_object((unsigned long)log->ptr, log->size, + log->min_count, GFP_ATOMIC); + if (!object) + goto out; + spin_lock_irqsave(&object->lock, flags); + for (i = 0; i < log->trace_len; i++) + object->trace[i] = log->trace[i]; + object->trace_len = log->trace_len; + spin_unlock_irqrestore(&object->lock, flags); +out: + rcu_read_unlock(); +} + +/* + * Log an early allocated block and populate the stack trace. + */ +static void early_alloc_percpu(struct early_log *log) +{ + unsigned int cpu; + const void __percpu *ptr = log->ptr; + + for_each_possible_cpu(cpu) { + log->ptr = per_cpu_ptr(ptr, cpu); + early_alloc(log); + } +} + +/** + * kmemleak_alloc - register a newly allocated object + * @ptr: pointer to beginning of the object + * @size: size of the object + * @min_count: minimum number of references to this object. 
If during memory + * scanning a number of references less than @min_count is found, + * the object is reported as a memory leak. If @min_count is 0, + * the object is never reported as a leak. If @min_count is -1, + * the object is ignored (not scanned and not reported as a leak) + * @gfp: kmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the kernel allocators when a new object + * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). + */ +void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, + gfp_t gfp) +{ + pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); + + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + create_object((unsigned long)ptr, size, min_count, gfp); + else if (kmemleak_early_log) + log_early(KMEMLEAK_ALLOC, ptr, size, min_count); +} +EXPORT_SYMBOL_GPL(kmemleak_alloc); + +/** + * kmemleak_alloc_percpu - register a newly allocated __percpu object + * @ptr: __percpu pointer to beginning of the object + * @size: size of the object + * @gfp: flags used for kmemleak internal memory allocations + * + * This function is called from the kernel percpu allocator when a new object + * (memory block) is allocated (alloc_percpu). + */ +void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, + gfp_t gfp) +{ + unsigned int cpu; + + pr_debug("%s(0x%p, %zu)\n", __func__, ptr, size); + + /* + * Percpu allocations are only scanned and not reported as leaks + * (min_count is set to 0). 
+ */ + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + for_each_possible_cpu(cpu) + create_object((unsigned long)per_cpu_ptr(ptr, cpu), + size, 0, gfp); + else if (kmemleak_early_log) + log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); + +/** + * kmemleak_free - unregister a previously registered object + * @ptr: pointer to beginning of the object + * + * This function is called from the kernel allocators when an object (memory + * block) is freed (kmem_cache_free, kfree, vfree etc.). + */ +void __ref kmemleak_free(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) + delete_object_full((unsigned long)ptr); + else if (kmemleak_early_log) + log_early(KMEMLEAK_FREE, ptr, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free); + +/** + * kmemleak_free_part - partially unregister a previously registered object + * @ptr: pointer to the beginning or inside the object. This also + * represents the start of the range to be freed + * @size: size to be unregistered + * + * This function is called when only a part of a memory block is freed + * (usually from the bootmem allocator). + */ +void __ref kmemleak_free_part(const void *ptr, size_t size) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + delete_object_part((unsigned long)ptr, size); + else if (kmemleak_early_log) + log_early(KMEMLEAK_FREE_PART, ptr, size, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free_part); + +/** + * kmemleak_free_percpu - unregister a previously registered __percpu object + * @ptr: __percpu pointer to beginning of the object + * + * This function is called from the kernel percpu allocator when an object + * (memory block) is freed (free_percpu). 
+ */ +void __ref kmemleak_free_percpu(const void __percpu *ptr) +{ + unsigned int cpu; + + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) + for_each_possible_cpu(cpu) + delete_object_full((unsigned long)per_cpu_ptr(ptr, + cpu)); + else if (kmemleak_early_log) + log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free_percpu); + +/** + * kmemleak_update_trace - update object allocation stack trace + * @ptr: pointer to beginning of the object + * + * Override the object allocation stack trace for cases where the actual + * allocation place is not always useful. + */ +void __ref kmemleak_update_trace(const void *ptr) +{ + struct kmemleak_object *object; + unsigned long flags; + + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr)) + return; + + object = find_and_get_object((unsigned long)ptr, 1); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Updating stack trace for unknown object at %p\n", + ptr); +#endif + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->trace_len = __save_stack_trace(object->trace); + spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); +} +EXPORT_SYMBOL(kmemleak_update_trace); + +/** + * kmemleak_not_leak - mark an allocated object as false positive + * @ptr: pointer to beginning of the object + * + * Calling this function on an object will cause the memory block to no longer + * be reported as leak and always be scanned. 
+ */ +void __ref kmemleak_not_leak(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + make_gray_object((unsigned long)ptr); + else if (kmemleak_early_log) + log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); +} +EXPORT_SYMBOL(kmemleak_not_leak); + +/** + * kmemleak_ignore - ignore an allocated object + * @ptr: pointer to beginning of the object + * + * Calling this function on an object will cause the memory block to be + * ignored (not scanned and not reported as a leak). This is usually done when + * it is known that the corresponding block is not a leak and does not contain + * any references to other allocated memory blocks. + */ +void __ref kmemleak_ignore(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + make_black_object((unsigned long)ptr); + else if (kmemleak_early_log) + log_early(KMEMLEAK_IGNORE, ptr, 0, 0); +} +EXPORT_SYMBOL(kmemleak_ignore); + +/** + * kmemleak_scan_area - limit the range to be scanned in an allocated object + * @ptr: pointer to beginning or inside the object. This also + * represents the start of the scan area + * @size: size of the scan area + * @gfp: kmalloc() flags used for kmemleak internal memory allocations + * + * This function is used when it is known that only certain parts of an object + * contain references to other objects. Kmemleak will only scan these areas + * reducing the number false negatives. 
+ */ +void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) + add_scan_area((unsigned long)ptr, size, gfp); + else if (kmemleak_early_log) + log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); +} +EXPORT_SYMBOL(kmemleak_scan_area); + +/** + * kmemleak_no_scan - do not scan an allocated object + * @ptr: pointer to beginning of the object + * + * This function notifies kmemleak not to scan the given memory block. Useful + * in situations where it is known that the given object does not contain any + * references to other objects. Kmemleak will not scan such objects reducing + * the number of false negatives. + */ +void __ref kmemleak_no_scan(const void *ptr) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + object_no_scan((unsigned long)ptr); + else if (kmemleak_early_log) + log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); +} +EXPORT_SYMBOL(kmemleak_no_scan); + +/* + * Update an object's checksum and return true if it was modified. + */ +static bool update_checksum(struct kmemleak_object *object) +{ + u32 old_csum = object->checksum; + + if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) + return false; + + kasan_disable_current(); + object->checksum = crc32(0, (void *)object->pointer, object->size); + kasan_enable_current(); + + return object->checksum != old_csum; +} + +/* + * Memory scanning is a long process and it needs to be interruptable. This + * function checks whether such interrupt condition occurred. + */ +static int scan_should_stop(void) +{ + if (!kmemleak_enabled) + return 1; + + /* + * This function may be called from either process or kthread context, + * hence the need to check for both stop conditions. 
+ */ + if (current->mm) + return signal_pending(current); + else + return kthread_should_stop(); + + return 0; +} + +/* + * Scan a memory block (exclusive range) for valid pointers and add those + * found to the gray list. + */ +static void scan_block(void *_start, void *_end, + struct kmemleak_object *scanned, int allow_resched) +{ + unsigned long *ptr; + unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); + unsigned long *end = _end - (BYTES_PER_POINTER - 1); + + for (ptr = start; ptr < end; ptr++) { + struct kmemleak_object *object; + unsigned long flags; + unsigned long pointer; + + if (allow_resched) + cond_resched(); + if (scan_should_stop()) + break; + + /* don't scan uninitialized memory */ + if (!kmemcheck_is_obj_initialized((unsigned long)ptr, + BYTES_PER_POINTER)) + continue; + + kasan_disable_current(); + pointer = *ptr; + kasan_enable_current(); + + object = find_and_get_object(pointer, 1); + if (!object) + continue; + if (object == scanned) { + /* self referenced, ignore */ + put_object(object); + continue; + } + + /* + * Avoid the lockdep recursive warning on object->lock being + * previously acquired in scan_object(). These locks are + * enclosed by scan_mutex. + */ + spin_lock_irqsave_nested(&object->lock, flags, + SINGLE_DEPTH_NESTING); + if (!color_white(object)) { + /* non-orphan, ignored or new */ + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); + continue; + } + + /* + * Increase the object's reference count (number of pointers + * to the memory block). If this count reaches the required + * minimum, the object's color will become gray and it will be + * added to the gray_list. + */ + object->count++; + if (color_gray(object)) { + list_add_tail(&object->gray_list, &gray_list); + spin_unlock_irqrestore(&object->lock, flags); + continue; + } + + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); + } +} + +/* + * Scan a memory block corresponding to a kmemleak_object. 
A condition is + * that object->use_count >= 1. + */ +static void scan_object(struct kmemleak_object *object) +{ + struct kmemleak_scan_area *area; + unsigned long flags; + + /* + * Once the object->lock is acquired, the corresponding memory block + * cannot be freed (the same lock is acquired in delete_object). + */ + spin_lock_irqsave(&object->lock, flags); + if (object->flags & OBJECT_NO_SCAN) + goto out; + if (!(object->flags & OBJECT_ALLOCATED)) + /* already freed object */ + goto out; + if (hlist_empty(&object->area_list)) { + void *start = (void *)object->pointer; + void *end = (void *)(object->pointer + object->size); + + while (start < end && (object->flags & OBJECT_ALLOCATED) && + !(object->flags & OBJECT_NO_SCAN)) { + scan_block(start, min(start + MAX_SCAN_SIZE, end), + object, 0); + start += MAX_SCAN_SIZE; + + spin_unlock_irqrestore(&object->lock, flags); + cond_resched(); + spin_lock_irqsave(&object->lock, flags); + } + } else + hlist_for_each_entry(area, &object->area_list, node) + scan_block((void *)area->start, + (void *)(area->start + area->size), + object, 0); +out: + spin_unlock_irqrestore(&object->lock, flags); +} + +/* + * Scan the objects already referenced (gray objects). More objects will be + * referenced and, if there are no memory leaks, all the objects are scanned. + */ +static void scan_gray_list(void) +{ + struct kmemleak_object *object, *tmp; + + /* + * The list traversal is safe for both tail additions and removals + * from inside the loop. The kmemleak objects cannot be freed from + * outside the loop because their use_count was incremented. 
+ */ + object = list_entry(gray_list.next, typeof(*object), gray_list); + while (&object->gray_list != &gray_list) { + cond_resched(); + + /* may add new objects to the list */ + if (!scan_should_stop()) + scan_object(object); + + tmp = list_entry(object->gray_list.next, typeof(*object), + gray_list); + + /* remove the object from the list and release it */ + list_del(&object->gray_list); + put_object(object); + + object = tmp; + } + WARN_ON(!list_empty(&gray_list)); +} + +/* + * Scan data sections and all the referenced memory blocks allocated via the + * kernel's standard allocators. This function must be called with the + * scan_mutex held. + */ +static void kmemleak_scan(void) +{ + unsigned long flags; + struct kmemleak_object *object; + int i; + int new_leaks = 0; + + jiffies_last_scan = jiffies; + + /* prepare the kmemleak_object's */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); +#ifdef DEBUG + /* + * With a few exceptions there should be a maximum of + * 1 reference to any object at this point. + */ + if (atomic_read(&object->use_count) > 1) { + pr_debug("object->use_count = %d\n", + atomic_read(&object->use_count)); + dump_object_info(object); + } +#endif + /* reset the reference count (whiten the object) */ + object->count = 0; + if (color_gray(object) && get_object(object)) + list_add_tail(&object->gray_list, &gray_list); + + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + /* data/bss scanning */ + scan_block(_sdata, _edata, NULL, 1); + scan_block(__bss_start, __bss_stop, NULL, 1); + +#ifdef CONFIG_SMP + /* per-cpu sections scanning */ + for_each_possible_cpu(i) + scan_block(__per_cpu_start + per_cpu_offset(i), + __per_cpu_end + per_cpu_offset(i), NULL, 1); +#endif + + /* + * Struct page scanning for each node. 
+ */ + get_online_mems(); + for_each_online_node(i) { + unsigned long start_pfn = node_start_pfn(i); + unsigned long end_pfn = node_end_pfn(i); + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct page *page; + + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + /* only scan if page is in use */ + if (page_count(page) == 0) + continue; + scan_block(page, page + 1, NULL, 1); + } + } + put_online_mems(); + + /* + * Scanning the task stacks (may introduce false negatives). + */ + if (kmemleak_stack_scan) { + struct task_struct *p, *g; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + scan_block(task_stack_page(p), task_stack_page(p) + + THREAD_SIZE, NULL, 0); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } + + /* + * Scan the objects already referenced from the sections scanned + * above. + */ + scan_gray_list(); + + /* + * Check for new or unreferenced objects modified since the previous + * scan and color them gray until the next scan. + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if (color_white(object) && (object->flags & OBJECT_ALLOCATED) + && update_checksum(object) && get_object(object)) { + /* color it gray temporarily */ + object->count = object->min_count; + list_add_tail(&object->gray_list, &gray_list); + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + /* + * Re-scan the gray list for modified unreferenced objects. + */ + scan_gray_list(); + + /* + * If scanning was stopped do not report any new unreferenced objects. + */ + if (scan_should_stop()) + return; + + /* + * Scanning result reporting. 
+ */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if (unreferenced_object(object) && + !(object->flags & OBJECT_REPORTED)) { + object->flags |= OBJECT_REPORTED; + new_leaks++; + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + if (new_leaks) { + kmemleak_found_leaks = true; + + pr_info("%d new suspected memory leaks (see " + "/sys/kernel/debug/kmemleak)\n", new_leaks); + } + +} + +/* + * Thread function performing automatic memory scanning. Unreferenced objects + * at the end of a memory scan are reported but only the first time. + */ +static int kmemleak_scan_thread(void *arg) +{ + static int first_run = 1; + + pr_info("Automatic memory scanning thread started\n"); + set_user_nice(current, 10); + + /* + * Wait before the first scan to allow the system to fully initialize. + */ + if (first_run) { + first_run = 0; + ssleep(SECS_FIRST_SCAN); + } + + while (!kthread_should_stop()) { + signed long timeout = jiffies_scan_wait; + + mutex_lock(&scan_mutex); + kmemleak_scan(); + mutex_unlock(&scan_mutex); + + /* wait before the next scan */ + while (timeout && !kthread_should_stop()) + timeout = schedule_timeout_interruptible(timeout); + } + + pr_info("Automatic memory scanning thread ended\n"); + + return 0; +} + +/* + * Start the automatic memory scanning thread. This function must be called + * with the scan_mutex held. + */ +static void start_scan_thread(void) +{ + if (scan_thread) + return; + scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); + if (IS_ERR(scan_thread)) { + pr_warning("Failed to create the scan thread\n"); + scan_thread = NULL; + } +} + +/* + * Stop the automatic memory scanning thread. This function must be called + * with the scan_mutex held. 
+ */ +static void stop_scan_thread(void) +{ + if (scan_thread) { + kthread_stop(scan_thread); + scan_thread = NULL; + } +} + +/* + * Iterate over the object_list and return the first valid object at or after + * the required position with its use_count incremented. The function triggers + * a memory scanning when the pos argument points to the first position. + */ +static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct kmemleak_object *object; + loff_t n = *pos; + int err; + + err = mutex_lock_interruptible(&scan_mutex); + if (err < 0) + return ERR_PTR(err); + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + if (n-- > 0) + continue; + if (get_object(object)) + goto out; + } + object = NULL; +out: + return object; +} + +/* + * Return the next object in the object_list. The function decrements the + * use_count of the previous object and increases that of the next one. + */ +static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct kmemleak_object *prev_obj = v; + struct kmemleak_object *next_obj = NULL; + struct kmemleak_object *obj = prev_obj; + + ++(*pos); + + list_for_each_entry_continue_rcu(obj, &object_list, object_list) { + if (get_object(obj)) { + next_obj = obj; + break; + } + } + + put_object(prev_obj); + return next_obj; +} + +/* + * Decrement the use_count of the last object required, if any. + */ +static void kmemleak_seq_stop(struct seq_file *seq, void *v) +{ + if (!IS_ERR(v)) { + /* + * kmemleak_seq_start may return ERR_PTR if the scan_mutex + * waiting was interrupted, so only release it if !IS_ERR. + */ + rcu_read_unlock(); + mutex_unlock(&scan_mutex); + if (v) + put_object(v); + } +} + +/* + * Print the information for an unreferenced object to the seq file. 
+ */ +static int kmemleak_seq_show(struct seq_file *seq, void *v) +{ + struct kmemleak_object *object = v; + unsigned long flags; + + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) + print_unreferenced(seq, object); + spin_unlock_irqrestore(&object->lock, flags); + return 0; +} + +static const struct seq_operations kmemleak_seq_ops = { + .start = kmemleak_seq_start, + .next = kmemleak_seq_next, + .stop = kmemleak_seq_stop, + .show = kmemleak_seq_show, +}; + +static int kmemleak_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &kmemleak_seq_ops); +} + +static int dump_str_object_info(const char *str) +{ + unsigned long flags; + struct kmemleak_object *object; + unsigned long addr; + + if (kstrtoul(str, 0, &addr)) + return -EINVAL; + object = find_and_get_object(addr, 0); + if (!object) { + pr_info("Unknown object at 0x%08lx\n", addr); + return -EINVAL; + } + + spin_lock_irqsave(&object->lock, flags); + dump_object_info(object); + spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); + return 0; +} + +/* + * We use grey instead of black to ensure we can do future scans on the same + * objects. If we did not do future scans these black objects could + * potentially contain references to newly allocated objects in the future and + * we'd end up with false positives. + */ +static void kmemleak_clear(void) +{ + struct kmemleak_object *object; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_REPORTED) && + unreferenced_object(object)) + __paint_it(object, KMEMLEAK_GREY); + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + kmemleak_found_leaks = false; +} + +static void __kmemleak_do_cleanup(void); + +/* + * File write operation to configure kmemleak at run-time. 
The following + * commands can be written to the /sys/kernel/debug/kmemleak file: + * off - disable kmemleak (irreversible) + * stack=on - enable the task stacks scanning + * stack=off - disable the tasks stacks scanning + * scan=on - start the automatic memory scanning thread + * scan=off - stop the automatic memory scanning thread + * scan=... - set the automatic memory scanning period in seconds (0 to + * disable it) + * scan - trigger a memory scan + * clear - mark all current reported unreferenced kmemleak objects as + * grey to ignore printing them, or free all kmemleak objects + * if kmemleak has been disabled. + * dump=... - dump information about the object found at the given address + */ +static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, + size_t size, loff_t *ppos) +{ + char buf[64]; + int buf_size; + int ret; + + buf_size = min(size, (sizeof(buf) - 1)); + if (strncpy_from_user(buf, user_buf, buf_size) < 0) + return -EFAULT; + buf[buf_size] = 0; + + ret = mutex_lock_interruptible(&scan_mutex); + if (ret < 0) + return ret; + + if (strncmp(buf, "clear", 5) == 0) { + if (kmemleak_enabled) + kmemleak_clear(); + else + __kmemleak_do_cleanup(); + goto out; + } + + if (!kmemleak_enabled) { + ret = -EBUSY; + goto out; + } + + if (strncmp(buf, "off", 3) == 0) + kmemleak_disable(); + else if (strncmp(buf, "stack=on", 8) == 0) + kmemleak_stack_scan = 1; + else if (strncmp(buf, "stack=off", 9) == 0) + kmemleak_stack_scan = 0; + else if (strncmp(buf, "scan=on", 7) == 0) + start_scan_thread(); + else if (strncmp(buf, "scan=off", 8) == 0) + stop_scan_thread(); + else if (strncmp(buf, "scan=", 5) == 0) { + unsigned long secs; + + ret = kstrtoul(buf + 5, 0, &secs); + if (ret < 0) + goto out; + stop_scan_thread(); + if (secs) { + jiffies_scan_wait = msecs_to_jiffies(secs * 1000); + start_scan_thread(); + } + } else if (strncmp(buf, "scan", 4) == 0) + kmemleak_scan(); + else if (strncmp(buf, "dump=", 5) == 0) + ret = dump_str_object_info(buf + 
5); + else + ret = -EINVAL; + +out: + mutex_unlock(&scan_mutex); + if (ret < 0) + return ret; + + /* ignore the rest of the buffer, only one command at a time */ + *ppos += size; + return size; +} + +static const struct file_operations kmemleak_fops = { + .owner = THIS_MODULE, + .open = kmemleak_open, + .read = seq_read, + .write = kmemleak_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void __kmemleak_do_cleanup(void) +{ + struct kmemleak_object *object; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) + delete_object_full(object->pointer); + rcu_read_unlock(); +} + +/* + * Stop the memory scanning thread and free the kmemleak internal objects if + * no previous scan thread (otherwise, kmemleak may still have some useful + * information on memory leaks). + */ +static void kmemleak_do_cleanup(struct work_struct *work) +{ + mutex_lock(&scan_mutex); + stop_scan_thread(); + + /* + * Once the scan thread has stopped, it is safe to no longer track + * object freeing. Ordering of the scan thread stopping and the memory + * accesses below is guaranteed by the kthread_stop() function. + */ + kmemleak_free_enabled = 0; + + if (!kmemleak_found_leaks) + __kmemleak_do_cleanup(); + else + pr_info("Kmemleak disabled without freeing internal data. " + "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); + mutex_unlock(&scan_mutex); +} + +static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); + +/* + * Disable kmemleak. No memory allocation/freeing will be traced once this + * function is called. Disabling kmemleak is an irreversible operation. 
+ */
+static void kmemleak_disable(void)
+{
+	/* atomically check whether it was already invoked */
+	if (cmpxchg(&kmemleak_error, 0, 1))
+		return;
+
+	/* stop any memory operation tracing */
+	kmemleak_enabled = 0;
+
+	/* check whether it is too early for a kernel thread */
+	if (kmemleak_initialized)
+		schedule_work(&cleanup_work);
+	else
+		kmemleak_free_enabled = 0;	/* kmemleak_late_init() schedules the cleanup later */
+
+	pr_info("Kernel memory leak detector disabled\n");
+}
+
+/*
+ * Allow boot-time kmemleak disabling (enabled by default).
+ *
+ * Handler for the "kmemleak" early parameter: "off" disables kmemleak, "on"
+ * sets kmemleak_skip_disable (consulted by kmemleak_init() when
+ * CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is set). A missing or any other value
+ * yields -EINVAL; otherwise 0 is returned.
+ */
+static int kmemleak_boot_config(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (strcmp(str, "off") == 0)
+		kmemleak_disable();
+	else if (strcmp(str, "on") == 0)
+		kmemleak_skip_disable = 1;
+	else
+		return -EINVAL;
+	return 0;
+}
+early_param("kmemleak", kmemleak_boot_config);
+
+static void __init print_log_trace(struct early_log *log)
+{
+	struct stack_trace trace;
+
+	/* wrap the trace saved in the early log entry; nothing is copied */
+	trace.nr_entries = log->trace_len;
+	trace.entries = log->trace;
+
+	pr_notice("Early log backtrace:\n");
+	print_stack_trace(&trace, 2);	/* 2: presumably the indentation - TODO confirm */
+}
+
+/*
+ * Kmemleak initialization.
+ */
+void __init kmemleak_init(void)
+{
+	int i;
+	unsigned long flags;
+
+#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
+	if (!kmemleak_skip_disable) {	/* no "kmemleak=on" on the command line */
+		kmemleak_early_log = 0;
+		kmemleak_disable();
+		return;
+	}
+#endif
+
+	jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
+	jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
+
+	object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
+	scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
+
+	if (crt_early_log >= ARRAY_SIZE(early_log))
+		pr_warning("Early log buffer exceeded (%d), please increase "
+			   "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
+
+	/* the kernel is still in UP mode, so disabling the IRQs is enough */
+	local_irq_save(flags);
+	kmemleak_early_log = 0;
+	if (kmemleak_error) {	/* disabled earlier: leave the early log unreplayed */
+		local_irq_restore(flags);
+		return;
+	} else {
+		kmemleak_enabled = 1;
+		kmemleak_free_enabled = 1;
+	}
+	local_irq_restore(flags);
+
+	/*
+	 * This is the point where tracking allocations is safe. Automatic
+	 * scanning is started during the late initcall. Add the early logged
+	 * callbacks to the kmemleak infrastructure.
+	 */
+	for (i = 0; i < crt_early_log; i++) {
+		struct early_log *log = &early_log[i];
+
+		switch (log->op_type) {	/* replay each entry through the regular entry point */
+		case KMEMLEAK_ALLOC:
+			early_alloc(log);
+			break;
+		case KMEMLEAK_ALLOC_PERCPU:
+			early_alloc_percpu(log);
+			break;
+		case KMEMLEAK_FREE:
+			kmemleak_free(log->ptr);
+			break;
+		case KMEMLEAK_FREE_PART:
+			kmemleak_free_part(log->ptr, log->size);
+			break;
+		case KMEMLEAK_FREE_PERCPU:
+			kmemleak_free_percpu(log->ptr);
+			break;
+		case KMEMLEAK_NOT_LEAK:
+			kmemleak_not_leak(log->ptr);
+			break;
+		case KMEMLEAK_IGNORE:
+			kmemleak_ignore(log->ptr);
+			break;
+		case KMEMLEAK_SCAN_AREA:
+			kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
+			break;
+		case KMEMLEAK_NO_SCAN:
+			kmemleak_no_scan(log->ptr);
+			break;
+		default:
+			kmemleak_warn("Unknown early log operation: %d\n",
+				      log->op_type);
+		}
+
+		if (kmemleak_warning) {	/* the replay above warned: show where the entry came from */
+			print_log_trace(log);
+			kmemleak_warning = 0;
+		}
+	}
+}
+
+/*
+ * Late initialization function.
+ *
+ * Marks kmemleak as initialized, creates the debugfs "kmemleak" file and
+ * starts the scan thread under scan_mutex. If kmemleak was disabled before
+ * this point, only the cleanup work is scheduled and -ENOMEM is returned.
+ * A failure to create the debugfs file is logged but is not fatal.
+ */
+static int __init kmemleak_late_init(void)
+{
+	struct dentry *dentry;
+
+	kmemleak_initialized = 1;
+
+	if (kmemleak_error) {
+		/*
+		 * Some error occurred and kmemleak was disabled. There is a
+		 * small chance that kmemleak_disable() was called immediately
+		 * after setting kmemleak_initialized and we may end up with
+		 * two clean-up threads but serialized by scan_mutex.
+		 */
+		schedule_work(&cleanup_work);
+		return -ENOMEM;
+	}
+
+	/* NOTE(review): created S_IRUGO although kmemleak_fops has a .write handler - confirm */
+	dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
+				     &kmemleak_fops);
+	if (!dentry)
+		pr_warning("Failed to create the debugfs kmemleak file\n");
+	mutex_lock(&scan_mutex);
+	start_scan_thread();
+	mutex_unlock(&scan_mutex);
+
+	pr_info("Kernel memory leak detector initialized\n");
+
+	return 0;
+}
+late_initcall(kmemleak_late_init);
diff --git a/kernel/mm/ksm.c b/kernel/mm/ksm.c
new file mode 100644
index 000000000..7ee101eaa
--- /dev/null
+++ b/kernel/mm/ksm.c
@@ -0,0 +1,2341 @@
+/*
+ * Memory merging support.
+ *
+ * This code enables dynamic sharing of identical pages found in different
+ * memory areas, even if they are not shared by fork()
+ *
+ * Copyright (C) 2008-2009 Red Hat, Inc.
+ * Authors:
+ *	Izik Eidus
+ *	Andrea Arcangeli
+ *	Chris Wright
+ *	Hugh Dickins
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include	/* NOTE(review): the <header> operand of this and the following #include lines is missing in this copy - restore from the pristine file */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include "internal.h"
+
+#ifdef CONFIG_NUMA
+#define NUMA(x)		(x)	/* NUMA build: the expression is used as written */
+#define DO_NUMA(x)	do { (x); } while (0)
+#else
+#define NUMA(x)		(0)	/* non-NUMA build: x is not evaluated, the result is always 0 */
+#define DO_NUMA(x)	do { } while (0)
+#endif
+
+/*
+ * A few notes about the KSM scanning process,
+ * to make it easier to understand the data structures below:
+ *
+ * In order to reduce excessive scanning, KSM sorts the memory pages by their
+ * contents into a data structure that holds pointers to the pages' locations.
+ *
+ * Since the contents of the pages may change at any moment, KSM cannot just
+ * insert the pages into a normal sorted tree and expect it to find anything.
+ * Therefore KSM uses two data structures - the stable and the unstable tree.
+ *
+ * The stable tree holds pointers to all the merged pages (ksm pages), sorted
+ * by their contents.  Because each such page is write-protected, searching on
+ * this tree is fully assured to be working (except when pages are unmapped),
+ * and therefore this tree is called the stable tree.
+ *
+ * In addition to the stable tree, KSM uses a second data structure called the
+ * unstable tree: this tree holds pointers to pages which have been found to
+ * be "unchanged for a period of time".
The unstable tree sorts these pages + * by their contents, but since they are not write-protected, KSM cannot rely + * upon the unstable tree to work correctly - the unstable tree is liable to + * be corrupted as its contents are modified, and so it is called unstable. + * + * KSM solves this problem by several techniques: + * + * 1) The unstable tree is flushed every time KSM completes scanning all + * memory areas, and then the tree is rebuilt again from the beginning. + * 2) KSM will only insert into the unstable tree, pages whose hash value + * has not changed since the previous scan of all memory areas. + * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the + * colors of the nodes and not on their contents, assuring that even when + * the tree gets "corrupted" it won't get out of balance, so scanning time + * remains the same (also, searching and inserting nodes in an rbtree uses + * the same algorithm, so we have no overhead when we flush and rebuild). + * 4) KSM never flushes the stable tree, which means that even if it were to + * take 10 attempts to find a page in the unstable tree, once it is found, + * it is secured in the stable tree. (When we scan a new page, we first + * compare it against the stable tree, and then against the unstable tree.) + * + * If the merge_across_nodes tunable is unset, then KSM maintains multiple + * stable trees and multiple unstable trees: one of each for each NUMA node. 
+ */
+
+/**
+ * struct mm_slot - ksm information per mm that is being scanned
+ * @link: link to the mm_slots hash list
+ * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
+ * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+	struct hlist_node link;
+	struct list_head mm_list;
+	struct rmap_item *rmap_list;	/* the list ends at a NULL rmap_list pointer */
+	struct mm_struct *mm;
+};
+
+/**
+ * struct ksm_scan - cursor for scanning
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ * @rmap_list: link to the next rmap to be scanned in the rmap_list
+ * @seqnr: count of completed full scans (needed when removing unstable node)
+ *
+ * There is only the one ksm_scan instance of this cursor structure.
+ */
+struct ksm_scan {
+	struct mm_slot *mm_slot;
+	unsigned long address;
+	struct rmap_item **rmap_list;
+	unsigned long seqnr;
+};
+
+/**
+ * struct stable_node - node of the stable rbtree
+ * @node: rb node of this ksm page in the stable tree
+ * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
+ * @list: linked into migrate_nodes, pending placement in the proper node tree
+ * @hlist: hlist head of rmap_items using this ksm page
+ * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
+ * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
+ *
+ * @node and the anonymous {@head, @list} struct share storage;
+ * remove_node_from_stable_tree() tests @head == &migrate_nodes to tell which
+ * of the two is live. @nid only exists when CONFIG_NUMA is set.
+ */
+struct stable_node {
+	union {
+		struct rb_node node;	/* when node of stable tree */
+		struct {		/* when listed for migration */
+			struct list_head *head;
+			struct list_head list;
+		};
+	};
+	struct hlist_head hlist;
+	unsigned long kpfn;
+#ifdef CONFIG_NUMA
+	int nid;
+#endif
+};
+
+/**
+ * struct rmap_item - reverse mapping item for virtual addresses
+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
+ * @nid: NUMA node id of unstable tree in which
linked (may not match page)
+ * @mm: the memory structure this rmap_item is pointing into
+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
+ * @oldchecksum: previous checksum of the page at that virtual address
+ * @node: rb node of this rmap_item in the unstable tree
+ * @head: pointer to stable_node heading this list in the stable tree
+ * @hlist: link into hlist of rmap_items hanging off that stable_node
+ *
+ * The low bits of @address carry SEQNR_MASK, UNSTABLE_FLAG and STABLE_FLAG
+ * (defined just below); code in this file strips them with
+ * "address &= PAGE_MASK".
+ */
+struct rmap_item {
+	struct rmap_item *rmap_list;
+	union {
+		struct anon_vma *anon_vma;	/* when stable */
+#ifdef CONFIG_NUMA
+		int nid;		/* when node of unstable tree */
+#endif
+	};
+	struct mm_struct *mm;
+	unsigned long address;		/* + low bits used for flags below */
+	unsigned int oldchecksum;	/* when unstable */
+	union {
+		struct rb_node node;	/* when node of unstable tree */
+		struct {		/* when listed from stable tree */
+			struct stable_node *head;
+			struct hlist_node hlist;
+		};
+	};
+};
+
+#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
+#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
+#define STABLE_FLAG	0x200	/* is listed from the stable tree */
+
+/*
+ * The stable and unstable tree heads: the root_* pointers start out aimed at
+ * the single-element arrays and are indexed by NUMA node id elsewhere.
+ */
+static struct rb_root one_stable_tree[1] = { RB_ROOT };
+static struct rb_root one_unstable_tree[1] = { RB_ROOT };
+static struct rb_root *root_stable_tree = one_stable_tree;
+static struct rb_root *root_unstable_tree = one_unstable_tree;
+
+/* Recently migrated nodes of stable tree, pending proper placement */
+static LIST_HEAD(migrate_nodes);
+
+#define MM_SLOTS_HASH_BITS 10
+static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
+
+static struct mm_slot ksm_mm_head = {
+	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
+};
+static struct ksm_scan ksm_scan = {
+	.mm_slot = &ksm_mm_head,	/* the cursor starts on the list head itself */
+};
+
+static struct kmem_cache *rmap_item_cache;
+static struct kmem_cache *stable_node_cache;
+static struct kmem_cache *mm_slot_cache;
+
+/* The number of nodes in the stable tree */
+static unsigned long ksm_pages_shared;
+
+/* The number of page slots additionally sharing those nodes */
+static unsigned long ksm_pages_sharing;
+
+/* The number of nodes in the unstable tree */
+static unsigned long ksm_pages_unshared;
+
+/* The number of rmap_items in use: to calculate pages_volatile */
+static unsigned long ksm_rmap_items;
+
+/* Number of pages ksmd should scan in one batch */
+static unsigned int ksm_thread_pages_to_scan = 100;
+
+/* Milliseconds ksmd should sleep between batches */
+static unsigned int ksm_thread_sleep_millisecs = 20;
+
+#ifdef CONFIG_NUMA
+/* Zeroed when merging across nodes is not allowed */
+static unsigned int ksm_merge_across_nodes = 1;
+static int ksm_nr_node_ids = 1;
+#else
+#define ksm_merge_across_nodes	1U	/* compile-time constants without NUMA */
+#define ksm_nr_node_ids		1
+#endif
+
+#define KSM_RUN_STOP	0
+#define KSM_RUN_MERGE	1
+#define KSM_RUN_UNMERGE	2
+#define KSM_RUN_OFFLINE	4	/* NOTE(review): 1, 2, 4 look like bit flags - confirm against users of ksm_run */
+static unsigned long ksm_run = KSM_RUN_STOP;
+static void wait_while_offlining(void);
+
+static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
+static DEFINE_MUTEX(ksm_thread_mutex);
+static DEFINE_SPINLOCK(ksm_mmlist_lock);
+
+#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
+		sizeof(struct __struct), __alignof__(struct __struct),\
+		(__flags), NULL)
+
+static int __init ksm_slab_init(void)
+{
+	/* create the three caches; on failure destroy the ones already created */
+	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
+	if (!rmap_item_cache)
+		goto out;
+
+	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
+	if (!stable_node_cache)
+		goto out_free1;
+
+	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
+	if (!mm_slot_cache)
+		goto out_free2;
+
+	return 0;
+
+out_free2:
+	kmem_cache_destroy(stable_node_cache);
+out_free1:
+	kmem_cache_destroy(rmap_item_cache);
+out:
+	return -ENOMEM;
+}
+
+static void __init ksm_slab_free(void)
+{
+	kmem_cache_destroy(mm_slot_cache);
+	kmem_cache_destroy(stable_node_cache);
+	kmem_cache_destroy(rmap_item_cache);
+	mm_slot_cache = NULL;	/* alloc_mm_slot() tests this and then returns NULL */
+}
+
+static inline struct rmap_item *alloc_rmap_item(void)
+{
+	struct rmap_item *rmap_item;
+
+	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
+	if (rmap_item)
+		ksm_rmap_items++;	/* counted only when the allocation succeeded */
+	return rmap_item;
+}
+
+static inline void free_rmap_item(struct rmap_item *rmap_item)
+{
+	ksm_rmap_items--;
+	rmap_item->mm = NULL;	/* debug safety */
+	kmem_cache_free(rmap_item_cache, rmap_item);
+}
+
+static inline struct stable_node *alloc_stable_node(void)
+{
+	/* kmem_cache_alloc, not zalloc: unlike the other two allocators here */
+	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
+}
+
+static inline void free_stable_node(struct stable_node *stable_node)
+{
+	kmem_cache_free(stable_node_cache, stable_node);
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+	if (!mm_slot_cache)	/* initialization failed */
+		return NULL;
+	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+	kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+	struct mm_slot *slot;
+
+	/* the mm pointer value is the hash key; compare mm within the bucket */
+	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
+		if (slot->mm == mm)
+			return slot;
+
+	return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+				    struct mm_slot *mm_slot)
+{
+	mm_slot->mm = mm;
+	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);	/* same key as get_mm_slot() */
+}
+
+/*
+ * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
+ * page tables after it has passed through ksm_exit() - which, if necessary,
+ * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
+ * a special flag: they can just back out as soon as mm_users goes to zero.
+ * ksm_test_exit() is used throughout to make this test for exit: in some
+ * places for correctness, in some places just to avoid unnecessary work.
+ */
+static inline bool ksm_test_exit(struct mm_struct *mm)
+{
+	return atomic_read(&mm->mm_users) == 0;
+}
+
+/*
+ * We use break_ksm to break COW on a ksm page: it's a stripped down
+ *
+ *	if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
+ *		put_page(page);
+ *
+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * in case the application has unmapped and remapped mm,addr meanwhile.
+ * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ *
+ * Returns -ENOMEM if the last fault reported VM_FAULT_OOM, otherwise 0
+ * (including when no page is mapped at addr).
+ */
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct page *page;
+	int ret = 0;
+
+	do {
+		cond_resched();
+		page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
+		if (IS_ERR_OR_NULL(page))
+			break;
+		if (PageKsm(page))
+			ret = handle_mm_fault(vma->vm_mm, vma, addr,
+							FAULT_FLAG_WRITE);
+		else
+			ret = VM_FAULT_WRITE;	/* not a ksm page: nothing to break, end the loop */
+		put_page(page);
+	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
+	/*
+	 * We must loop because handle_mm_fault() may back out if there's
+	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
+	 *
+	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
+	 * COW has been broken, even if the vma does not permit VM_WRITE;
+	 * but note that a concurrent fault might break PageKsm for us.
+	 *
+	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
+	 * backing file, which also invalidates anonymous pages: that's
+	 * okay, that truncation will have unmapped the PageKsm for us.
+	 *
+	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
+	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
+	 * current task has TIF_MEMDIE set, and will be OOM killed on return
+	 * to user; and ksmd, having no mm, would never be chosen for that.
+	 *
+	 * But if the mm is in a limited mem_cgroup, then the fault may fail
+	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
+	 * even ksmd can fail in this way - though it's usually breaking ksm
+	 * just to undo a merge it made a moment before, so unlikely to oom.
+	 *
+	 * That's a pity: we might therefore have more kernel pages allocated
+	 * than we're counting as nodes in the stable tree; but ksm_do_scan
+	 * will retry to break_cow on each pass, so should recover the page
+	 * in due course.  The important thing is to not let VM_MERGEABLE
+	 * be cleared while any such pages might remain in the area.
+	 */
+	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
+}
+
+static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
+		unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	if (ksm_test_exit(mm))
+		return NULL;
+	vma = find_vma(mm, addr);
+	if (!vma || vma->vm_start > addr)	/* addr must lie inside the vma found */
+		return NULL;
+	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+		return NULL;
+	return vma;
+}
+
+static void break_cow(struct rmap_item *rmap_item)
+{
+	struct mm_struct *mm = rmap_item->mm;
+	unsigned long addr = rmap_item->address;
+	struct vm_area_struct *vma;
+
+	/*
+	 * It is not an accident that whenever we want to break COW
+	 * to undo, we also need to drop a reference to the anon_vma.
+	 */
+	put_anon_vma(rmap_item->anon_vma);
+
+	down_read(&mm->mmap_sem);
+	vma = find_mergeable_vma(mm, addr);
+	if (vma)
+		break_ksm(vma, addr);	/* the return value is not checked here */
+	up_read(&mm->mmap_sem);
+}
+
+static struct page *page_trans_compound_anon(struct page *page)
+{
+	if (PageTransCompound(page)) {
+		struct page *head = compound_head(page);
+		/*
+		 * head may actually be split and freed from under
+		 * us but it's ok here.
+		 */
+		if (PageAnon(head))
+			return head;
+	}
+	return NULL;
+}
+
+static struct page *get_mergeable_page(struct rmap_item *rmap_item)
+{
+	struct mm_struct *mm = rmap_item->mm;
+	unsigned long addr = rmap_item->address;
+	struct vm_area_struct *vma;
+	struct page *page;
+
+	down_read(&mm->mmap_sem);
+	vma = find_mergeable_vma(mm, addr);
+	if (!vma)
+		goto out;
+
+	page = follow_page(vma, addr, FOLL_GET);
+	if (IS_ERR_OR_NULL(page))
+		goto out;
+	if (PageAnon(page) || page_trans_compound_anon(page)) {
+		flush_anon_page(vma, page, addr);
+		flush_dcache_page(page);
+	} else {
+		put_page(page);
+out:		page = NULL;	/* also the landing point of the two gotos above */
+	}
+	up_read(&mm->mmap_sem);
+	return page;	/* NULL, or a page still holding the FOLL_GET reference */
+}
+
+/*
+ * This helper is used for getting right index into array of tree roots.
+ * When merge_across_nodes knob is set to 1, there are only two rb-trees for
+ * stable and unstable pages from all nodes with roots in index 0. Otherwise,
+ * every node has its own stable and unstable tree.
+ */
+static inline int get_kpfn_nid(unsigned long kpfn)
+{
+	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
+}
+
+static void remove_node_from_stable_tree(struct stable_node *stable_node)
+{
+	struct rmap_item *rmap_item;
+
+	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
+		if (rmap_item->hlist.next)	/* every item but the last counts as "sharing" */
+			ksm_pages_sharing--;
+		else
+			ksm_pages_shared--;
+		put_anon_vma(rmap_item->anon_vma);
+		rmap_item->address &= PAGE_MASK;	/* drops STABLE_FLAG with the other low bits */
+		cond_resched();
+	}
+
+	if (stable_node->head == &migrate_nodes)
+		list_del(&stable_node->list);
+	else
+		rb_erase(&stable_node->node,
+			 root_stable_tree + NUMA(stable_node->nid));
+	free_stable_node(stable_node);
+}
+
+/*
+ * get_ksm_page: checks if the page indicated by the stable node
+ * is still its ksm page, despite having held no reference to it.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ * But beware, the stable node's page might be being migrated.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive.  So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node.  This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ *
+ * NOTE(review): the text continued here with an orphaned fragment, "is on
+ * its way to being freed; but it is an anomaly to bear in mind.", whose
+ * beginning is missing.
+ *
+ * With @lock_it set, the returned page is also locked.
+ */
+static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
+{
+	struct page *page;
+	void *expected_mapping;
+	unsigned long kpfn;
+
+	expected_mapping = (void *)stable_node +
+				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+again:
+	kpfn = READ_ONCE(stable_node->kpfn);
+	page = pfn_to_page(kpfn);
+
+	/*
+	 * page is computed from kpfn, so on most architectures reading
+	 * page->mapping is naturally ordered after reading node->kpfn,
+	 * but on Alpha we need to be more careful.
+	 */
+	smp_read_barrier_depends();
+	if (READ_ONCE(page->mapping) != expected_mapping)
+		goto stale;
+
+	/*
+	 * We cannot do anything with the page while its refcount is 0.
+	 * Usually 0 means free, or tail of a higher-order page: in which
+	 * case this node is no longer referenced, and should be freed;
+	 * however, it might mean that the page is under page_freeze_refs().
+	 * The __remove_mapping() case is easy, again the node is now stale;
+	 * but if page is swapcache in migrate_page_move_mapping(), it might
+	 * still be our page, in which case it's essential to keep the node.
+	 */
+	while (!get_page_unless_zero(page)) {
+		/*
+		 * Another check for page->mapping != expected_mapping would
+		 * work here too.  We have chosen the !PageSwapCache test to
+		 * optimize the common case, when the page is or is about to
+		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
+		 * in the freeze_refs section of __remove_mapping(); but Anon
+		 * page->mapping reset to NULL later, in free_pages_prepare().
+		 */
+		if (!PageSwapCache(page))
+			goto stale;
+		cpu_relax();
+	}
+
+	/* re-check now that a reference is held */
+	if (READ_ONCE(page->mapping) != expected_mapping) {
+		put_page(page);
+		goto stale;
+	}
+
+	if (lock_it) {
+		lock_page(page);
+		/* and once more under the page lock */
+		if (READ_ONCE(page->mapping) != expected_mapping) {
+			unlock_page(page);
+			put_page(page);
+			goto stale;
+		}
+	}
+	return page;	/* referenced, and locked if lock_it */
+
+stale:
+	/*
+	 * We come here from above when page->mapping or !PageSwapCache
+	 * suggests that the node is stale; but it might be under migration.
+	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
+	 * before checking whether node->kpfn has been changed.
+	 */
+	smp_rmb();
+	if (READ_ONCE(stable_node->kpfn) != kpfn)
+		goto again;
+	remove_node_from_stable_tree(stable_node);
+	return NULL;
+}
+
+/*
+ * Removing rmap_item from stable or unstable tree.
+ * This function will clean the information from the stable/unstable tree.
+ * An rmap_item carrying neither STABLE_FLAG nor UNSTABLE_FLAG is left alone.
+ */
+static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
+{
+	if (rmap_item->address & STABLE_FLAG) {
+		struct stable_node *stable_node;
+		struct page *page;
+
+		stable_node = rmap_item->head;
+		page = get_ksm_page(stable_node, true);
+		if (!page)	/* stale node: get_ksm_page() already removed it */
+			goto out;
+
+		hlist_del(&rmap_item->hlist);
+		unlock_page(page);
+		put_page(page);
+
+		if (stable_node->hlist.first)	/* other rmap_items still hang off the node */
+			ksm_pages_sharing--;
+		else
+			ksm_pages_shared--;
+
+		put_anon_vma(rmap_item->anon_vma);
+		rmap_item->address &= PAGE_MASK;
+
+	} else if (rmap_item->address & UNSTABLE_FLAG) {
+		unsigned char age;
+		/*
+		 * Usually ksmd can and must skip the rb_erase, because
+		 * root_unstable_tree was already reset to RB_ROOT.
+		 * But be careful when an mm is exiting: do the rb_erase
+		 * if this rmap_item was inserted by this scan, rather
+		 * than left over from before.
+		 */
+		/* the cast keeps the low 8 bits only, so the flag bits above SEQNR_MASK drop out */
+		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
+		BUG_ON(age > 1);
+		if (!age)
+			rb_erase(&rmap_item->node,
+				 root_unstable_tree + NUMA(rmap_item->nid));
+		ksm_pages_unshared--;
+		rmap_item->address &= PAGE_MASK;
+	}
+out:
+	cond_resched();		/* we're called from many long loops */
+}
+
+static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
+				       struct rmap_item **rmap_list)
+{
+	/* unlink, detach from its tree and free every item from *rmap_list onwards */
+	while (*rmap_list) {
+		struct rmap_item *rmap_item = *rmap_list;
+		*rmap_list = rmap_item->rmap_list;
+		remove_rmap_item_from_tree(rmap_item);
+		free_rmap_item(rmap_item);
+	}
+}
+
+/*
+ * Though it's very tempting to unmerge rmap_items from stable tree rather
+ * than check every pte of a given vma, the locking doesn't quite work for
+ * that - an rmap_item is assigned to the stable tree after inserting ksm
+ * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
+ * rmap_items from parent to child at fork time (so as not to waste time
+ * if exit comes before the next scan reaches it).
+ *
+ * Similarly, although we'd like to remove rmap_items (so updating counts
+ * and freeing memory) when unmerging an area, it's easier to leave that
+ * to the next pass of ksmd - consider, for example, how ksmd might be
+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
+ */
+static int unmerge_ksm_pages(struct vm_area_struct *vma,
+			     unsigned long start, unsigned long end)
+{
+	unsigned long addr;
+	int err = 0;
+
+	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
+		if (ksm_test_exit(vma->vm_mm))	/* stops the walk; err stays 0 */
+			break;
+		if (signal_pending(current))
+			err = -ERESTARTSYS;
+		else
+			err = break_ksm(vma, addr);
+	}
+	return err;
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * Only called through the sysfs control interface:
+ * returns 0 when the node is gone afterwards, -EBUSY when its page is still
+ * mapped and the node was therefore left in place.
+ */
+static int remove_stable_node(struct stable_node *stable_node)
+{
+	struct page *page;
+	int err;
+
+	page = get_ksm_page(stable_node, true);
+	if (!page) {
+		/*
+		 * get_ksm_page did remove_node_from_stable_tree itself.
+		 */
+		return 0;
+	}
+
+	if (WARN_ON_ONCE(page_mapped(page))) {
+		/*
+		 * This should not happen: but if it does, just refuse to let
+		 * merge_across_nodes be switched - there is no need to panic.
+		 */
+		err = -EBUSY;
+	} else {
+		/*
+		 * The stable node did not yet appear stale to get_ksm_page(),
+		 * since that allows for an unmapped ksm page to be recognized
+		 * right up until it is freed; but the node is safe to remove.
+		 * This page might be in a pagevec waiting to be freed,
+		 * or it might be PageSwapCache (perhaps under writeback),
+		 * or it might have been removed from swapcache a moment ago.
+		 */
+		set_page_stable_node(page, NULL);
+		remove_node_from_stable_tree(stable_node);
+		err = 0;
+	}
+
+	unlock_page(page);	/* get_ksm_page(..., true) returned it locked */
+	put_page(page);
+	return err;
+}
+
+static int remove_all_stable_nodes(void)
+{
+	struct stable_node *stable_node;
+	struct list_head *this, *next;
+	int nid;
+	int err = 0;
+
+	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+		/* keep removing the current root until the tree is empty or a node is busy */
+		while (root_stable_tree[nid].rb_node) {
+			stable_node = rb_entry(root_stable_tree[nid].rb_node,
+						struct stable_node, node);
+			if (remove_stable_node(stable_node)) {
+				err = -EBUSY;
+				break;	/* proceed to next nid */
+			}
+			cond_resched();
+		}
+	}
+	list_for_each_safe(this, next, &migrate_nodes) {
+		stable_node = list_entry(this, struct stable_node, list);
+		if (remove_stable_node(stable_node))
+			err = -EBUSY;
+		cond_resched();
+	}
+	return err;	/* -EBUSY if any node had to be left behind */
+}
+
+static int unmerge_and_remove_all_rmap_items(void)
+{
+	struct mm_slot *mm_slot;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	int err = 0;
+
+	spin_lock(&ksm_mmlist_lock);
+	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
+						struct mm_slot, mm_list);
+	spin_unlock(&ksm_mmlist_lock);
+
+	/* ksm_scan.mm_slot is advanced inside the body, under ksm_mmlist_lock */
+	for (mm_slot = ksm_scan.mm_slot;
+			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
+		mm = mm_slot->mm;
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (ksm_test_exit(mm))
+				break;
+			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+				continue;
+			err = unmerge_ksm_pages(vma,
+						vma->vm_start, vma->vm_end);
+			if (err)
+				goto error;	/* mmap_sem is still held; released at the label */
+		}
+
+		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+
+		spin_lock(&ksm_mmlist_lock);
+		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
+						struct mm_slot, mm_list);
+		if (ksm_test_exit(mm)) {
+			/* the mm is exiting: drop its slot entirely */
+			hash_del(&mm_slot->link);
+			list_del(&mm_slot->mm_list);
+			spin_unlock(&ksm_mmlist_lock);
+
+			free_mm_slot(mm_slot);
+			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+			up_read(&mm->mmap_sem);
+			mmdrop(mm);
+		} else {
+			spin_unlock(&ksm_mmlist_lock);
+			up_read(&mm->mmap_sem);
+		}
+	}
+
+	/* Clean up stable nodes, but don't worry if some are still busy */
+	remove_all_stable_nodes();
+	ksm_scan.seqnr = 0;
+	return 0;
+
+error:
+	up_read(&mm->mmap_sem);
+	spin_lock(&ksm_mmlist_lock);
+	ksm_scan.mm_slot = &ksm_mm_head;	/* reset the cursor to the list head */
+	spin_unlock(&ksm_mmlist_lock);
+	return err;
+}
+#endif /* CONFIG_SYSFS */
+
+static u32 calc_checksum(struct page *page)
+{
+	u32 checksum;
+	void *addr = kmap_atomic(page);
+	checksum = jhash2(addr, PAGE_SIZE / 4, 17);	/* presumably length in 32-bit words, 17 as seed - confirm */
+	kunmap_atomic(addr);
+	return checksum;
+}
+
+static int memcmp_pages(struct page *page1, struct page *page2)
+{
+	char *addr1, *addr2;
+	int ret;
+
+	addr1 = kmap_atomic(page1);
+	addr2 = kmap_atomic(page2);
+	ret = memcmp(addr1, addr2, PAGE_SIZE);
+	kunmap_atomic(addr2);	/* unmapped in the reverse order of mapping */
+	kunmap_atomic(addr1);
+	return ret;
+}
+
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+	return !memcmp_pages(page1, page2);	/* non-zero when all PAGE_SIZE bytes match */
+}
+
+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
+			      pte_t *orig_pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr;
+	pte_t *ptep;
+	spinlock_t *ptl;
+	int swapped;
+	int err = -EFAULT;	/* returned on every path that does not reach "err = 0" */
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	addr = page_address_in_vma(page, vma);
+	if (addr == -EFAULT)
+		goto out;
+
+	BUG_ON(PageTransCompound(page));
+
+	mmun_start = addr;
+	mmun_end = addr + PAGE_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+	ptep = page_check_address(page, mm, addr, &ptl, 0);
+	if (!ptep)
+		goto out_mn;
+
+	if (pte_write(*ptep) || pte_dirty(*ptep)) {
+		pte_t entry;
+
+		swapped = PageSwapCache(page);
+		flush_cache_page(vma, addr, page_to_pfn(page));
+		/*
+		 * Ok this is tricky, when get_user_pages_fast() runs it doesn't
+		 * take any lock, therefore the check that we are going to make
+		 * with the pagecount against the mapcount is racy and
+		 * O_DIRECT can happen right after the check.
+		 * So we clear the pte and flush the tlb before the check;
+		 * this assures us that no O_DIRECT can happen after the check
+		 * or in the middle of the check.
+		 */
+		entry = ptep_clear_flush_notify(vma, addr, ptep);
+		/*
+		 * Check that no O_DIRECT or similar I/O is in progress on the
+		 * page
+		 */
+		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
+			set_pte_at(mm, addr, ptep, entry);	/* put the original pte back */
+			goto out_unlock;
+		}
+		if (pte_dirty(entry))
+			set_page_dirty(page);	/* carry the dirty bit over to the page before cleaning the pte */
+		entry = pte_mkclean(pte_wrprotect(entry));
+		set_pte_at_notify(mm, addr, ptep, entry);
+	}
+	*orig_pte = *ptep;	/* reports the pte as it now stands to the caller */
+	err = 0;
+
+out_unlock:
+	pte_unmap_unlock(ptep, ptl);
+out_mn:
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+	return err;
+}
+
+/**
+ * replace_page - replace page in vma by new ksm page
+ * @vma:      vma that holds the pte pointing to page
+ * @page:     the page we are replacing by kpage
+ * @kpage:    the ksm page we replace page by
+ * @orig_pte: the original value of the pte
+ *
+ * Returns 0 on success, -EFAULT on failure.
+ */
+static int replace_page(struct vm_area_struct *vma, struct page *page,
+			struct page *kpage, pte_t orig_pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pmd_t *pmd;
+	pte_t *ptep;
+	spinlock_t *ptl;
+	unsigned long addr;
+	int err = -EFAULT;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	addr = page_address_in_vma(page, vma);
+	if (addr == -EFAULT)
+		goto out;
+
+	pmd = mm_find_pmd(mm, addr);
+	if (!pmd)
+		goto out;
+
+	mmun_start = addr;
+	mmun_end   = addr + PAGE_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	/* Give up if the pte changed since write_protect_page() sampled it. */
+	if (!pte_same(*ptep, orig_pte)) {
+		pte_unmap_unlock(ptep, ptl);
+		goto out_mn;
+	}
+
+	/* Account the new mapping of kpage before switching the pte. */
+	get_page(kpage);
+	page_add_anon_rmap(kpage, vma, addr);
+
+	flush_cache_page(vma, addr, pte_pfn(*ptep));
+	ptep_clear_flush_notify(vma, addr, ptep);
+	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+
+	/* Drop the old page's mapping and the reference that went with it. */
+	page_remove_rmap(page);
+	if (!page_mapped(page))
+		try_to_free_swap(page);
+	put_page(page);
+
+	pte_unmap_unlock(ptep, ptl);
+	err = 0;
+out_mn:
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+	return err;
+}
+
+/*
+ * If @page belongs to an anonymous transparent hugepage, try to split it.
+ * Returns 0 when page_trans_compound_anon() finds no such head, the result
+ * of split_huge_page() when a split was attempted, and 1 when the head could
+ * not be pinned or is no longer anonymous (callers treat non-zero as "do not
+ * merge now").
+ */
+static int page_trans_compound_anon_split(struct page *page)
+{
+	int ret = 0;
+	struct page *transhuge_head = page_trans_compound_anon(page);
+	if (transhuge_head) {
+		/* Get the reference on the head to split it. */
+		if (get_page_unless_zero(transhuge_head)) {
+			/*
+			 * Recheck we got the reference while the head
+			 * was still anonymous.
+			 */
+			if (PageAnon(transhuge_head))
+				ret = split_huge_page(transhuge_head);
+			else
+				/*
+				 * Retry later if split_huge_page run
+				 * from under us.
+				 */
+				ret = 1;
+			put_page(transhuge_head);
+		} else
+			/* Retry later if split_huge_page run from under us. */
+			ret = 1;
+	}
+	return ret;
+}
+
+/*
+ * try_to_merge_one_page - take two pages and merge them into one
+ * @vma: the vma that holds the pte pointing to page
+ * @page: the PageAnon page that we want to replace with kpage
+ * @kpage: the PageKsm page that we want to map instead of page,
+ *         or NULL the first time when we want to use page as kpage.
+ *
+ * This function returns 0 if the pages were merged, -EFAULT otherwise.
+ * Note that page == kpage returns 0 immediately, before @vma is examined.
+ */
+static int try_to_merge_one_page(struct vm_area_struct *vma,
+				 struct page *page, struct page *kpage)
+{
+	pte_t orig_pte = __pte(0);
+	int err = -EFAULT;
+
+	if (page == kpage)			/* ksm page forked */
+		return 0;
+
+	if (!(vma->vm_flags & VM_MERGEABLE))
+		goto out;
+	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
+		goto out;
+	BUG_ON(PageTransCompound(page));
+	if (!PageAnon(page))
+		goto out;
+
+	/*
+	 * We need the page lock to read a stable PageSwapCache in
+	 * write_protect_page().  We use trylock_page() instead of
+	 * lock_page() because we don't want to wait here - we
+	 * prefer to continue scanning and merging different pages,
+	 * then come back to this page when it is unlocked.
+	 */
+	if (!trylock_page(page))
+		goto out;
+	/*
+	 * If this anonymous page is mapped only here, its pte may need
+	 * to be write-protected.  If it's mapped elsewhere, all of its
+	 * ptes are necessarily already write-protected.  But in either
+	 * case, we need to lock and check page_count is not raised.
+	 */
+	if (write_protect_page(vma, page, &orig_pte) == 0) {
+		if (!kpage) {
+			/*
+			 * While we hold page lock, upgrade page from
+			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
+			 * stable_tree_insert() will update stable_node.
+			 */
+			set_page_stable_node(page, NULL);
+			mark_page_accessed(page);
+			err = 0;
+		} else if (pages_identical(page, kpage))
+			err = replace_page(vma, page, kpage, orig_pte);
+	}
+
+	/* In a VM_LOCKED vma, move the mlock from page over to kpage. */
+	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
+		munlock_vma_page(page);
+		if (!PageMlocked(kpage)) {
+			unlock_page(page);
+			lock_page(kpage);
+			mlock_vma_page(kpage);
+			page = kpage;		/* for final unlock */
+		}
+	}
+
+	unlock_page(page);
+out:
+	return err;
+}
+
+/*
+ * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
+ * but no new kernel page is allocated: kpage must already be a ksm page.
+ *
+ * On success the rmap_item is taken out of its current tree and a reference
+ * to the vma's anon_vma is stored in it.
+ *
+ * This function returns 0 if the pages were merged, -EFAULT otherwise.
+ */
+static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
+				      struct page *page, struct page *kpage)
+{
+	struct mm_struct *mm = rmap_item->mm;
+	struct vm_area_struct *vma;
+	int err = -EFAULT;
+
+	down_read(&mm->mmap_sem);
+	if (ksm_test_exit(mm))
+		goto out;
+	vma = find_vma(mm, rmap_item->address);
+	if (!vma || vma->vm_start > rmap_item->address)
+		goto out;
+	/*
+	 * Validate the vma here rather than relying on
+	 * try_to_merge_one_page(): that returns 0 for page == kpage before
+	 * it looks at the vma at all.  The vma must still be VM_MERGEABLE
+	 * and must have an anon_vma, because that anon_vma is recorded and
+	 * pinned below; a NULL one must never reach get_anon_vma().
+	 */
+	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
+		goto out;
+
+	err = try_to_merge_one_page(vma, page, kpage);
+	if (err)
+		goto out;
+
+	/* Unstable nid is in union with stable anon_vma: remove first */
+	remove_rmap_item_from_tree(rmap_item);
+
+	/* Must get reference to anon_vma while still holding mmap_sem */
+	rmap_item->anon_vma = vma->anon_vma;
+	get_anon_vma(vma->anon_vma);
+out:
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * try_to_merge_two_pages - take two identical pages and prepare them
+ * to be merged into one page.
+ *
+ * This function returns the kpage if we successfully merged two identical
+ * pages into one ksm page, NULL otherwise.
+ *
+ * Note that this function upgrades page to ksm page: if one of the pages
+ * is already a ksm page, try_to_merge_with_ksm_page should be used.
+ */
+static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
+					   struct page *page,
+					   struct rmap_item *tree_rmap_item,
+					   struct page *tree_page)
+{
+	int err;
+
+	/* First turn page itself into the ksm page (kpage == NULL) ... */
+	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
+	if (!err) {
+		/* ... then map it in place of tree_page. */
+		err = try_to_merge_with_ksm_page(tree_rmap_item,
+							tree_page, page);
+		/*
+		 * If that fails, we have a ksm page with only one pte
+		 * pointing to it: so break it.
+		 */
+		if (err)
+			break_cow(rmap_item);
+	}
+	return err ? NULL : page;
+}
+
+/*
+ * stable_tree_search - search for page inside the stable tree
+ *
+ * This function checks if there is a page inside the stable tree
+ * with identical content to the page that we are scanning right now.
+ *
+ * This function returns the ksm page of identical content if found, with a
+ * reference held that the caller must drop with put_page(); NULL otherwise.
+ * The returned page may be @page itself, when @page is already a ksm page
+ * (forked, or relinked here from the migrate_nodes list into the tree).
+ */
+static struct page *stable_tree_search(struct page *page)
+{
+	int nid;
+	struct rb_root *root;
+	struct rb_node **new;
+	struct rb_node *parent;
+	struct stable_node *stable_node;
+	struct stable_node *page_node;
+
+	page_node = page_stable_node(page);
+	if (page_node && page_node->head != &migrate_nodes) {
+		/* ksm page forked */
+		get_page(page);
+		return page;
+	}
+
+	nid = get_kpfn_nid(page_to_pfn(page));
+	root = root_stable_tree + nid;
+again:
+	new = &root->rb_node;
+	parent = NULL;
+
+	while (*new) {
+		struct page *tree_page;
+		int ret;
+
+		cond_resched();
+		stable_node = rb_entry(*new, struct stable_node, node);
+		tree_page = get_ksm_page(stable_node, false);
+		if (!tree_page) {
+			/*
+			 * We walked over a stale stable_node: get_ksm_page()
+			 * has already pruned it, which may have rebalanced
+			 * the tree under us.  Restart the descent instead of
+			 * returning NULL, which would be a false negative
+			 * caused by an unrelated stale node.
+			 */
+			goto again;
+		}
+
+		ret = memcmp_pages(page, tree_page);
+		put_page(tree_page);
+
+		parent = *new;
+		if (ret < 0)
+			new = &parent->rb_left;
+		else if (ret > 0)
+			new = &parent->rb_right;
+		else {
+			/*
+			 * Lock and unlock the stable_node's page (which
+			 * might already have been migrated) so that page
+			 * migration is sure to notice its raised count.
+			 * It would be more elegant to return stable_node
+			 * than kpage, but that involves more changes.
+			 */
+			tree_page = get_ksm_page(stable_node, true);
+			if (tree_page) {
+				unlock_page(tree_page);
+				/* Node now lives on the wrong NUMA node? */
+				if (get_kpfn_nid(stable_node->kpfn) !=
+						NUMA(stable_node->nid)) {
+					put_page(tree_page);
+					goto replace;
+				}
+				return tree_page;
+			}
+			/*
+			 * There is now a place for page_node, but the tree may
+			 * have been rebalanced, so re-evaluate parent and new.
+			 */
+			if (page_node)
+				goto again;
+			return NULL;
+		}
+	}
+
+	if (!page_node)
+		return NULL;
+
+	/* No match: move page's own node from migrate_nodes into the tree. */
+	list_del(&page_node->list);
+	DO_NUMA(page_node->nid = nid);
+	rb_link_node(&page_node->node, parent, new);
+	rb_insert_color(&page_node->node, root);
+	get_page(page);
+	return page;
+
+replace:
+	/* The matching node is misplaced: park it on migrate_nodes. */
+	if (page_node) {
+		list_del(&page_node->list);
+		DO_NUMA(page_node->nid = nid);
+		rb_replace_node(&stable_node->node, &page_node->node, root);
+		get_page(page);
+	} else {
+		rb_erase(&stable_node->node, root);
+		page = NULL;
+	}
+	stable_node->head = &migrate_nodes;
+	list_add(&stable_node->list, stable_node->head);
+	return page;
+}
+
+/*
+ * stable_tree_insert - insert stable tree node pointing to new ksm page
+ * into the stable tree.
+ *
+ * This function returns the stable tree node just allocated on success,
+ * NULL otherwise.
+ */
+static struct stable_node *stable_tree_insert(struct page *kpage)
+{
+	int nid;
+	unsigned long kpfn;
+	struct rb_root *root;
+	struct rb_node **new;
+	struct rb_node *parent;
+	struct stable_node *stable_node;
+
+	kpfn = page_to_pfn(kpage);
+	nid = get_kpfn_nid(kpfn);
+	root = root_stable_tree + nid;
+again:
+	parent = NULL;
+	new = &root->rb_node;
+
+	while (*new) {
+		struct page *tree_page;
+		int ret;
+
+		cond_resched();
+		stable_node = rb_entry(*new, struct stable_node, node);
+		tree_page = get_ksm_page(stable_node, false);
+		if (!tree_page) {
+			/*
+			 * We walked over a stale stable_node: get_ksm_page()
+			 * has already pruned it, which may have rebalanced
+			 * the tree under us.  Restart the descent rather
+			 * than failing the insert because of an unrelated
+			 * stale node.
+			 */
+			goto again;
+		}
+
+		ret = memcmp_pages(kpage, tree_page);
+		put_page(tree_page);
+
+		parent = *new;
+		if (ret < 0)
+			new = &parent->rb_left;
+		else if (ret > 0)
+			new = &parent->rb_right;
+		else {
+			/*
+			 * It is not a bug that stable_tree_search() didn't
+			 * find this node: because at that time our page was
+			 * not yet write-protected, so may have changed since.
+			 */
+			return NULL;
+		}
+	}
+
+	stable_node = alloc_stable_node();
+	if (!stable_node)
+		return NULL;
+
+	/* Initialise the node and hook it up at the slot found above. */
+	INIT_HLIST_HEAD(&stable_node->hlist);
+	stable_node->kpfn = kpfn;
+	set_page_stable_node(kpage, stable_node);
+	DO_NUMA(stable_node->nid = nid);
+	rb_link_node(&stable_node->node, parent, new);
+	rb_insert_color(&stable_node->node, root);
+
+	return stable_node;
+}
+
+/*
+ * unstable_tree_search_insert - search for identical page,
+ * else insert rmap_item into the unstable tree.
+ *
+ * This function searches for a page in the unstable tree identical to the
+ * page currently being scanned; and if no identical page is found in the
+ * tree, we insert rmap_item as a new object into the unstable tree.
+ *
+ * This function returns pointer to rmap_item found to be identical
+ * to the currently scanned page, NULL otherwise.
+ *
+ * This function does both searching and inserting, because they share
+ * the same walking algorithm in an rbtree.
+ */
+static
+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
+					      struct page *page,
+					      struct page **tree_pagep)
+{
+	int nid = get_kpfn_nid(page_to_pfn(page));
+	struct rb_root *tree = &root_unstable_tree[nid];
+	struct rb_node **link = &tree->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct rmap_item *candidate;
+		struct page *candidate_page;
+		int cmp;
+
+		cond_resched();
+		candidate = rb_entry(*link, struct rmap_item, node);
+		candidate_page = get_mergeable_page(candidate);
+		if (IS_ERR_OR_NULL(candidate_page))
+			return NULL;
+
+		/*
+		 * Don't substitute a ksm page for a forked page.
+		 */
+		if (candidate_page == page) {
+			put_page(candidate_page);
+			return NULL;
+		}
+
+		cmp = memcmp_pages(page, candidate_page);
+		parent = *link;
+
+		if (cmp == 0) {
+			if (ksm_merge_across_nodes ||
+			    page_to_nid(candidate_page) == nid) {
+				/* Match: the page reference goes to the caller. */
+				*tree_pagep = candidate_page;
+				return candidate;
+			}
+			/*
+			 * If tree_page has been migrated to another NUMA node,
+			 * it will be flushed out and put in the right unstable
+			 * tree next time: only merge with it when across_nodes.
+			 */
+			put_page(candidate_page);
+			return NULL;
+		}
+
+		/* Contents differ: drop the page and descend one level. */
+		put_page(candidate_page);
+		link = (cmp < 0) ? &parent->rb_left : &parent->rb_right;
+	}
+
+	/* Nothing identical in the tree: insert rmap_item at the leaf found. */
+	rmap_item->address |= UNSTABLE_FLAG | (ksm_scan.seqnr & SEQNR_MASK);
+	DO_NUMA(rmap_item->nid = nid);
+	rb_link_node(&rmap_item->node, parent, link);
+	rb_insert_color(&rmap_item->node, tree);
+
+	ksm_pages_unshared++;
+	return NULL;
+}
+
+/*
+ * stable_tree_append - add another rmap_item to the linked list of
+ * rmap_items hanging off a given node of the stable tree, all sharing
+ * the same ksm page.
+ */
+static void stable_tree_append(struct rmap_item *rmap_item,
+			       struct stable_node *stable_node)
+{
+	rmap_item->head = stable_node;
+	rmap_item->address |= STABLE_FLAG;
+	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
+
+	/*
+	 * Added at the head: a non-NULL next means the node already had an
+	 * rmap_item, so this one counts as "sharing"; the first one on a
+	 * node counts as "shared".
+	 */
+	if (rmap_item->hlist.next)
+		ksm_pages_sharing++;
+	else
+		ksm_pages_shared++;
+}
+
+/*
+ * cmp_and_merge_page - first see if page can be merged into the stable tree;
+ * if not, compare checksum to previous and if it's the same, see if page can
+ * be inserted into the unstable tree, or merged with a page already there and
+ * both transferred to the stable tree.
+ *
+ * @page: the page that we are searching identical page to.
+ * @rmap_item: the reverse mapping into the virtual address of this page
+ */
+static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
+{
+	struct rmap_item *tree_rmap_item;
+	struct page *tree_page = NULL;
+	struct stable_node *stable_node;
+	struct page *kpage;
+	unsigned int checksum;
+	int err;
+
+	stable_node = page_stable_node(page);
+	if (stable_node) {
+		/* Node sits in the wrong per-nid tree: park it on migrate_nodes. */
+		if (stable_node->head != &migrate_nodes &&
+		    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
+			rb_erase(&stable_node->node,
+				 root_stable_tree + NUMA(stable_node->nid));
+			stable_node->head = &migrate_nodes;
+			list_add(&stable_node->list, stable_node->head);
+		}
+		/* Already linked to this in-tree stable node: nothing to do. */
+		if (stable_node->head != &migrate_nodes &&
+		    rmap_item->head == stable_node)
+			return;
+	}
+
+	/* We first start with searching the page inside the stable tree */
+	kpage = stable_tree_search(page);
+	if (kpage == page && rmap_item->head == stable_node) {
+		put_page(kpage);
+		return;
+	}
+
+	remove_rmap_item_from_tree(rmap_item);
+
+	if (kpage) {
+		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
+		if (!err) {
+			/*
+			 * The page was successfully merged:
+			 * add its rmap_item to the stable tree.
+			 */
+			lock_page(kpage);
+			stable_tree_append(rmap_item, page_stable_node(kpage));
+			unlock_page(kpage);
+		}
+		put_page(kpage);
+		return;
+	}
+
+	/*
+	 * If the hash value of the page has changed from the last time
+	 * we calculated it, this page is changing frequently: therefore we
+	 * don't want to insert it in the unstable tree, and we don't want
+	 * to waste our time searching for something identical to it there.
+	 */
+	checksum = calc_checksum(page);
+	if (rmap_item->oldchecksum != checksum) {
+		rmap_item->oldchecksum = checksum;
+		return;
+	}
+
+	tree_rmap_item =
+		unstable_tree_search_insert(rmap_item, page, &tree_page);
+	if (tree_rmap_item) {
+		kpage = try_to_merge_two_pages(rmap_item, page,
+						tree_rmap_item, tree_page);
+		put_page(tree_page);
+		if (kpage) {
+			/*
+			 * The pages were successfully merged: insert new
+			 * node in the stable tree and add both rmap_items.
+			 */
+			lock_page(kpage);
+			stable_node = stable_tree_insert(kpage);
+			if (stable_node) {
+				stable_tree_append(tree_rmap_item, stable_node);
+				stable_tree_append(rmap_item, stable_node);
+			}
+			unlock_page(kpage);
+
+			/*
+			 * If we fail to insert the page into the stable tree,
+			 * we will have 2 virtual addresses that are pointing
+			 * to a ksm page left outside the stable tree,
+			 * in which case we need to break_cow on both.
+			 */
+			if (!stable_node) {
+				break_cow(tree_rmap_item);
+				break_cow(rmap_item);
+			}
+		}
+	}
+}
+
+/*
+ * Return the rmap_item for @addr in the list at *@rmap_list.  Items found
+ * before one with a larger address are removed from their tree and freed on
+ * the way.  If no item matches, a new one is allocated and linked in at
+ * *@rmap_list.  Returns NULL only if that allocation fails.
+ */
+static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
+					    struct rmap_item **rmap_list,
+					    unsigned long addr)
+{
+	struct rmap_item *rmap_item;
+
+	while (*rmap_list) {
+		rmap_item = *rmap_list;
+		if ((rmap_item->address & PAGE_MASK) == addr)
+			return rmap_item;
+		if (rmap_item->address > addr)
+			break;
+		*rmap_list = rmap_item->rmap_list;
+		remove_rmap_item_from_tree(rmap_item);
+		free_rmap_item(rmap_item);
+	}
+
+	rmap_item = alloc_rmap_item();
+	if (rmap_item) {
+		/* It has already been zeroed */
+		rmap_item->mm = mm_slot->mm;
+		rmap_item->address = addr;
+		rmap_item->rmap_list = *rmap_list;
+		*rmap_list = rmap_item;
+	}
+	return rmap_item;
+}
+
+/*
+ * Advance the scan cursor (ksm_scan) to the next anonymous page in a
+ * VM_MERGEABLE vma.  On success returns its rmap_item and stores the page,
+ * with a reference taken via FOLL_GET, in *@page.  Returns NULL when the mm
+ * list is empty, when a full pass over the list has completed (seqnr is then
+ * incremented), or when an rmap_item could not be allocated.
+ */
+static struct rmap_item *scan_get_next_rmap_item(struct page **page)
+{
+	struct mm_struct *mm;
+	struct mm_slot *slot;
+	struct vm_area_struct *vma;
+	struct rmap_item *rmap_item;
+	int nid;
+
+	if (list_empty(&ksm_mm_head.mm_list))
+		return NULL;
+
+	slot = ksm_scan.mm_slot;
+	if (slot == &ksm_mm_head) {
+		/*
+		 * A number of pages can hang around indefinitely on per-cpu
+		 * pagevecs, raised page count preventing write_protect_page
+		 * from merging them.  Though it doesn't really matter much,
+		 * it is puzzling to see some stuck in pages_volatile until
+		 * other activity jostles them out, and they also prevented
+		 * LTP's KSM test from succeeding deterministically; so drain
+		 * them here (here rather than on entry to ksm_do_scan(),
+		 * so we don't IPI too often when pages_to_scan is set low).
+		 */
+		lru_add_drain_all();
+
+		/*
+		 * Whereas stale stable_nodes on the stable_tree itself
+		 * get pruned in the regular course of stable_tree_search(),
+		 * those moved out to the migrate_nodes list can accumulate:
+		 * so prune them once before each full scan.
+		 */
+		if (!ksm_merge_across_nodes) {
+			struct stable_node *stable_node;
+			struct list_head *this, *next;
+			struct page *node_page;
+
+			list_for_each_safe(this, next, &migrate_nodes) {
+				stable_node = list_entry(this,
+						struct stable_node, list);
+				node_page = get_ksm_page(stable_node, false);
+				if (node_page)
+					put_page(node_page);
+				cond_resched();
+			}
+		}
+
+		/* Start every full scan with empty unstable trees. */
+		for (nid = 0; nid < ksm_nr_node_ids; nid++)
+			root_unstable_tree[nid] = RB_ROOT;
+
+		spin_lock(&ksm_mmlist_lock);
+		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
+		ksm_scan.mm_slot = slot;
+		spin_unlock(&ksm_mmlist_lock);
+		/*
+		 * Although we tested list_empty() above, a racing __ksm_exit
+		 * of the last mm on the list may have removed it since then.
+		 */
+		if (slot == &ksm_mm_head)
+			return NULL;
+next_mm:
+		ksm_scan.address = 0;
+		ksm_scan.rmap_list = &slot->rmap_list;
+	}
+
+	mm = slot->mm;
+	down_read(&mm->mmap_sem);
+	if (ksm_test_exit(mm))
+		vma = NULL;
+	else
+		vma = find_vma(mm, ksm_scan.address);
+
+	for (; vma; vma = vma->vm_next) {
+		if (!(vma->vm_flags & VM_MERGEABLE))
+			continue;
+		if (ksm_scan.address < vma->vm_start)
+			ksm_scan.address = vma->vm_start;
+		/* No anon_vma: skip the whole vma. */
+		if (!vma->anon_vma)
+			ksm_scan.address = vma->vm_end;
+
+		while (ksm_scan.address < vma->vm_end) {
+			if (ksm_test_exit(mm))
+				break;
+			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
+			if (IS_ERR_OR_NULL(*page)) {
+				ksm_scan.address += PAGE_SIZE;
+				cond_resched();
+				continue;
+			}
+			if (PageAnon(*page) ||
+			    page_trans_compound_anon(*page)) {
+				flush_anon_page(vma, *page, ksm_scan.address);
+				flush_dcache_page(*page);
+				rmap_item = get_next_rmap_item(slot,
+					ksm_scan.rmap_list, ksm_scan.address);
+				if (rmap_item) {
+					ksm_scan.rmap_list =
+							&rmap_item->rmap_list;
+					ksm_scan.address += PAGE_SIZE;
+				} else
+					put_page(*page);
+				up_read(&mm->mmap_sem);
+				return rmap_item;
+			}
+			put_page(*page);
+			ksm_scan.address += PAGE_SIZE;
+			cond_resched();
+		}
+	}
+
+	if (ksm_test_exit(mm)) {
+		ksm_scan.address = 0;
+		ksm_scan.rmap_list = &slot->rmap_list;
+	}
+	/*
+	 * Nuke all the rmap_items that are above this current rmap:
+	 * because there were no VM_MERGEABLE vmas with such addresses.
+	 */
+	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
+
+	spin_lock(&ksm_mmlist_lock);
+	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
+						struct mm_slot, mm_list);
+	if (ksm_scan.address == 0) {
+		/*
+		 * We've completed a full scan of all vmas, holding mmap_sem
+		 * throughout, and found no VM_MERGEABLE: so do the same as
+		 * __ksm_exit does to remove this mm from all our lists now.
+		 * This applies either when cleaning up after __ksm_exit
+		 * (but beware: we can reach here even before __ksm_exit),
+		 * or when all VM_MERGEABLE areas have been unmapped (and
+		 * mmap_sem then protects against race with MADV_MERGEABLE).
+		 */
+		hash_del(&slot->link);
+		list_del(&slot->mm_list);
+		spin_unlock(&ksm_mmlist_lock);
+
+		free_mm_slot(slot);
+		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		up_read(&mm->mmap_sem);
+		mmdrop(mm);
+	} else {
+		/*
+		 * Release mmap_sem before ksm_mmlist_lock.  The slot is
+		 * still hashed but the cursor no longer points at it, so
+		 * once ksm_mmlist_lock is dropped __ksm_exit() may free the
+		 * slot and drop its mm reference: mm must not be touched
+		 * after spin_unlock().
+		 */
+		up_read(&mm->mmap_sem);
+		spin_unlock(&ksm_mmlist_lock);
+	}
+
+	/* Repeat until we've completed scanning the whole list */
+	slot = ksm_scan.mm_slot;
+	if (slot != &ksm_mm_head)
+		goto next_mm;
+
+	ksm_scan.seqnr++;
+	return NULL;
+}
+
+/**
+ * ksm_do_scan  - the ksm scanner main worker function.
+ * @scan_npages - number of pages we want to scan before we return.
+ */
+static void ksm_do_scan(unsigned int scan_npages)
+{
+	struct rmap_item *rmap_item;
+	struct page *uninitialized_var(page);
+
+	while (scan_npages-- && likely(!freezing(current))) {
+		cond_resched();
+		rmap_item = scan_get_next_rmap_item(&page);
+		if (!rmap_item)
+			return;
+		cmp_and_merge_page(page, rmap_item);
+		/* Drop the reference scan_get_next_rmap_item() took. */
+		put_page(page);
+	}
+}
+
+/* ksmd has work only while KSM_RUN_MERGE is set and some mm is registered. */
+static int ksmd_should_run(void)
+{
+	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
+}
+
+/*
+ * Main loop of the ksmd kernel thread: scan a batch under ksm_thread_mutex,
+ * then either sleep ksm_thread_sleep_millisecs or wait on ksm_thread_wait
+ * until there is work again.
+ */
+static int ksm_scan_thread(void *nothing)
+{
+	set_freezable();
+	set_user_nice(current, 5);
+
+	while (!kthread_should_stop()) {
+		mutex_lock(&ksm_thread_mutex);
+		wait_while_offlining();
+		if (ksmd_should_run())
+			ksm_do_scan(ksm_thread_pages_to_scan);
+		mutex_unlock(&ksm_thread_mutex);
+
+		try_to_freeze();
+
+		if (ksmd_should_run()) {
+			schedule_timeout_interruptible(
+				msecs_to_jiffies(ksm_thread_sleep_millisecs));
+		} else {
+			wait_event_freezable(ksm_thread_wait,
+				ksmd_should_run() || kthread_should_stop());
+		}
+	}
+	return 0;
+}
+
+/*
+ * ksm_madvise - apply MADV_MERGEABLE / MADV_UNMERGEABLE to a vma
+ * @vma: the vma being advised
+ * @start, @end: range handed to unmerge_ksm_pages() for MADV_UNMERGEABLE
+ * @advice: MADV_MERGEABLE or MADV_UNMERGEABLE; anything else is a no-op
+ * @vm_flags: in/out vma flags; VM_MERGEABLE is set or cleared here
+ *
+ * Returns 0 on success or when the advice is ignored, otherwise the error
+ * from __ksm_enter() or unmerge_ksm_pages().
+ */
+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, int advice, unsigned long *vm_flags)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int err;
+
+	switch (advice) {
+	case MADV_MERGEABLE:
+		/*
+		 * Be somewhat over-protective for now!
+		 */
+		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
+				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
+				 VM_HUGETLB | VM_MIXEDMAP))
+			return 0;		/* just ignore the advice */
+
+#ifdef VM_SAO
+		if (*vm_flags & VM_SAO)
+			return 0;
+#endif
+
+		/* First mergeable vma in this mm: register the mm. */
+		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+			err = __ksm_enter(mm);
+			if (err)
+				return err;
+		}
+
+		*vm_flags |= VM_MERGEABLE;
+		break;
+
+	case MADV_UNMERGEABLE:
+		if (!(*vm_flags & VM_MERGEABLE))
+			return 0;		/* just ignore the advice */
+
+		if (vma->anon_vma) {
+			err = unmerge_ksm_pages(vma, start, end);
+			if (err)
+				return err;
+		}
+
+		*vm_flags &= ~VM_MERGEABLE;
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Register @mm with KSM: allocate an mm_slot, hash it, link it into the mm
+ * list, set MMF_VM_MERGEABLE and take an mm_count reference.  Wakes ksmd if
+ * the list was empty beforehand.  Returns 0, or -ENOMEM if no slot could be
+ * allocated.
+ */
+int __ksm_enter(struct mm_struct *mm)
+{
+	struct mm_slot *mm_slot;
+	int needs_wakeup;
+
+	mm_slot = alloc_mm_slot();
+	if (!mm_slot)
+		return -ENOMEM;
+
+	/* Check ksm_run too?  Would need tighter locking */
+	needs_wakeup = list_empty(&ksm_mm_head.mm_list);
+
+	spin_lock(&ksm_mmlist_lock);
+	insert_to_mm_slots_hash(mm, mm_slot);
+	/*
+	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
+	 * insert just behind the scanning cursor, to let the area settle
+	 * down a little; when fork is followed by immediate exec, we don't
+	 * want ksmd to waste time setting up and tearing down an rmap_list.
+	 *
+	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
+	 * scanning cursor, otherwise KSM pages in newly forked mms will be
+	 * missed: then we might as well insert at the end of the list.
+	 */
+	if (ksm_run & KSM_RUN_UNMERGE)
+		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+	else
+		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+	spin_unlock(&ksm_mmlist_lock);
+
+	set_bit(MMF_VM_MERGEABLE, &mm->flags);
+	atomic_inc(&mm->mm_count);
+
+	if (needs_wakeup)
+		wake_up_interruptible(&ksm_thread_wait);
+
+	return 0;
+}
+
+/* Detach an exiting @mm from KSM; see the comment in the body for the cases. */
+void __ksm_exit(struct mm_struct *mm)
+{
+	struct mm_slot *mm_slot;
+	int easy_to_free = 0;
+
+	/*
+	 * This process is exiting: if it's straightforward (as is the
+	 * case when ksmd was never running), free mm_slot immediately.
+	 * But if it's at the cursor or has rmap_items linked to it, use
+	 * mmap_sem to synchronize with any break_cows before pagetables
+	 * are freed, and leave the mm_slot on the list for ksmd to free.
+	 * Beware: ksm may already have noticed it exiting and freed the slot.
+	 */
+
+	spin_lock(&ksm_mmlist_lock);
+	mm_slot = get_mm_slot(mm);
+	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
+		if (!mm_slot->rmap_list) {
+			hash_del(&mm_slot->link);
+			list_del(&mm_slot->mm_list);
+			easy_to_free = 1;
+		} else {
+			/* Requeue right behind the cursor for ksmd to reap. */
+			list_move(&mm_slot->mm_list,
+				  &ksm_scan.mm_slot->mm_list);
+		}
+	}
+	spin_unlock(&ksm_mmlist_lock);
+
+	if (easy_to_free) {
+		free_mm_slot(mm_slot);
+		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		mmdrop(mm);
+	} else if (mm_slot) {
+		/* Barrier only: wait out any reader holding mmap_sem. */
+		down_write(&mm->mmap_sem);
+		up_write(&mm->mmap_sem);
+	}
+}
+
+/*
+ * Decide whether @page may be mapped at @address in @vma as it is, or needs
+ * a private copy.  Returns @page itself when no copy is needed (or when it
+ * is not uptodate), a newly allocated locked, dirty, uptodate copy
+ * otherwise, or NULL if that allocation failed.
+ */
+struct page *ksm_might_need_to_copy(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = page_anon_vma(page);
+	struct page *new_page;
+
+	if (PageKsm(page)) {
+		if (page_stable_node(page) &&
+		    !(ksm_run & KSM_RUN_UNMERGE))
+			return page;	/* no need to copy it */
+	} else if (!anon_vma) {
+		return page;		/* no need to copy it */
+	} else if (anon_vma->root == vma->anon_vma->root &&
+		 page->index == linear_page_index(vma, address)) {
+		return page;		/* still no need to copy it */
+	}
+	if (!PageUptodate(page))
+		return page;		/* let do_swap_page report the error */
+
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (new_page) {
+		copy_user_highpage(new_page, page, address, vma);
+
+		SetPageDirty(new_page);
+		__SetPageUptodate(new_page);
+		__set_page_locked(new_page);
+	}
+
+	return new_page;
+}
+
+/*
+ * rmap_walk_ksm - call rwc->rmap_one() for every vma that may map ksm @page
+ *
+ * Walks the rmap_items hanging off the page's stable node twice: the first
+ * pass visits only vmas in the rmap_item's own mm, the second only vmas in
+ * other mms.  Stops early when rmap_one() returns anything but SWAP_AGAIN or
+ * when rwc->done() reports completion.  Returns the last rmap_one() result,
+ * or SWAP_AGAIN if nothing was visited.
+ */
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+{
+	struct stable_node *stable_node;
+	struct rmap_item *rmap_item;
+	int ret = SWAP_AGAIN;
+	int search_new_forks = 0;
+
+	VM_BUG_ON_PAGE(!PageKsm(page), page);
+
+	/*
+	 * Rely on the page lock to protect against concurrent modifications
+	 * to that page's node of the stable tree.
+	 */
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+	stable_node = page_stable_node(page);
+	if (!stable_node)
+		return ret;
+again:
+	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
+		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct anon_vma_chain *vmac;
+		struct vm_area_struct *vma;
+
+		/*
+		 * Both lists can be very long for a heavily shared page:
+		 * give the scheduler a chance on every iteration.
+		 */
+		cond_resched();
+		anon_vma_lock_read(anon_vma);
+		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+					       0, ULONG_MAX) {
+			cond_resched();
+			vma = vmac->vma;
+			if (rmap_item->address < vma->vm_start ||
+			    rmap_item->address >= vma->vm_end)
+				continue;
+			/*
+			 * Initially we examine only the vma which covers this
+			 * rmap_item; but later, if there is still work to do,
+			 * we examine covering vmas in other mms: in case they
+			 * were forked from the original since ksmd passed.
+			 */
+			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+				continue;
+
+			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+				continue;
+
+			ret = rwc->rmap_one(page, vma,
+					rmap_item->address, rwc->arg);
+			if (ret != SWAP_AGAIN) {
+				anon_vma_unlock_read(anon_vma);
+				goto out;
+			}
+			if (rwc->done && rwc->done(page)) {
+				anon_vma_unlock_read(anon_vma);
+				goto out;
+			}
+		}
+		anon_vma_unlock_read(anon_vma);
+	}
+	if (!search_new_forks++)
+		goto again;
+out:
+	return ret;
+}
+
+#ifdef CONFIG_MIGRATION
+/*
+ * After a ksm page has been migrated, point its stable node at @newpage and
+ * detach @oldpage from it.  Both pages must be locked.
+ */
+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+{
+	struct stable_node *stable_node;
+
+	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+	VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
+
+	stable_node = page_stable_node(newpage);
+	if (stable_node) {
+		VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
+		stable_node->kpfn = page_to_pfn(newpage);
+		/*
+		 * newpage->mapping was set in advance; now we need smp_wmb()
+		 * to make sure that the new stable_node->kpfn is visible
+		 * to get_ksm_page() before it can see that oldpage->mapping
+		 * has gone stale (or that PageSwapCache has been cleared).
+		 */
+		smp_wmb();
+		set_page_stable_node(oldpage, NULL);
+	}
+}
+#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * Called with ksm_thread_mutex held: while KSM_RUN_OFFLINE is set, drop the
+ * mutex, wait for the bit to clear, and retake the mutex.
+ */
+static void wait_while_offlining(void)
+{
+	while (ksm_run & KSM_RUN_OFFLINE) {
+		mutex_unlock(&ksm_thread_mutex);
+		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
+			    TASK_UNINTERRUPTIBLE);
+		mutex_lock(&ksm_thread_mutex);
+	}
+}
+
+/*
+ * Remove every stable node whose kpfn lies in [start_pfn, end_pfn), both in
+ * the per-nid stable trees and on the migrate_nodes list.
+ */
+static void ksm_check_stable_tree(unsigned long start_pfn,
+				  unsigned long end_pfn)
+{
+	struct stable_node *stable_node;
+	struct list_head *this, *next;
+	struct rb_node *node;
+	int nid;
+
+	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+		node = rb_first(root_stable_tree + nid);
+		while (node) {
+			stable_node = rb_entry(node, struct stable_node, node);
+			if (stable_node->kpfn >= start_pfn &&
+			    stable_node->kpfn < end_pfn) {
+				/*
+				 * Don't get_ksm_page, page has already gone:
+				 * which is why we keep kpfn instead of page*
+				 */
+				remove_node_from_stable_tree(stable_node);
+				/* The tree changed: restart from the first node. */
+				node = rb_first(root_stable_tree + nid);
+			} else
+				node = rb_next(node);
+			cond_resched();
+		}
+	}
+	list_for_each_safe(this, next, &migrate_nodes) {
+		stable_node = list_entry(this, struct stable_node, list);
+		if (stable_node->kpfn >= start_pfn &&
+		    stable_node->kpfn < end_pfn)
+			remove_node_from_stable_tree(stable_node);
+		cond_resched();
+	}
+}
+
+/*
+ * Memory hotplug notifier: sets KSM_RUN_OFFLINE while memory is going
+ * offline, prunes stable nodes of offlined pfns, and clears the bit (waking
+ * waiters) once offlining has finished or been cancelled.  Always returns
+ * NOTIFY_OK.
+ */
+static int ksm_memory_callback(struct notifier_block *self,
+			       unsigned long action, void *arg)
+{
+	struct memory_notify *mn = arg;
+
+	switch (action) {
+	case MEM_GOING_OFFLINE:
+		/*
+		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
+		 * and remove_all_stable_nodes() while memory is going offline:
+		 * it is unsafe for them to touch the stable tree at this time.
+		 * But unmerge_ksm_pages(), rmap lookups and other entry points
+		 * which do not need the ksm_thread_mutex are all safe.
+		 */
+		mutex_lock(&ksm_thread_mutex);
+		ksm_run |= KSM_RUN_OFFLINE;
+		mutex_unlock(&ksm_thread_mutex);
+		break;
+
+	case MEM_OFFLINE:
+		/*
+		 * Most of the work is done by page migration; but there might
+		 * be a few stable_nodes left over, still pointing to struct
+		 * pages which have been offlined: prune those from the tree,
+		 * otherwise get_ksm_page() might later try to access a
+		 * non-existent struct page.
+		 */
+		ksm_check_stable_tree(mn->start_pfn,
+				      mn->start_pfn + mn->nr_pages);
+		/* fallthrough */
+
+	case MEM_CANCEL_OFFLINE:
+		mutex_lock(&ksm_thread_mutex);
+		ksm_run &= ~KSM_RUN_OFFLINE;
+		mutex_unlock(&ksm_thread_mutex);
+
+		smp_mb();	/* wake_up_bit advises this */
+		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
+		break;
+	}
+	return NOTIFY_OK;
+}
+#else
+/* Without memory hot-remove there is never anything to wait for. */
+static void wait_while_offlining(void)
+{
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+#ifdef CONFIG_SYSFS
+/*
+ * This all compiles without CONFIG_SYSFS, but is a waste of space.
+ */
+
+#define KSM_ATTR_RO(_name) \
+	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define KSM_ATTR(_name) \
+	static struct kobj_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
+
+/* sysfs: print ksm_thread_sleep_millisecs. */
+static ssize_t sleep_millisecs_show(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
+}
+
+/* sysfs: set ksm_thread_sleep_millisecs; -EINVAL unless decimal <= UINT_MAX. */
+static ssize_t sleep_millisecs_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	unsigned long msecs;
+	int err;
+
+	err = kstrtoul(buf, 10, &msecs);
+	if (err || msecs > UINT_MAX)
+		return -EINVAL;
+
+	ksm_thread_sleep_millisecs = msecs;
+
+	return count;
+}
+KSM_ATTR(sleep_millisecs);
+
+/* sysfs: print ksm_thread_pages_to_scan. */
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
+}
+
+/* sysfs: set ksm_thread_pages_to_scan; -EINVAL unless decimal <= UINT_MAX. */
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	int err;
+	unsigned long nr_pages;
+
+	err = kstrtoul(buf, 10, &nr_pages);
+	if (err || nr_pages > UINT_MAX)
+		return -EINVAL;
+
+	ksm_thread_pages_to_scan = nr_pages;
+
+	return count;
+}
+KSM_ATTR(pages_to_scan);
+
+/* sysfs: print the current ksm_run value. */
+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
+			char *buf)
+{
+	return sprintf(buf, "%lu\n", ksm_run);
+}
+
+/*
+ * sysfs: set ksm_run.  Values above KSM_RUN_UNMERGE are rejected with
+ * -EINVAL.  If unmerging fails, ksm_run falls back to KSM_RUN_STOP and the
+ * unmerge error is returned instead of @count.
+ */
+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
+			 const char *buf, size_t count)
+{
+	int err;
+	unsigned long flags;
+
+	err = kstrtoul(buf, 10, &flags);
+	if (err || flags > UINT_MAX)
+		return -EINVAL;
+	if (flags > KSM_RUN_UNMERGE)
+		return -EINVAL;
+
+	/*
+	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
+	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
+	 * breaking COW to free the pages_shared (but leaves mm_slots
+	 * on the list for when ksmd may be set running again).
+	 */
+
+	mutex_lock(&ksm_thread_mutex);
+	wait_while_offlining();
+	if (ksm_run != flags) {
+		ksm_run = flags;
+		if (flags & KSM_RUN_UNMERGE) {
+			set_current_oom_origin();
+			err = unmerge_and_remove_all_rmap_items();
+			clear_current_oom_origin();
+			if (err) {
+				ksm_run = KSM_RUN_STOP;
+				count = err;
+			}
+		}
+	}
+	mutex_unlock(&ksm_thread_mutex);
+
+	if (flags & KSM_RUN_MERGE)
+		wake_up_interruptible(&ksm_thread_wait);
+
+	return count;
+}
+KSM_ATTR(run);
+
+#ifdef CONFIG_NUMA
+/* sysfs: print ksm_merge_across_nodes. */
+static ssize_t merge_across_nodes_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
+}
+
+/*
+ * sysfs: switch ksm_merge_across_nodes (0 or 1).  Returns the kstrtoul()
+ * error for unparsable input, -EINVAL for values above 1, -EBUSY while
+ * pages are still shared or stable nodes cannot be removed, and -ENOMEM if
+ * the per-node root arrays cannot be allocated.
+ */
+static ssize_t merge_across_nodes_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	int err;
+	unsigned long knob;
+
+	err = kstrtoul(buf, 10, &knob);
+	if (err)
+		return err;
+	if (knob > 1)
+		return -EINVAL;
+
+	mutex_lock(&ksm_thread_mutex);
+	wait_while_offlining();
+	if (ksm_merge_across_nodes != knob) {
+		if (ksm_pages_shared || remove_all_stable_nodes())
+			err = -EBUSY;
+		else if (root_stable_tree == one_stable_tree) {
+			/* Named so as not to shadow the buf parameter. */
+			struct rb_root *roots;
+			/*
+			 * This is the first time that we switch away from the
+			 * default of merging across nodes: must now allocate
+			 * a buffer to hold as many roots as may be needed.
+			 * Allocate stable and unstable together:
+			 * MAXSMP NODES_SHIFT 10 will use 16kB.
+			 */
+			roots = kcalloc(nr_node_ids + nr_node_ids,
+					sizeof(*roots), GFP_KERNEL);
+			/*
+			 * The zeroed array is used as is: this assumes that
+			 * RB_ROOT (a NULL rb_node) is all-zero.
+			 */
+			if (!roots)
+				err = -ENOMEM;
+			else {
+				root_stable_tree = roots;
+				root_unstable_tree = roots + nr_node_ids;
+				/* Stable tree is empty but not the unstable */
+				root_unstable_tree[0] = one_unstable_tree[0];
+			}
+		}
+		if (!err) {
+			ksm_merge_across_nodes = knob;
+			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
+		}
+	}
+	mutex_unlock(&ksm_thread_mutex);
+
+	return err ? err : count;
+}
+KSM_ATTR(merge_across_nodes);
+#endif
+
+/* sysfs: print ksm_pages_shared. */
+static ssize_t pages_shared_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", ksm_pages_shared);
+}
+KSM_ATTR_RO(pages_shared);
+
+/* sysfs: print ksm_pages_sharing. */
+static ssize_t pages_sharing_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", ksm_pages_sharing);
+}
+KSM_ATTR_RO(pages_sharing);
+
+/* sysfs: print ksm_pages_unshared. */
+static ssize_t pages_unshared_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", ksm_pages_unshared);
+}
+KSM_ATTR_RO(pages_unshared);
+
+static ssize_t pages_volatile_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	long ksm_pages_volatile;
+
+	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
+				- ksm_pages_sharing - ksm_pages_unshared;
+	/*
+	 * It was not worth any locking to calculate that statistic,
+	 * but it might therefore sometimes be negative: conceal that.
+ */ + if (ksm_pages_volatile < 0) + ksm_pages_volatile = 0; + return sprintf(buf, "%ld\n", ksm_pages_volatile); +} +KSM_ATTR_RO(pages_volatile); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_scan.seqnr); +} +KSM_ATTR_RO(full_scans); + +static struct attribute *ksm_attrs[] = { + &sleep_millisecs_attr.attr, + &pages_to_scan_attr.attr, + &run_attr.attr, + &pages_shared_attr.attr, + &pages_sharing_attr.attr, + &pages_unshared_attr.attr, + &pages_volatile_attr.attr, + &full_scans_attr.attr, +#ifdef CONFIG_NUMA + &merge_across_nodes_attr.attr, +#endif + NULL, +}; + +static struct attribute_group ksm_attr_group = { + .attrs = ksm_attrs, + .name = "ksm", +}; +#endif /* CONFIG_SYSFS */ + +static int __init ksm_init(void) +{ + struct task_struct *ksm_thread; + int err; + + err = ksm_slab_init(); + if (err) + goto out; + + ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); + if (IS_ERR(ksm_thread)) { + pr_err("ksm: creating kthread failed\n"); + err = PTR_ERR(ksm_thread); + goto out_free; + } + +#ifdef CONFIG_SYSFS + err = sysfs_create_group(mm_kobj, &ksm_attr_group); + if (err) { + pr_err("ksm: register sysfs failed\n"); + kthread_stop(ksm_thread); + goto out_free; + } +#else + ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ + +#endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MEMORY_HOTREMOVE + /* There is no significance to this priority 100 */ + hotplug_memory_notifier(ksm_memory_callback, 100); +#endif + return 0; + +out_free: + ksm_slab_free(); +out: + return err; +} +subsys_initcall(ksm_init); diff --git a/kernel/mm/list_lru.c b/kernel/mm/list_lru.c new file mode 100644 index 000000000..909eca2c8 --- /dev/null +++ b/kernel/mm/list_lru.c @@ -0,0 +1,561 @@ +/* + * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. 
+ * Authors: David Chinner and Glauber Costa + * + * Generic LRU infrastructure + */ +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_MEMCG_KMEM +static LIST_HEAD(list_lrus); +static DEFINE_MUTEX(list_lrus_mutex); + +static void list_lru_register(struct list_lru *lru) +{ + mutex_lock(&list_lrus_mutex); + list_add(&lru->list, &list_lrus); + mutex_unlock(&list_lrus_mutex); +} + +static void list_lru_unregister(struct list_lru *lru) +{ + mutex_lock(&list_lrus_mutex); + list_del(&lru->list); + mutex_unlock(&list_lrus_mutex); +} +#else +static void list_lru_register(struct list_lru *lru) +{ +} + +static void list_lru_unregister(struct list_lru *lru) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_MEMCG_KMEM +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return !!lru->node[0].memcg_lrus; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +{ + /* + * The lock protects the array of per cgroup lists from relocation + * (see memcg_update_list_lru_node). 
+ */ + lockdep_assert_held(&nlru->lock); + if (nlru->memcg_lrus && idx >= 0) + return nlru->memcg_lrus->lru[idx]; + + return &nlru->lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +{ + struct mem_cgroup *memcg; + + if (!nlru->memcg_lrus) + return &nlru->lru; + + memcg = mem_cgroup_from_kmem(ptr); + if (!memcg) + return &nlru->lru; + + return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); +} +#else +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return false; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +{ + return &nlru->lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +{ + return &nlru->lru; +} +#endif /* CONFIG_MEMCG_KMEM */ + +bool list_lru_add(struct list_lru *lru, struct list_head *item) +{ + int nid = page_to_nid(virt_to_page(item)); + struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + + spin_lock(&nlru->lock); + l = list_lru_from_kmem(nlru, item); + if (list_empty(item)) { + list_add_tail(item, &l->list); + l->nr_items++; + spin_unlock(&nlru->lock); + return true; + } + spin_unlock(&nlru->lock); + return false; +} +EXPORT_SYMBOL_GPL(list_lru_add); + +bool list_lru_del(struct list_lru *lru, struct list_head *item) +{ + int nid = page_to_nid(virt_to_page(item)); + struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + + spin_lock(&nlru->lock); + l = list_lru_from_kmem(nlru, item); + if (!list_empty(item)) { + list_del_init(item); + l->nr_items--; + spin_unlock(&nlru->lock); + return true; + } + spin_unlock(&nlru->lock); + return false; +} +EXPORT_SYMBOL_GPL(list_lru_del); + +void list_lru_isolate(struct list_lru_one *list, struct list_head *item) +{ + list_del_init(item); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate); + +void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, + struct 
list_head *head) +{ + list_move(item, head); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate_move); + +static unsigned long __list_lru_count_one(struct list_lru *lru, + int nid, int memcg_idx) +{ + struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + unsigned long count; + + spin_lock(&nlru->lock); + l = list_lru_from_memcg_idx(nlru, memcg_idx); + count = l->nr_items; + spin_unlock(&nlru->lock); + + return count; +} + +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg) +{ + return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); +} +EXPORT_SYMBOL_GPL(list_lru_count_one); + +unsigned long list_lru_count_node(struct list_lru *lru, int nid) +{ + long count = 0; + int memcg_idx; + + count += __list_lru_count_one(lru, nid, -1); + if (list_lru_memcg_aware(lru)) { + for_each_memcg_cache_index(memcg_idx) + count += __list_lru_count_one(lru, nid, memcg_idx); + } + return count; +} +EXPORT_SYMBOL_GPL(list_lru_count_node); + +static unsigned long +__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + + struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + struct list_head *item, *n; + unsigned long isolated = 0; + + spin_lock(&nlru->lock); + l = list_lru_from_memcg_idx(nlru, memcg_idx); +restart: + list_for_each_safe(item, n, &l->list) { + enum lru_status ret; + + /* + * decrement nr_to_walk first so that we don't livelock if we + * get stuck on large numbesr of LRU_RETRY items + */ + if (!*nr_to_walk) + break; + --*nr_to_walk; + + ret = isolate(item, l, &nlru->lock, cb_arg); + switch (ret) { + case LRU_REMOVED_RETRY: + assert_spin_locked(&nlru->lock); + case LRU_REMOVED: + isolated++; + /* + * If the lru lock has been dropped, our list + * traversal is now invalid and so we have to + * restart from scratch. 
+ */ + if (ret == LRU_REMOVED_RETRY) + goto restart; + break; + case LRU_ROTATE: + list_move_tail(item, &l->list); + break; + case LRU_SKIP: + break; + case LRU_RETRY: + /* + * The lru lock has been dropped, our list traversal is + * now invalid and so we have to restart from scratch. + */ + assert_spin_locked(&nlru->lock); + goto restart; + default: + BUG(); + } + } + + spin_unlock(&nlru->lock); + return isolated; +} + +unsigned long +list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), + isolate, cb_arg, nr_to_walk); +} +EXPORT_SYMBOL_GPL(list_lru_walk_one); + +unsigned long list_lru_walk_node(struct list_lru *lru, int nid, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + long isolated = 0; + int memcg_idx; + + isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, + nr_to_walk); + if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { + for_each_memcg_cache_index(memcg_idx) { + isolated += __list_lru_walk_one(lru, nid, memcg_idx, + isolate, cb_arg, nr_to_walk); + if (*nr_to_walk <= 0) + break; + } + } + return isolated; +} +EXPORT_SYMBOL_GPL(list_lru_walk_node); + +static void init_one_lru(struct list_lru_one *l) +{ + INIT_LIST_HEAD(&l->list); + l->nr_items = 0; +} + +#ifdef CONFIG_MEMCG_KMEM +static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, + int begin, int end) +{ + int i; + + for (i = begin; i < end; i++) + kfree(memcg_lrus->lru[i]); +} + +static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, + int begin, int end) +{ + int i; + + for (i = begin; i < end; i++) { + struct list_lru_one *l; + + l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL); + if (!l) + goto fail; + + init_one_lru(l); + memcg_lrus->lru[i] = l; + } + return 0; +fail: + __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1); + return -ENOMEM; +} + +static int 
memcg_init_list_lru_node(struct list_lru_node *nlru) +{ + int size = memcg_nr_cache_ids; + + nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL); + if (!nlru->memcg_lrus) + return -ENOMEM; + + if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { + kfree(nlru->memcg_lrus); + return -ENOMEM; + } + + return 0; +} + +static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) +{ + __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); + kfree(nlru->memcg_lrus); +} + +static int memcg_update_list_lru_node(struct list_lru_node *nlru, + int old_size, int new_size) +{ + struct list_lru_memcg *old, *new; + + BUG_ON(old_size > new_size); + + old = nlru->memcg_lrus; + new = kmalloc(new_size * sizeof(void *), GFP_KERNEL); + if (!new) + return -ENOMEM; + + if (__memcg_init_list_lru_node(new, old_size, new_size)) { + kfree(new); + return -ENOMEM; + } + + memcpy(new, old, old_size * sizeof(void *)); + + /* + * The lock guarantees that we won't race with a reader + * (see list_lru_from_memcg_idx). + * + * Since list_lru_{add,del} may be called under an IRQ-safe lock, + * we have to use IRQ-safe primitives here to avoid deadlock. 
+ */ + spin_lock_irq(&nlru->lock); + nlru->memcg_lrus = new; + spin_unlock_irq(&nlru->lock); + + kfree(old); + return 0; +} + +static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, + int old_size, int new_size) +{ + /* do not bother shrinking the array back to the old size, because we + * cannot handle allocation failures here */ + __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); +} + +static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ + int i; + + for (i = 0; i < nr_node_ids; i++) { + if (!memcg_aware) + lru->node[i].memcg_lrus = NULL; + else if (memcg_init_list_lru_node(&lru->node[i])) + goto fail; + } + return 0; +fail: + for (i = i - 1; i >= 0; i--) + memcg_destroy_list_lru_node(&lru->node[i]); + return -ENOMEM; +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_destroy_list_lru_node(&lru->node[i]); +} + +static int memcg_update_list_lru(struct list_lru *lru, + int old_size, int new_size) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return 0; + + for (i = 0; i < nr_node_ids; i++) { + if (memcg_update_list_lru_node(&lru->node[i], + old_size, new_size)) + goto fail; + } + return 0; +fail: + for (i = i - 1; i >= 0; i--) + memcg_cancel_update_list_lru_node(&lru->node[i], + old_size, new_size); + return -ENOMEM; +} + +static void memcg_cancel_update_list_lru(struct list_lru *lru, + int old_size, int new_size) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_cancel_update_list_lru_node(&lru->node[i], + old_size, new_size); +} + +int memcg_update_all_list_lrus(int new_size) +{ + int ret = 0; + struct list_lru *lru; + int old_size = memcg_nr_cache_ids; + + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &list_lrus, list) { + ret = memcg_update_list_lru(lru, old_size, new_size); + if (ret) + goto fail; + } +out: + 
mutex_unlock(&list_lrus_mutex); + return ret; +fail: + list_for_each_entry_continue_reverse(lru, &list_lrus, list) + memcg_cancel_update_list_lru(lru, old_size, new_size); + goto out; +} + +static void memcg_drain_list_lru_node(struct list_lru_node *nlru, + int src_idx, int dst_idx) +{ + struct list_lru_one *src, *dst; + + /* + * Since list_lru_{add,del} may be called under an IRQ-safe lock, + * we have to use IRQ-safe primitives here to avoid deadlock. + */ + spin_lock_irq(&nlru->lock); + + src = list_lru_from_memcg_idx(nlru, src_idx); + dst = list_lru_from_memcg_idx(nlru, dst_idx); + + list_splice_init(&src->list, &dst->list); + dst->nr_items += src->nr_items; + src->nr_items = 0; + + spin_unlock_irq(&nlru->lock); +} + +static void memcg_drain_list_lru(struct list_lru *lru, + int src_idx, int dst_idx) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); +} + +void memcg_drain_all_list_lrus(int src_idx, int dst_idx) +{ + struct list_lru *lru; + + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &list_lrus, list) + memcg_drain_list_lru(lru, src_idx, dst_idx); + mutex_unlock(&list_lrus_mutex); +} +#else +static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ + return 0; +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +int __list_lru_init(struct list_lru *lru, bool memcg_aware, + struct lock_class_key *key) +{ + int i; + size_t size = sizeof(*lru->node) * nr_node_ids; + int err = -ENOMEM; + + memcg_get_cache_ids(); + + lru->node = kzalloc(size, GFP_KERNEL); + if (!lru->node) + goto out; + + for (i = 0; i < nr_node_ids; i++) { + spin_lock_init(&lru->node[i].lock); + if (key) + lockdep_set_class(&lru->node[i].lock, key); + init_one_lru(&lru->node[i].lru); + } + + err = memcg_init_list_lru(lru, memcg_aware); + if (err) { + kfree(lru->node); + goto out; + } + + list_lru_register(lru); 
+out: + memcg_put_cache_ids(); + return err; +} +EXPORT_SYMBOL_GPL(__list_lru_init); + +void list_lru_destroy(struct list_lru *lru) +{ + /* Already destroyed or not yet initialized? */ + if (!lru->node) + return; + + memcg_get_cache_ids(); + + list_lru_unregister(lru); + + memcg_destroy_list_lru(lru); + kfree(lru->node); + lru->node = NULL; + + memcg_put_cache_ids(); +} +EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/kernel/mm/maccess.c b/kernel/mm/maccess.c new file mode 100644 index 000000000..d53adf9ba --- /dev/null +++ b/kernel/mm/maccess.c @@ -0,0 +1,62 @@ +/* + * Access kernel memory without faulting. + */ +#include +#include +#include + +/** + * probe_kernel_read(): safely attempt to read from a location + * @dst: pointer to the buffer that shall take the data + * @src: address to read from + * @size: size of the data chunk + * + * Safely read from address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. + */ + +long __weak probe_kernel_read(void *dst, const void *src, size_t size) + __attribute__((alias("__probe_kernel_read"))); + +long __probe_kernel_read(void *dst, const void *src, size_t size) +{ + long ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, + (__force const void __user *)src, size); + pagefault_enable(); + set_fs(old_fs); + + return ret ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(probe_kernel_read); + +/** + * probe_kernel_write(): safely attempt to write to a location + * @dst: address to write to + * @src: pointer to the data that shall be written + * @size: size of the data chunk + * + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. 
+ */ +long __weak probe_kernel_write(void *dst, const void *src, size_t size) + __attribute__((alias("__probe_kernel_write"))); + +long __probe_kernel_write(void *dst, const void *src, size_t size) +{ + long ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); + pagefault_enable(); + set_fs(old_fs); + + return ret ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(probe_kernel_write); diff --git a/kernel/mm/madvise.c b/kernel/mm/madvise.c new file mode 100644 index 000000000..d55147551 --- /dev/null +++ b/kernel/mm/madvise.c @@ -0,0 +1,549 @@ +/* + * linux/mm/madvise.c + * + * Copyright (C) 1999 Linus Torvalds + * Copyright (C) 2002 Christoph Hellwig + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_sem for writing. Others, which simply traverse vmas, need + * to only take it for reading. + */ +static int madvise_need_mmap_write(int behavior) +{ + switch (behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + return 0; + default: + /* be safe, default to 1. list exceptions explicitly */ + return 1; + } +} + +/* + * We can potentially split a vm area into separate + * areas, each area with its own behavior. 
+ */ +static long madvise_behavior(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, int behavior) +{ + struct mm_struct *mm = vma->vm_mm; + int error = 0; + pgoff_t pgoff; + unsigned long new_flags = vma->vm_flags; + + switch (behavior) { + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; + case MADV_SEQUENTIAL: + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; + break; + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTCOPY; + break; + case MADV_DONTDUMP: + new_flags |= VM_DONTDUMP; + break; + case MADV_DODUMP: + if (new_flags & VM_SPECIAL) { + error = -EINVAL; + goto out; + } + new_flags &= ~VM_DONTDUMP; + break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; + } + + if (new_flags == vma->vm_flags) { + *prev = vma; + goto out; + } + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma)); + if (*prev) { + vma = *prev; + goto success; + } + + *prev = vma; + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto out; + } + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto out; + } + +success: + /* + * vm_flags is protected by the mmap_sem held in write mode. 
+ */ + vma->vm_flags = new_flags; + +out: + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +#ifdef CONFIG_SWAP +static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + pte_t *orig_pte; + struct vm_area_struct *vma = walk->private; + unsigned long index; + + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + return 0; + + for (index = start; index != end; index += PAGE_SIZE) { + pte_t pte; + swp_entry_t entry; + struct page *page; + spinlock_t *ptl; + + orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); + pte = *(orig_pte + ((index - start) / PAGE_SIZE)); + pte_unmap_unlock(orig_pte, ptl); + + if (pte_present(pte) || pte_none(pte)) + continue; + entry = pte_to_swp_entry(pte); + if (unlikely(non_swap_entry(entry))) + continue; + + page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, + vma, index); + if (page) + page_cache_release(page); + } + + return 0; +} + +static void force_swapin_readahead(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_walk walk = { + .mm = vma->vm_mm, + .pmd_entry = swapin_walk_pmd_entry, + .private = vma, + }; + + walk_page_range(start, end, &walk); + + lru_add_drain(); /* Push any new pages onto the LRU now */ +} + +static void force_shm_swapin_readahead(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct address_space *mapping) +{ + pgoff_t index; + struct page *page; + swp_entry_t swap; + + for (; start < end; start += PAGE_SIZE) { + index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + page = find_get_entry(mapping, index); + if (!radix_tree_exceptional_entry(page)) { + if (page) + page_cache_release(page); + continue; + } + swap = radix_to_swp_entry(page); + page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, + NULL, 0); + if (page) + page_cache_release(page); + } + + lru_add_drain(); /* Push any new pages onto the LRU now */ +} +#endif /* CONFIG_SWAP */ + +/* + * 
Schedule all required I/O operations. Do not wait for completion. + */ +static long madvise_willneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct file *file = vma->vm_file; + +#ifdef CONFIG_SWAP + if (!file) { + *prev = vma; + force_swapin_readahead(vma, start, end); + return 0; + } + + if (shmem_mapping(file->f_mapping)) { + *prev = vma; + force_shm_swapin_readahead(vma, start, end, + file->f_mapping); + return 0; + } +#else + if (!file) + return -EBADF; +#endif + + if (IS_DAX(file_inode(file))) { + /* no bad return value, but ignore advice */ + return 0; + } + + *prev = vma; + start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (end > vma->vm_end) + end = vma->vm_end; + end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + + force_page_cache_readahead(file->f_mapping, file, start, end - start); + return 0; +} + +/* + * Application no longer needs these pages. If the pages are dirty, + * it's OK to just throw them away. The app will be more careful about + * data it wants to keep. Be sure to free swap resources too. The + * zap_page_range call sets things up for shrink_active_list to actually free + * these pages later if no one else has touched them in the meantime, + * although we could add these pages to a global reuse list for + * shrink_active_list to pick up before reclaiming other pages. + * + * NB: This interface discards data rather than pushes it out to swap, + * as some implementations do. This has performance implications for + * applications like large transactional databases which want to discard + * pages in anonymous maps after committing to backing store the data + * that was kept in them. There is no reason to write this data out to + * the swap area if the application is discarding it. + * + * An interface that causes the system to free clean pages and flush + * dirty pages is already available as msync(MS_INVALIDATE). 
+ */ +static long madvise_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + return -EINVAL; + + zap_page_range(vma, start, end - start, NULL); + return 0; +} + +/* + * Application wants to free up the pages and associated backing store. + * This is effectively punching a hole into the middle of a file. + */ +static long madvise_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + loff_t offset; + int error; + struct file *f; + + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ + + if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) + return -EINVAL; + + f = vma->vm_file; + + if (!f || !f->f_mapping || !f->f_mapping->host) { + return -EINVAL; + } + + if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + return -EACCES; + + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + + /* + * Filesystem's fallocate may need to take i_mutex. We need to + * explicitly grab a reference because the vma (and hence the + * vma's reference to the file) can go away as soon as we drop + * mmap_sem. + */ + get_file(f); + up_read(¤t->mm->mmap_sem); + error = vfs_fallocate(f, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, end - start); + fput(f); + down_read(¤t->mm->mmap_sem); + return error; +} + +#ifdef CONFIG_MEMORY_FAILURE +/* + * Error injection support for memory error handling. 
+ */ +static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) +{ + struct page *p; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for (; start < end; start += PAGE_SIZE << + compound_order(compound_head(p))) { + int ret; + + ret = get_user_pages_fast(start, 1, 0, &p); + if (ret != 1) + return ret; + + if (PageHWPoison(p)) { + put_page(p); + continue; + } + if (bhv == MADV_SOFT_OFFLINE) { + pr_info("Soft offlining page %#lx at %#lx\n", + page_to_pfn(p), start); + ret = soft_offline_page(p, MF_COUNT_INCREASED); + if (ret) + return ret; + continue; + } + pr_info("Injecting memory failure for page %#lx at %#lx\n", + page_to_pfn(p), start); + /* Ignore return value for now */ + memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + } + return 0; +} +#endif + +static long +madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, int behavior) +{ + switch (behavior) { + case MADV_REMOVE: + return madvise_remove(vma, prev, start, end); + case MADV_WILLNEED: + return madvise_willneed(vma, prev, start, end); + case MADV_DONTNEED: + return madvise_dontneed(vma, prev, start, end); + default: + return madvise_behavior(vma, prev, start, end, behavior); + } +} + +static int +madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_DOFORK: + case MADV_DONTFORK: + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: +#ifdef CONFIG_KSM + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: +#endif + case MADV_DONTDUMP: + case MADV_DODUMP: + return 1; + + default: + return 0; + } +} + +/* + * The madvise(2) system call. + * + * Applications can use madvise() to advise the kernel how it should + * handle paging I/O in this VM area. The idea is to help the kernel + * use appropriate read-ahead and caching techniques. 
The information + * provided is advisory only, and can be safely disregarded by the + * kernel without affecting the correct operation of the application. + * + * behavior values: + * MADV_NORMAL - the default behavior is to read clusters. This + * results in some read-ahead and read-behind. + * MADV_RANDOM - the system should read the minimum amount of data + * on any access, since it is unlikely that the appli- + * cation will need more than what it asks for. + * MADV_SEQUENTIAL - pages in the given range will probably be accessed + * once, so they can be aggressively read ahead, and + * can be freed soon after they are accessed. + * MADV_WILLNEED - the application is notifying the system to read + * some pages ahead. + * MADV_DONTNEED - the application is finished with the given range, + * so the kernel can free resources associated with it. + * MADV_REMOVE - the application wants to free up the given range of + * pages and associated backing store. + * MADV_DONTFORK - omit this area from child's address space when forking: + * typically, to avoid COWing pages pinned by get_user_pages(). + * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_MERGEABLE - the application recommends that KSM try to merge pages in + * this area with pages of identical content from other such areas. + * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. + * + * return values: + * zero - success + * -EINVAL - start + len < 0, start is not page-aligned, + * "behavior" is not a valid value, or application + * is attempting to release locked or shared pages. + * -ENOMEM - addresses in the specified range are not currently + * mapped, or are outside the AS of the process. + * -EIO - an I/O error occurred while paging in data. + * -EBADF - map exists, but area maps something that isn't a file. + * -EAGAIN - a kernel resource was temporarily unavailable. 
+ */
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+{
+	unsigned long end, tmp;
+	struct vm_area_struct *vma, *prev;
+	int unmapped_error = 0;
+	int error = -EINVAL;
+	int write;
+	size_t len;
+	struct blk_plug plug;
+
+#ifdef CONFIG_MEMORY_FAILURE
+	/*
+	 * Error injection is dispatched before any of the range checks
+	 * below, so madvise_hwpoison() sees the raw start/len_in values.
+	 */
+	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
+		return madvise_hwpoison(behavior, start, start+len_in);
+#endif
+	if (!madvise_behavior_valid(behavior))
+		return error;
+
+	if (start & ~PAGE_MASK)
+		return error;
+	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		return error;
+
+	end = start + len;
+	if (end < start)
+		return error;
+
+	error = 0;
+	if (end == start)
+		return error;
+
+	/* take mmap_sem for writing only when the behavior requires it */
+	write = madvise_need_mmap_write(behavior);
+	if (write)
+		down_write(&current->mm->mmap_sem);
+	else
+		down_read(&current->mm->mmap_sem);
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 * - different from the way of handling in mlock etc.
+	 */
+	vma = find_vma_prev(current->mm, start, &prev);
+	if (vma && start > vma->vm_start)
+		prev = vma;
+
+	blk_start_plug(&plug);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < (end|vma->vm_end). */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+			if (start >= end)
+				goto out;
+		}
+
+		/* Here vma->vm_start <= start < (end|vma->vm_end) */
+		tmp = vma->vm_end;
+		if (end < tmp)
+			tmp = end;
+
+		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+		error = madvise_vma(vma, &prev, start, tmp, behavior);
+		if (error)
+			goto out;
+		start = tmp;
+		if (prev && start < prev->vm_end)
+			start = prev->vm_end;
+		error = unmapped_error;
+		if (start >= end)
+			goto out;
+		if (prev)
+			vma = prev->vm_next;
+		else	/* madvise_remove dropped mmap_sem */
+			vma = find_vma(current->mm, start);
+	}
+out:
+	blk_finish_plug(&plug);
+	/* release mmap_sem in the same mode it was taken above */
+	if (write)
+		up_write(&current->mm->mmap_sem);
+	else
+		up_read(&current->mm->mmap_sem);
+
+	return error;
+}
diff --git a/kernel/mm/memblock.c b/kernel/mm/memblock.c
new file mode 100644
index 000000000..9318b567e
--- /dev/null
+++ b/kernel/mm/memblock.c
@@ -0,0 +1,1605 @@
+/*
+ * Procedures for maintaining information about logical memory blocks.
+ *
+ * Peter Bergner, IBM Corp. June 2001.
+ * Copyright (C) 2001 Peter Bergner.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * NOTE(review): the header names on the angle-bracket #include lines were
+ * missing from this copy of the patch; they are restored here from the
+ * v4.1 mm/memblock.c include list - confirm against the upstream tree.
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/poison.h>
+#include <linux/pfn.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/memblock.h>
+
+#include <asm-generic/sections.h>
+#include <linux/io.h>
+
+#include "internal.h"
+
+/* statically sized region arrays that back the memblock types below */
+static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
+#endif
+
+/* each type starts out with cnt == 1: a single empty placeholder region */
+struct memblock memblock __initdata_memblock = {
+	.memory.regions		= memblock_memory_init_regions,
+	.memory.cnt		= 1,	/* empty dummy entry */
+	.memory.max		= INIT_MEMBLOCK_REGIONS,
+
+	.reserved.regions	= memblock_reserved_init_regions,
+	.reserved.cnt		= 1,	/* empty dummy entry */
+	.reserved.max		= INIT_MEMBLOCK_REGIONS,
+
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+	.physmem.regions	= memblock_physmem_init_regions,
+	.physmem.cnt		= 1,	/* empty dummy entry */
+	.physmem.max		= INIT_PHYSMEM_REGIONS,
+#endif
+
+	.bottom_up		= false,
+	.current_limit		= MEMBLOCK_ALLOC_ANYWHERE,
+};
+
+int memblock_debug __initdata_memblock;
+#ifdef CONFIG_MOVABLE_NODE
+bool movable_node_enabled __initdata_memblock = false;
+#endif
+static int memblock_can_resize __initdata_memblock;
+static int memblock_memory_in_slab __initdata_memblock = 0;
+static int memblock_reserved_in_slab __initdata_memblock = 0;
+
+/* inline so we don't get a warning when pr_debug is compiled out */
+/* Map @type to "memory" or "reserved"; any other pointer yields "unknown". */
+static __init_memblock const char *
+memblock_type_name(struct memblock_type *type)
+{
+	if (type == &memblock.memory)
+		return "memory";
+	else if (type == &memblock.reserved)
+		return "reserved";
+	else
+		return "unknown";
+}
+
+/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
+static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
+{
+	return *size = min(*size, (phys_addr_t)ULLONG_MAX - base);
+}
+
+/*
+ * Address comparison utilities
+ */
+static unsigned long
__init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2) +{ + return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); +} + +static long __init_memblock memblock_overlaps_region(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) +{ + unsigned long i; + + for (i = 0; i < type->cnt; i++) { + phys_addr_t rgnbase = type->regions[i].base; + phys_addr_t rgnsize = type->regions[i].size; + if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) + break; + } + + return (i < type->cnt) ? i : -1; +} + +/* + * __memblock_find_range_bottom_up - find free area utility in bottom-up + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Utility called from memblock_find_in_range_node(), find free area bottom-up. + * + * RETURNS: + * Found address on success, 0 on failure. + */ +static phys_addr_t __init_memblock +__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align, int nid) +{ + phys_addr_t this_start, this_end, cand; + u64 i; + + for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); + + cand = round_up(this_start, align); + if (cand < this_end && this_end - cand >= size) + return cand; + } + + return 0; +} + +/** + * __memblock_find_range_top_down - find free area utility, in top-down + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Utility called from memblock_find_in_range_node(), find free area top-down. 
+ * + * RETURNS: + * Found address on success, 0 on failure. + */ +static phys_addr_t __init_memblock +__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align, int nid) +{ + phys_addr_t this_start, this_end, cand; + u64 i; + + for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); + + if (this_end < size) + continue; + + cand = round_down(this_end - size, align); + if (cand >= this_start) + return cand; + } + + return 0; +} + +/** + * memblock_find_in_range_node - find free area in given range and node + * @size: size of free area to find + * @align: alignment of free area to find + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Find @size free area aligned to @align in the specified range and node. + * + * When allocation direction is bottom-up, the @start should be greater + * than the end of the kernel image. Otherwise, it will be trimmed. The + * reason is that we want the bottom-up allocation just near the kernel + * image so it is highly likely that the allocated memory and the kernel + * will reside in the same node. + * + * If bottom-up allocation failed, will try to allocate memory top-down. + * + * RETURNS: + * Found address on success, 0 on failure. 
+ */ +phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid) +{ + phys_addr_t kernel_end, ret; + + /* pump up @end */ + if (end == MEMBLOCK_ALLOC_ACCESSIBLE) + end = memblock.current_limit; + + /* avoid allocating the first page */ + start = max_t(phys_addr_t, start, PAGE_SIZE); + end = max(start, end); + kernel_end = __pa_symbol(_end); + + /* + * try bottom-up allocation only when bottom-up mode + * is set and @end is above the kernel image. + */ + if (memblock_bottom_up() && end > kernel_end) { + phys_addr_t bottom_up_start; + + /* make sure we will allocate above the kernel */ + bottom_up_start = max(start, kernel_end); + + /* ok, try bottom-up allocation first */ + ret = __memblock_find_range_bottom_up(bottom_up_start, end, + size, align, nid); + if (ret) + return ret; + + /* + * we always limit bottom-up allocation above the kernel, + * but top-down allocation doesn't have the limit, so + * retrying top-down allocation may succeed when bottom-up + * allocation failed. + * + * bottom-up allocation is expected to be fail very rarely, + * so we use WARN_ONCE() here to see the stack trace if + * fail happens. + */ + WARN_ONCE(1, "memblock: bottom-up allocation failed, " + "memory hotunplug may be affected\n"); + } + + return __memblock_find_range_top_down(start, end, size, align, nid); +} + +/** + * memblock_find_in_range - find free area in given range + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * + * Find @size free area aligned to @align in the specified range. + * + * RETURNS: + * Found address on success, 0 on failure. 
+ */ +phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, + phys_addr_t end, phys_addr_t size, + phys_addr_t align) +{ + return memblock_find_in_range_node(size, align, start, end, + NUMA_NO_NODE); +} + +static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) +{ + type->total_size -= type->regions[r].size; + memmove(&type->regions[r], &type->regions[r + 1], + (type->cnt - (r + 1)) * sizeof(type->regions[r])); + type->cnt--; + + /* Special case for empty arrays */ + if (type->cnt == 0) { + WARN_ON(type->total_size != 0); + type->cnt = 1; + type->regions[0].base = 0; + type->regions[0].size = 0; + type->regions[0].flags = 0; + memblock_set_region_node(&type->regions[0], MAX_NUMNODES); + } +} + +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + +phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( + phys_addr_t *addr) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + *addr = __pa(memblock.reserved.regions); + + return PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); +} + +phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( + phys_addr_t *addr) +{ + if (memblock.memory.regions == memblock_memory_init_regions) + return 0; + + *addr = __pa(memblock.memory.regions); + + return PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.memory.max); +} + +#endif + +/** + * memblock_double_array - double the size of the memblock regions array + * @type: memblock type of the regions array being doubled + * @new_area_start: starting address of memory range to avoid overlap with + * @new_area_size: size of memory range to avoid overlap with + * + * Double the size of the @type regions array. 
If memblock is being used to + * allocate memory for a new reserved regions array and there is a previously + * allocated memory range [@new_area_start,@new_area_start+@new_area_size] + * waiting to be reserved, ensure the memory used by the new array does + * not overlap. + * + * RETURNS: + * 0 on success, -1 on failure. + */ +static int __init_memblock memblock_double_array(struct memblock_type *type, + phys_addr_t new_area_start, + phys_addr_t new_area_size) +{ + struct memblock_region *new_array, *old_array; + phys_addr_t old_alloc_size, new_alloc_size; + phys_addr_t old_size, new_size, addr; + int use_slab = slab_is_available(); + int *in_slab; + + /* We don't allow resizing until we know about the reserved regions + * of memory that aren't suitable for allocation + */ + if (!memblock_can_resize) + return -1; + + /* Calculate new doubled size */ + old_size = type->max * sizeof(struct memblock_region); + new_size = old_size << 1; + /* + * We need to allocated new one align to PAGE_SIZE, + * so we can free them completely later. + */ + old_alloc_size = PAGE_ALIGN(old_size); + new_alloc_size = PAGE_ALIGN(new_size); + + /* Retrieve the slab flag */ + if (type == &memblock.memory) + in_slab = &memblock_memory_in_slab; + else + in_slab = &memblock_reserved_in_slab; + + /* Try to find some space for it. + * + * WARNING: We assume that either slab_is_available() and we use it or + * we use MEMBLOCK for allocations. That means that this is unsafe to + * use when bootmem is currently active (unless bootmem itself is + * implemented on top of MEMBLOCK which isn't the case yet) + * + * This should however not be an issue for now, as we currently only + * call into MEMBLOCK while it's still active, or much later when slab + * is active for memory hotplug operations + */ + if (use_slab) { + new_array = kmalloc(new_size, GFP_KERNEL); + addr = new_array ? 
__pa(new_array) : 0; + } else { + /* only exclude range when trying to double reserved.regions */ + if (type != &memblock.reserved) + new_area_start = new_area_size = 0; + + addr = memblock_find_in_range(new_area_start + new_area_size, + memblock.current_limit, + new_alloc_size, PAGE_SIZE); + if (!addr && new_area_size) + addr = memblock_find_in_range(0, + min(new_area_start, memblock.current_limit), + new_alloc_size, PAGE_SIZE); + + new_array = addr ? __va(addr) : NULL; + } + if (!addr) { + pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", + memblock_type_name(type), type->max, type->max * 2); + return -1; + } + + memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", + memblock_type_name(type), type->max * 2, (u64)addr, + (u64)addr + new_size - 1); + + /* + * Found space, we now need to move the array over before we add the + * reserved region since it may be our reserved array itself that is + * full. + */ + memcpy(new_array, type->regions, old_size); + memset(new_array + type->max, 0, old_size); + old_array = type->regions; + type->regions = new_array; + type->max <<= 1; + + /* Free old array. We needn't free it if the array is the static one */ + if (*in_slab) + kfree(old_array); + else if (old_array != memblock_memory_init_regions && + old_array != memblock_reserved_init_regions) + memblock_free(__pa(old_array), old_alloc_size); + + /* + * Reserve the new array if that comes from the memblock. Otherwise, we + * needn't do it + */ + if (!use_slab) + BUG_ON(memblock_reserve(addr, new_alloc_size)); + + /* Update slab flag */ + *in_slab = use_slab; + + return 0; +} + +/** + * memblock_merge_regions - merge neighboring compatible regions + * @type: memblock type to scan + * + * Scan @type and merge neighboring compatible regions. 
+ */ +static void __init_memblock memblock_merge_regions(struct memblock_type *type) +{ + int i = 0; + + /* cnt never goes below 1 */ + while (i < type->cnt - 1) { + struct memblock_region *this = &type->regions[i]; + struct memblock_region *next = &type->regions[i + 1]; + + if (this->base + this->size != next->base || + memblock_get_region_node(this) != + memblock_get_region_node(next) || + this->flags != next->flags) { + BUG_ON(this->base + this->size > next->base); + i++; + continue; + } + + this->size += next->size; + /* move forward from next + 1, index of which is i + 2 */ + memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next)); + type->cnt--; + } +} + +/** + * memblock_insert_region - insert new memblock region + * @type: memblock type to insert into + * @idx: index for the insertion point + * @base: base address of the new region + * @size: size of the new region + * @nid: node id of the new region + * @flags: flags of the new region + * + * Insert new memblock region [@base,@base+@size) into @type at @idx. + * @type must already have extra room to accomodate the new region. + */ +static void __init_memblock memblock_insert_region(struct memblock_type *type, + int idx, phys_addr_t base, + phys_addr_t size, + int nid, unsigned long flags) +{ + struct memblock_region *rgn = &type->regions[idx]; + + BUG_ON(type->cnt >= type->max); + memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); + rgn->base = base; + rgn->size = size; + rgn->flags = flags; + memblock_set_region_node(rgn, nid); + type->cnt++; + type->total_size += size; +} + +/** + * memblock_add_range - add new memblock region + * @type: memblock type to add new region into + * @base: base address of the new region + * @size: size of the new region + * @nid: nid of the new region + * @flags: flags of the new region + * + * Add new memblock region [@base,@base+@size) into @type. The new region + * is allowed to overlap with existing ones - overlaps don't affect already + * existing regions. 
@type is guaranteed to be minimal (all neighbouring + * compatible regions are merged) after the addition. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init_memblock memblock_add_range(struct memblock_type *type, + phys_addr_t base, phys_addr_t size, + int nid, unsigned long flags) +{ + bool insert = false; + phys_addr_t obase = base; + phys_addr_t end = base + memblock_cap_size(base, &size); + int i, nr_new; + + if (!size) + return 0; + + /* special case for empty array */ + if (type->regions[0].size == 0) { + WARN_ON(type->cnt != 1 || type->total_size); + type->regions[0].base = base; + type->regions[0].size = size; + type->regions[0].flags = flags; + memblock_set_region_node(&type->regions[0], nid); + type->total_size = size; + return 0; + } +repeat: + /* + * The following is executed twice. Once with %false @insert and + * then with %true. The first counts the number of regions needed + * to accomodate the new area. The second actually inserts them. + */ + base = obase; + nr_new = 0; + + for (i = 0; i < type->cnt; i++) { + struct memblock_region *rgn = &type->regions[i]; + phys_addr_t rbase = rgn->base; + phys_addr_t rend = rbase + rgn->size; + + if (rbase >= end) + break; + if (rend <= base) + continue; + /* + * @rgn overlaps. If it separates the lower part of new + * area, insert that portion. + */ + if (rbase > base) { + nr_new++; + if (insert) + memblock_insert_region(type, i++, base, + rbase - base, nid, + flags); + } + /* area below @rend is dealt with, forget about it */ + base = min(rend, end); + } + + /* insert the remaining portion */ + if (base < end) { + nr_new++; + if (insert) + memblock_insert_region(type, i, base, end - base, + nid, flags); + } + + /* + * If this was the first round, resize array and repeat for actual + * insertions; otherwise, merge and return. 
+ */ + if (!insert) { + while (type->cnt + nr_new > type->max) + if (memblock_double_array(type, obase, size) < 0) + return -ENOMEM; + insert = true; + goto repeat; + } else { + memblock_merge_regions(type); + return 0; + } +} + +int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, + int nid) +{ + return memblock_add_range(&memblock.memory, base, size, nid, 0); +} + +static int __init_memblock memblock_add_region(phys_addr_t base, + phys_addr_t size, + int nid, + unsigned long flags) +{ + struct memblock_type *_rgn = &memblock.memory; + + memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", + (unsigned long long)base, + (unsigned long long)base + size - 1, + flags, (void *)_RET_IP_); + + return memblock_add_range(_rgn, base, size, nid, flags); +} + +int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) +{ + return memblock_add_region(base, size, MAX_NUMNODES, 0); +} + +/** + * memblock_isolate_range - isolate given range into disjoint memblocks + * @type: memblock type to isolate range for + * @base: base of range to isolate + * @size: size of range to isolate + * @start_rgn: out parameter for the start of isolated region + * @end_rgn: out parameter for the end of isolated region + * + * Walk @type and ensure that regions don't cross the boundaries defined by + * [@base,@base+@size). Crossing regions are split at the boundaries, + * which may create at most two more regions. The index of the first + * region inside the range is returned in *@start_rgn and end in *@end_rgn. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +static int __init_memblock memblock_isolate_range(struct memblock_type *type, + phys_addr_t base, phys_addr_t size, + int *start_rgn, int *end_rgn) +{ + phys_addr_t end = base + memblock_cap_size(base, &size); + int i; + + *start_rgn = *end_rgn = 0; + + if (!size) + return 0; + + /* we'll create at most two more regions */ + while (type->cnt + 2 > type->max) + if (memblock_double_array(type, base, size) < 0) + return -ENOMEM; + + for (i = 0; i < type->cnt; i++) { + struct memblock_region *rgn = &type->regions[i]; + phys_addr_t rbase = rgn->base; + phys_addr_t rend = rbase + rgn->size; + + if (rbase >= end) + break; + if (rend <= base) + continue; + + if (rbase < base) { + /* + * @rgn intersects from below. Split and continue + * to process the next region - the new top half. + */ + rgn->base = base; + rgn->size -= base - rbase; + type->total_size -= base - rbase; + memblock_insert_region(type, i, rbase, base - rbase, + memblock_get_region_node(rgn), + rgn->flags); + } else if (rend > end) { + /* + * @rgn intersects from above. Split and redo the + * current region - the new bottom half. 
+ */ + rgn->base = end; + rgn->size -= end - rbase; + type->total_size -= end - rbase; + memblock_insert_region(type, i--, rbase, end - rbase, + memblock_get_region_node(rgn), + rgn->flags); + } else { + /* @rgn is fully contained, record it */ + if (!*end_rgn) + *start_rgn = i; + *end_rgn = i + 1; + } + } + + return 0; +} + +int __init_memblock memblock_remove_range(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) +{ + int start_rgn, end_rgn; + int i, ret; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = end_rgn - 1; i >= start_rgn; i--) + memblock_remove_region(type, i); + return 0; +} + +int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) +{ + return memblock_remove_range(&memblock.memory, base, size); +} + + +int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) +{ + memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", + (unsigned long long)base, + (unsigned long long)base + size - 1, + (void *)_RET_IP_); + + kmemleak_free_part(__va(base), size); + return memblock_remove_range(&memblock.reserved, base, size); +} + +static int __init_memblock memblock_reserve_region(phys_addr_t base, + phys_addr_t size, + int nid, + unsigned long flags) +{ + struct memblock_type *type = &memblock.reserved; + + memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", + (unsigned long long)base, + (unsigned long long)base + size - 1, + flags, (void *)_RET_IP_); + + return memblock_add_range(type, base, size, nid, flags); +} + +int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +{ + return memblock_reserve_region(base, size, MAX_NUMNODES, 0); +} + +/** + * + * This function isolates region [@base, @base + @size), and sets/clears flag + * + * Return 0 on succees, -errno on failure. 
+ */
+static int __init_memblock memblock_setclr_flag(phys_addr_t base,
+				phys_addr_t size, int set, int flag)
+{
+	struct memblock_type *type = &memblock.memory;
+	int first, last, idx, err;
+
+	/* split regions so that [base, base + size) is covered exactly */
+	err = memblock_isolate_range(type, base, size, &first, &last);
+	if (err)
+		return err;
+
+	for (idx = first; idx < last; idx++) {
+		struct memblock_region *rgn = &type->regions[idx];
+
+		if (set)
+			memblock_set_region_flags(rgn, flag);
+		else
+			memblock_clear_region_flags(rgn, flag);
+	}
+
+	/* neighbours may have become compatible again after the update */
+	memblock_merge_regions(type);
+	return 0;
+}
+
+/**
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+	return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG);
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
+{
+	return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
+}
+
+/**
+ * __next_mem_range - next function for for_each_free_mem_range() etc.
+ * @idx: pointer to u64 loop variable
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @type_a: pointer to memblock_type from where the range is taken
+ * @type_b: pointer to memblock_type which excludes memory from being taken
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Find the first area from *@idx which matches @nid, fill the out
+ * parameters, and update *@idx for the next iteration.
The lower 32bit of + * *@idx contains index into type_a and the upper 32bit indexes the + * areas before each region in type_b. For example, if type_b regions + * look like the following, + * + * 0:[0-16), 1:[32-48), 2:[128-130) + * + * The upper 32bit indexes the following regions. + * + * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX) + * + * As both region arrays are sorted, the function advances the two indices + * in lockstep and returns each intersection. + */ +void __init_memblock __next_mem_range(u64 *idx, int nid, + struct memblock_type *type_a, + struct memblock_type *type_b, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) +{ + int idx_a = *idx & 0xffffffff; + int idx_b = *idx >> 32; + + if (WARN_ONCE(nid == MAX_NUMNODES, + "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + + for (; idx_a < type_a->cnt; idx_a++) { + struct memblock_region *m = &type_a->regions[idx_a]; + + phys_addr_t m_start = m->base; + phys_addr_t m_end = m->base + m->size; + int m_nid = memblock_get_region_node(m); + + /* only memory regions are associated with nodes, check it */ + if (nid != NUMA_NO_NODE && nid != m_nid) + continue; + + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + continue; + + if (!type_b) { + if (out_start) + *out_start = m_start; + if (out_end) + *out_end = m_end; + if (out_nid) + *out_nid = m_nid; + idx_a++; + *idx = (u32)idx_a | (u64)idx_b << 32; + return; + } + + /* scan areas before each reservation */ + for (; idx_b < type_b->cnt + 1; idx_b++) { + struct memblock_region *r; + phys_addr_t r_start; + phys_addr_t r_end; + + r = &type_b->regions[idx_b]; + r_start = idx_b ? r[-1].base + r[-1].size : 0; + r_end = idx_b < type_b->cnt ? 
+				r->base : ULLONG_MAX;
+
+			/*
+			 * if idx_b advanced past idx_a,
+			 * break out to advance idx_a
+			 */
+			if (r_start >= m_end)
+				break;
+			/* if the two regions intersect, we're done */
+			if (m_start < r_end) {
+				if (out_start)
+					*out_start =
+						max(m_start, r_start);
+				if (out_end)
+					*out_end = min(m_end, r_end);
+				if (out_nid)
+					*out_nid = m_nid;
+				/*
+				 * The region which ends first is
+				 * advanced for the next iteration.
+				 */
+				if (m_end <= r_end)
+					idx_a++;
+				else
+					idx_b++;
+				*idx = (u32)idx_a | (u64)idx_b << 32;
+				return;
+			}
+		}
+	}
+
+	/* signal end of iteration */
+	*idx = ULLONG_MAX;
+}
+
+/**
+ * __next_mem_range_rev - generic next function for for_each_*_range_rev()
+ *
+ * Finds the next range from type_a which is not marked as unsuitable
+ * in type_b.
+ *
+ * @idx: pointer to u64 loop variable
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @type_a: pointer to memblock_type from where the range is taken
+ * @type_b: pointer to memblock_type which excludes memory from being taken,
+ *	    can be %NULL, in which case every @type_a region is returned whole
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Reverse of __next_mem_range().
+ */
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+					  struct memblock_type *type_a,
+					  struct memblock_type *type_b,
+					  phys_addr_t *out_start,
+					  phys_addr_t *out_end, int *out_nid)
+{
+	int idx_a = *idx & 0xffffffff;
+	int idx_b = *idx >> 32;
+
+	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+		nid = NUMA_NO_NODE;
+
+	/* ULLONG_MAX marks the start of a walk: begin at the topmost entries */
+	if (*idx == (u64)ULLONG_MAX) {
+		idx_a = type_a->cnt - 1;
+		/* @type_b is optional, only read its count when it is there */
+		idx_b = type_b ? type_b->cnt : 0;
+	}
+
+	for (; idx_a >= 0; idx_a--) {
+		struct memblock_region *m = &type_a->regions[idx_a];
+
+		phys_addr_t m_start = m->base;
+		phys_addr_t m_end = m->base + m->size;
+		int m_nid = memblock_get_region_node(m);
+
+		/* only memory regions are associated with nodes, check it */
+		if (nid != NUMA_NO_NODE && nid != m_nid)
+			continue;
+
+		/* skip hotpluggable memory regions if needed */
+		if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
+			continue;
+
+		if (!type_b) {
+			if (out_start)
+				*out_start = m_start;
+			if (out_end)
+				*out_end = m_end;
+			if (out_nid)
+				*out_nid = m_nid;
+			/* reverse walk: the next call resumes one region lower */
+			idx_a--;
+			*idx = (u32)idx_a | (u64)idx_b << 32;
+			return;
+		}
+
+		/* scan areas before each reservation */
+		for (; idx_b >= 0; idx_b--) {
+			struct memblock_region *r;
+			phys_addr_t r_start;
+			phys_addr_t r_end;
+
+			r = &type_b->regions[idx_b];
+			r_start = idx_b ? r[-1].base + r[-1].size : 0;
+			r_end = idx_b < type_b->cnt ?
+				r->base : ULLONG_MAX;
+			/*
+			 * if idx_b advanced past idx_a,
+			 * break out to advance idx_a
+			 */
+
+			if (r_end <= m_start)
+				break;
+			/* if the two regions intersect, we're done */
+			if (m_end > r_start) {
+				if (out_start)
+					*out_start = max(m_start, r_start);
+				if (out_end)
+					*out_end = min(m_end, r_end);
+				if (out_nid)
+					*out_nid = m_nid;
+				if (m_start >= r_start)
+					idx_a--;
+				else
+					idx_b--;
+				*idx = (u32)idx_a | (u64)idx_b << 32;
+				return;
+			}
+		}
+	}
+	/* signal end of iteration */
+	*idx = ULLONG_MAX;
+}
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+/*
+ * Common iterator interface used to define for_each_mem_range().
+ */ +void __init_memblock __next_mem_pfn_range(int *idx, int nid, + unsigned long *out_start_pfn, + unsigned long *out_end_pfn, int *out_nid) +{ + struct memblock_type *type = &memblock.memory; + struct memblock_region *r; + + while (++*idx < type->cnt) { + r = &type->regions[*idx]; + + if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) + continue; + if (nid == MAX_NUMNODES || nid == r->nid) + break; + } + if (*idx >= type->cnt) { + *idx = -1; + return; + } + + if (out_start_pfn) + *out_start_pfn = PFN_UP(r->base); + if (out_end_pfn) + *out_end_pfn = PFN_DOWN(r->base + r->size); + if (out_nid) + *out_nid = r->nid; +} + +/** + * memblock_set_node - set node ID on memblock regions + * @base: base of area to set node ID for + * @size: size of area to set node ID for + * @type: memblock type to set node ID for + * @nid: node ID to set + * + * Set the nid of memblock @type regions in [@base,@base+@size) to @nid. + * Regions which cross the area boundaries are split as necessary. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, + struct memblock_type *type, int nid) +{ + int start_rgn, end_rgn; + int i, ret; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_set_region_node(&type->regions[i], nid); + + memblock_merge_regions(type); + return 0; +} +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + +static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid) +{ + phys_addr_t found; + + if (!align) + align = SMP_CACHE_BYTES; + + found = memblock_find_in_range_node(size, align, start, end, nid); + if (found && !memblock_reserve(found, size)) { + /* + * The min_count is set to 0 so that memblock allocations are + * never reported as leaks. 
+ */ + kmemleak_alloc(__va(found), size, 0, 0); + return found; + } + return 0; +} + +phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, + phys_addr_t start, phys_addr_t end) +{ + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); +} + +static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t max_addr, + int nid) +{ + return memblock_alloc_range_nid(size, align, 0, max_addr, nid); +} + +phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) +{ + return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); +} + +phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +{ + return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); +} + +phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +{ + phys_addr_t alloc; + + alloc = __memblock_alloc_base(size, align, max_addr); + + if (alloc == 0) + panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", + (unsigned long long) size, (unsigned long long) max_addr); + + return alloc; +} + +phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) +{ + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); +} + +phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) +{ + phys_addr_t res = memblock_alloc_nid(size, align, nid); + + if (res) + return res; + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); +} + +/** + * memblock_virt_alloc_internal - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region to allocate (phys address) + * @max_addr: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * The 
@min_addr limit is dropped if it can not be satisfied and the allocation + * will fall back to memory below @min_addr. Also, allocation may fall back + * to any node in the system if the specified node can not + * hold the requested memory. + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE. + * + * The memory block is aligned on SMP_CACHE_BYTES if @align == 0. + * + * The phys address of allocated boot memory block is converted to virtual and + * allocated memory is reset to 0. + * + * In addition, function sets the min_count to 0 using kmemleak_alloc for + * allocated boot memory block, so that it is never reported as leaks. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +static void * __init memblock_virt_alloc_internal( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + phys_addr_t alloc; + void *ptr; + + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. 
Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + + /* + * Detect any accidental use of these APIs after slab is ready, as at + * this moment memblock may be deinitialized already and its + * internal data may be destroyed (after execution of free_all_bootmem) + */ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, nid); + + if (!align) + align = SMP_CACHE_BYTES; + + if (max_addr > memblock.current_limit) + max_addr = memblock.current_limit; + +again: + alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, + nid); + if (alloc) + goto done; + + if (nid != NUMA_NO_NODE) { + alloc = memblock_find_in_range_node(size, align, min_addr, + max_addr, NUMA_NO_NODE); + if (alloc) + goto done; + } + + if (min_addr) { + min_addr = 0; + goto again; + } else { + goto error; + } + +done: + memblock_reserve(alloc, size); + ptr = phys_to_virt(alloc); + memset(ptr, 0, size); + + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. This is because many of these blocks + * are only referred via the physical address which is not + * looked up by kmemleak. + */ + kmemleak_alloc(ptr, size, 0, 0); + + return ptr; + +error: + return NULL; +} + +/** + * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides + * additional debug information (including caller info), if enabled. 
+ * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid_nopanic( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + return memblock_virt_alloc_internal(size, align, min_addr, + max_addr, nid); +} + +/** + * memblock_virt_alloc_try_nid - allocate boot memory block with panicking + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() + * which provides debug information (including caller info), if enabled, + * and panics if the request can not be satisfied. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. 
+ */ +void * __init memblock_virt_alloc_try_nid( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); + if (ptr) + return ptr; + + panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr); + return NULL; +} + +/** + * __memblock_free_early - free boot memory block + * @base: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * Free boot memory block previously allocated by memblock_virt_alloc_xx() API. + * The freeing memory will not be released to the buddy allocator. + */ +void __init __memblock_free_early(phys_addr_t base, phys_addr_t size) +{ + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + memblock_remove_range(&memblock.reserved, base, size); +} + +/* + * __memblock_free_late - free bootmem block pages directly to buddy allocator + * @addr: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are released directly + * to the buddy allocator, no bootmem metadata is updated because it is gone. 
+ */ +void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) +{ + u64 cursor, end; + + memblock_dbg("%s: [%#016llx-%#016llx] %pF\n", + __func__, (u64)base, (u64)base + size - 1, + (void *)_RET_IP_); + kmemleak_free_part(__va(base), size); + cursor = PFN_UP(base); + end = PFN_DOWN(base + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + +/* + * Remaining API functions + */ + +phys_addr_t __init memblock_phys_mem_size(void) +{ + return memblock.memory.total_size; +} + +phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) +{ + unsigned long pages = 0; + struct memblock_region *r; + unsigned long start_pfn, end_pfn; + + for_each_memblock(memory, r) { + start_pfn = memblock_region_memory_base_pfn(r); + end_pfn = memblock_region_memory_end_pfn(r); + start_pfn = min_t(unsigned long, start_pfn, limit_pfn); + end_pfn = min_t(unsigned long, end_pfn, limit_pfn); + pages += end_pfn - start_pfn; + } + + return PFN_PHYS(pages); +} + +/* lowest address */ +phys_addr_t __init_memblock memblock_start_of_DRAM(void) +{ + return memblock.memory.regions[0].base; +} + +phys_addr_t __init_memblock memblock_end_of_DRAM(void) +{ + int idx = memblock.memory.cnt - 1; + + return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); +} + +void __init memblock_enforce_memory_limit(phys_addr_t limit) +{ + phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + struct memblock_region *r; + + if (!limit) + return; + + /* find out max address */ + for_each_memblock(memory, r) { + if (limit <= r->size) { + max_addr = r->base + limit; + break; + } + limit -= r->size; + } + + /* truncate both memory and reserved regions */ + memblock_remove_range(&memblock.memory, max_addr, + (phys_addr_t)ULLONG_MAX); + memblock_remove_range(&memblock.reserved, max_addr, + (phys_addr_t)ULLONG_MAX); +} + +static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) +{ + unsigned int 
left = 0, right = type->cnt; + + do { + unsigned int mid = (right + left) / 2; + + if (addr < type->regions[mid].base) + right = mid; + else if (addr >= (type->regions[mid].base + + type->regions[mid].size)) + left = mid + 1; + else + return mid; + } while (left < right); + return -1; +} + +int __init memblock_is_reserved(phys_addr_t addr) +{ + return memblock_search(&memblock.reserved, addr) != -1; +} + +int __init_memblock memblock_is_memory(phys_addr_t addr) +{ + return memblock_search(&memblock.memory, addr) != -1; +} + +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +int __init_memblock memblock_search_pfn_nid(unsigned long pfn, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + struct memblock_type *type = &memblock.memory; + int mid = memblock_search(type, PFN_PHYS(pfn)); + + if (mid == -1) + return -1; + + *start_pfn = PFN_DOWN(type->regions[mid].base); + *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); + + return type->regions[mid].nid; +} +#endif + +/** + * memblock_is_region_memory - check if a region is a subset of memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) is a subset of a memory block. + * + * RETURNS: + * 0 if false, non-zero if true + */ +int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) +{ + int idx = memblock_search(&memblock.memory, base); + phys_addr_t end = base + memblock_cap_size(base, &size); + + if (idx == -1) + return 0; + return memblock.memory.regions[idx].base <= base && + (memblock.memory.regions[idx].base + + memblock.memory.regions[idx].size) >= end; +} + +/** + * memblock_is_region_reserved - check if a region intersects reserved memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base+@size) intersects a reserved memory block. 
+ * + * RETURNS: + * 0 if false, non-zero if true + */ +int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) +{ + memblock_cap_size(base, &size); + return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; +} + +void __init_memblock memblock_trim_memory(phys_addr_t align) +{ + phys_addr_t start, end, orig_start, orig_end; + struct memblock_region *r; + + for_each_memblock(memory, r) { + orig_start = r->base; + orig_end = r->base + r->size; + start = round_up(orig_start, align); + end = round_down(orig_end, align); + + if (start == orig_start && end == orig_end) + continue; + + if (start < end) { + r->base = start; + r->size = end - start; + } else { + memblock_remove_region(&memblock.memory, + r - memblock.memory.regions); + r--; + } + } +} + +void __init_memblock memblock_set_current_limit(phys_addr_t limit) +{ + memblock.current_limit = limit; +} + +phys_addr_t __init_memblock memblock_get_current_limit(void) +{ + return memblock.current_limit; +} + +static void __init_memblock memblock_dump(struct memblock_type *type, char *name) +{ + unsigned long long base, size; + unsigned long flags; + int i; + + pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); + + for (i = 0; i < type->cnt; i++) { + struct memblock_region *rgn = &type->regions[i]; + char nid_buf[32] = ""; + + base = rgn->base; + size = rgn->size; + flags = rgn->flags; +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + if (memblock_get_region_node(rgn) != MAX_NUMNODES) + snprintf(nid_buf, sizeof(nid_buf), " on node %d", + memblock_get_region_node(rgn)); +#endif + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", + name, i, base, base + size - 1, size, nid_buf, flags); + } +} + +void __init_memblock __memblock_dump_all(void) +{ + pr_info("MEMBLOCK configuration:\n"); + pr_info(" memory size = %#llx reserved size = %#llx\n", + (unsigned long long)memblock.memory.total_size, + (unsigned long long)memblock.reserved.total_size); + + memblock_dump(&memblock.memory, 
"memory"); + memblock_dump(&memblock.reserved, "reserved"); +} + +void __init memblock_allow_resize(void) +{ + memblock_can_resize = 1; +} + +static int __init early_memblock(char *p) +{ + if (p && strstr(p, "debug")) + memblock_debug = 1; + return 0; +} +early_param("memblock", early_memblock); + +#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) + +static int memblock_debug_show(struct seq_file *m, void *private) +{ + struct memblock_type *type = m->private; + struct memblock_region *reg; + int i; + + for (i = 0; i < type->cnt; i++) { + reg = &type->regions[i]; + seq_printf(m, "%4d: ", i); + if (sizeof(phys_addr_t) == 4) + seq_printf(m, "0x%08lx..0x%08lx\n", + (unsigned long)reg->base, + (unsigned long)(reg->base + reg->size - 1)); + else + seq_printf(m, "0x%016llx..0x%016llx\n", + (unsigned long long)reg->base, + (unsigned long long)(reg->base + reg->size - 1)); + + } + return 0; +} + +static int memblock_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, memblock_debug_show, inode->i_private); +} + +static const struct file_operations memblock_debug_fops = { + .open = memblock_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init memblock_init_debugfs(void) +{ + struct dentry *root = debugfs_create_dir("memblock", NULL); + if (!root) + return -ENXIO; + debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); + debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP + debugfs_create_file("physmem", S_IRUGO, root, &memblock.physmem, &memblock_debug_fops); +#endif + + return 0; +} +__initcall(memblock_init_debugfs); + +#endif /* CONFIG_DEBUG_FS */ diff --git a/kernel/mm/memcontrol.c b/kernel/mm/memcontrol.c new file mode 100644 index 000000000..8bd68b5ec --- /dev/null +++ b/kernel/mm/memcontrol.c @@ -0,0 +1,5933 @@ +/* memcontrol.c - Memory Controller + * + * 
Copyright IBM Corporation, 2007 + * Author Balbir Singh + * + * Copyright 2007 OpenVZ SWsoft Inc + * Author: Pavel Emelianov + * + * Memory thresholds + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. Shutemov + * + * Kernel Memory Controller + * Copyright (C) 2012 Parallels Inc. and Google Inc. + * Authors: Glauber Costa and Suleiman Souhlal + * + * Native page reclaim + * Charge lifetime sanitation + * Lockless page tracking & accounting + * Unified hierarchy configuration model + * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include +#include +#include +#include + +#include "slab.h" + +#include + +#include + +struct cgroup_subsys memory_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(memory_cgrp_subsys); + +#define MEM_CGROUP_RECLAIM_RETRIES 5 +static struct mem_cgroup *root_mem_cgroup __read_mostly; + +/* Whether the swap controller is active */ +#ifdef CONFIG_MEMCG_SWAP +int do_swap_account __read_mostly; +#else +#define do_swap_account 0 +#endif + +static DEFINE_LOCAL_IRQ_LOCK(event_lock); +static const char * const mem_cgroup_stat_names[] = { + "cache", + "rss", + "rss_huge", + "mapped_file", + "writeback", + "swap", +}; + +static const char * const mem_cgroup_events_names[] = { + "pgpgin", + "pgpgout", + "pgfault", + "pgmajfault", +}; + +static const char * const mem_cgroup_lru_names[] = { + "inactive_anon", + "active_anon", + "inactive_file", + "active_file", + "unevictable", +}; + +/* + * Per memcg event counter is incremented at every pagein/pageout. With THP, + * it will be incremated by the number of pages. This counter is used for + * for trigger some periodic events. This is straightforward and better + * than using jiffies etc. to handle periodic memcg event. 
+ */ +enum mem_cgroup_events_target { + MEM_CGROUP_TARGET_THRESH, + MEM_CGROUP_TARGET_SOFTLIMIT, + MEM_CGROUP_TARGET_NUMAINFO, + MEM_CGROUP_NTARGETS, +}; +#define THRESHOLDS_EVENTS_TARGET 128 +#define SOFTLIMIT_EVENTS_TARGET 1024 +#define NUMAINFO_EVENTS_TARGET 1024 + +struct mem_cgroup_stat_cpu { + long count[MEM_CGROUP_STAT_NSTATS]; + unsigned long events[MEMCG_NR_EVENTS]; + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct reclaim_iter { + struct mem_cgroup *position; + /* scan generation, increased every round-trip */ + unsigned int generation; +}; + +/* + * per-zone information in memory controller. + */ +struct mem_cgroup_per_zone { + struct lruvec lruvec; + unsigned long lru_size[NR_LRU_LISTS]; + + struct reclaim_iter iter[DEF_PRIORITY + 1]; + + struct rb_node tree_node; /* RB tree node */ + unsigned long usage_in_excess;/* Set to the value by which */ + /* the soft limit is exceeded*/ + bool on_tree; + struct mem_cgroup *memcg; /* Back pointer, we cannot */ + /* use container_of */ +}; + +struct mem_cgroup_per_node { + struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; +}; + +/* + * Cgroups above their limits are maintained in a RB-Tree, independent of + * their hierarchy representation + */ + +struct mem_cgroup_tree_per_zone { + struct rb_root rb_root; + spinlock_t lock; +}; + +struct mem_cgroup_tree_per_node { + struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; +}; + +struct mem_cgroup_tree { + struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; +}; + +static struct mem_cgroup_tree soft_limit_tree __read_mostly; + +struct mem_cgroup_threshold { + struct eventfd_ctx *eventfd; + unsigned long threshold; +}; + +/* For threshold */ +struct mem_cgroup_threshold_ary { + /* An array index points to threshold just below or equal to usage. 
*/ + int current_threshold; + /* Size of entries[] */ + unsigned int size; + /* Array of thresholds */ + struct mem_cgroup_threshold entries[0]; +}; + +struct mem_cgroup_thresholds { + /* Primary thresholds array */ + struct mem_cgroup_threshold_ary *primary; + /* + * Spare threshold array. + * This is needed to make mem_cgroup_unregister_event() "never fail". + * It must be able to store at least primary->size - 1 entries. + */ + struct mem_cgroup_threshold_ary *spare; +}; + +/* for OOM */ +struct mem_cgroup_eventfd_list { + struct list_head list; + struct eventfd_ctx *eventfd; +}; + +/* + * cgroup_event represents events which userspace want to receive. + */ +struct mem_cgroup_event { + /* + * memcg which the event belongs to. + */ + struct mem_cgroup *memcg; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * register_event() callback will be used to add new userspace + * waiter for changes related to this event. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args); + /* + * unregister_event() callback will be called when userspace closes + * the eventfd or on cgroup removing. This callback must be set, + * if you want provide notification functionality. + */ + void (*unregister_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd); + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; +}; + +static void mem_cgroup_threshold(struct mem_cgroup *memcg); +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); + +/* + * The memory controller data structure. The memory controller controls both + * page cache and RSS per cgroup. 
We would eventually like to provide + * statistics based on the statistics developed by Rik Van Riel for clock-pro, + * to help the administrator determine what knobs to tune. + */ +struct mem_cgroup { + struct cgroup_subsys_state css; + + /* Accounted resources */ + struct page_counter memory; + struct page_counter memsw; + struct page_counter kmem; + + /* Normal memory consumption range */ + unsigned long low; + unsigned long high; + + unsigned long soft_limit; + + /* vmpressure notifications */ + struct vmpressure vmpressure; + + /* css_online() has been completed */ + int initialized; + + /* + * Should the accounting and control be hierarchical, per subtree? + */ + bool use_hierarchy; + + bool oom_lock; + atomic_t under_oom; + atomic_t oom_wakeups; + + int swappiness; + /* OOM-Killer disable */ + int oom_kill_disable; + + /* protect arrays of thresholds */ + struct mutex thresholds_lock; + + /* thresholds for memory usage. RCU-protected */ + struct mem_cgroup_thresholds thresholds; + + /* thresholds for mem+swap usage. RCU-protected */ + struct mem_cgroup_thresholds memsw_thresholds; + + /* For oom notifier event fd */ + struct list_head oom_notify; + + /* + * Should we move charges of a task when a task is moved into this + * mem_cgroup ? And what type of charges should we move ? + */ + unsigned long move_charge_at_immigrate; + /* + * set > 0 if pages under this cgroup are moving to other cgroup. + */ + atomic_t moving_account; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + struct task_struct *move_lock_task; + unsigned long move_lock_flags; + /* + * percpu counter. + */ + struct mem_cgroup_stat_cpu __percpu *stat; + /* + * used when a cpu is offlined or other synchronizations + * See mem_cgroup_read_stat(). 
+ */ + struct mem_cgroup_stat_cpu nocpu_base; + spinlock_t pcp_counter_lock; + +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) + struct cg_proto tcp_mem; +#endif +#if defined(CONFIG_MEMCG_KMEM) + /* Index in the kmem_cache->memcg_params.memcg_caches array */ + int kmemcg_id; + bool kmem_acct_activated; + bool kmem_acct_active; +#endif + + int last_scanned_node; +#if MAX_NUMNODES > 1 + nodemask_t scan_nodes; + atomic_t numainfo_events; + atomic_t numainfo_updating; +#endif + + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; + + struct mem_cgroup_per_node *nodeinfo[0]; + /* WARNING: nodeinfo must be the last member here */ +}; + +#ifdef CONFIG_MEMCG_KMEM +bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return memcg->kmem_acct_active; +} +#endif + +/* Stuffs for move charges at task migration. */ +/* + * Types of charges to be moved. + */ +#define MOVE_ANON 0x1U +#define MOVE_FILE 0x2U +#define MOVE_MASK (MOVE_ANON | MOVE_FILE) + +/* "mc" and its members are protected by cgroup_mutex */ +static struct move_charge_struct { + spinlock_t lock; /* for from, to */ + struct mem_cgroup *from; + struct mem_cgroup *to; + unsigned long flags; + unsigned long precharge; + unsigned long moved_charge; + unsigned long moved_swap; + struct task_struct *moving_task; /* a task moving charges */ + wait_queue_head_t waitq; /* a waitq for other context */ +} mc = { + .lock = __SPIN_LOCK_UNLOCKED(mc.lock), + .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), +}; + +/* + * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft + * limit reclaim to prevent infinite loops, if they ever occur. 
+ */ +#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 + +enum charge_type { + MEM_CGROUP_CHARGE_TYPE_CACHE = 0, + MEM_CGROUP_CHARGE_TYPE_ANON, + MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ + MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ + NR_CHARGE_TYPE, +}; + +/* for encoding cft->private value on file */ +enum res_type { + _MEM, + _MEMSWAP, + _OOM_TYPE, + _KMEM, +}; + +#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) +#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) +#define MEMFILE_ATTR(val) ((val) & 0xffff) +/* Used for OOM notifier */ +#define OOM_CONTROL (0) + +/* + * The memcg_create_mutex will be held whenever a new cgroup is created. + * As a consequence, any change that needs to protect against new child cgroups + * appearing has to hold it as well. + */ +static DEFINE_MUTEX(memcg_create_mutex); + +struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) +{ + return s ? container_of(s, struct mem_cgroup, css) : NULL; +} + +/* Some nice accessors for the vmpressure. */ +struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) +{ + if (!memcg) + memcg = root_mem_cgroup; + return &memcg->vmpressure; +} + +struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) +{ + return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; +} + +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) +{ + return (memcg == root_mem_cgroup); +} + +/* + * We restrict the id in the range of [1, 65535], so it can fit into + * an unsigned short. + */ +#define MEM_CGROUP_ID_MAX USHRT_MAX + +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) +{ + return memcg->css.id; +} + +/* + * A helper function to get mem_cgroup from ID. must be called under + * rcu_read_lock(). The caller is responsible for calling + * css_tryget_online() if the mem_cgroup is used for charging. (dropping + * refcnt from swap can be called against removed memcg.) 
+ */ +static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + struct cgroup_subsys_state *css; + + css = css_from_id(id, &memory_cgrp_subsys); + return mem_cgroup_from_css(css); +} + +/* Writing them here to avoid exposing memcg's inner layout */ +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) + +void sock_update_memcg(struct sock *sk) +{ + if (mem_cgroup_sockets_enabled) { + struct mem_cgroup *memcg; + struct cg_proto *cg_proto; + + BUG_ON(!sk->sk_prot->proto_cgroup); + + /* Socket cloning can throw us here with sk_cgrp already + * filled. It won't however, necessarily happen from + * process context. So the test for root memcg given + * the current task's memcg won't help us in this case. + * + * Respecting the original socket's memcg is a better + * decision in this case. + */ + if (sk->sk_cgrp) { + BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); + css_get(&sk->sk_cgrp->memcg->css); + return; + } + + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + cg_proto = sk->sk_prot->proto_cgroup(memcg); + if (!mem_cgroup_is_root(memcg) && + memcg_proto_active(cg_proto) && + css_tryget_online(&memcg->css)) { + sk->sk_cgrp = cg_proto; + } + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(sock_update_memcg); + +void sock_release_memcg(struct sock *sk) +{ + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { + struct mem_cgroup *memcg; + WARN_ON(!sk->sk_cgrp->memcg); + memcg = sk->sk_cgrp->memcg; + css_put(&sk->sk_cgrp->memcg->css); + } +} + +struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) +{ + if (!memcg || mem_cgroup_is_root(memcg)) + return NULL; + + return &memcg->tcp_mem; +} +EXPORT_SYMBOL(tcp_proto_cgroup); + +#endif + +#ifdef CONFIG_MEMCG_KMEM +/* + * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. + * The main reason for not using cgroup id for this: + * this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. 
Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. + * + * The current size of the caches array is stored in memcg_nr_cache_ids. It + * will double each time we have to increase it. + */ +static DEFINE_IDA(memcg_cache_ida); +int memcg_nr_cache_ids; + +/* Protects memcg_nr_cache_ids */ +static DECLARE_RWSEM(memcg_cache_ids_sem); + +void memcg_get_cache_ids(void) +{ + down_read(&memcg_cache_ids_sem); +} + +void memcg_put_cache_ids(void) +{ + up_read(&memcg_cache_ids_sem); +} + +/* + * MIN_SIZE is different than 1, because we would like to avoid going through + * the alloc/free process all the time. In a small machine, 4 kmem-limited + * cgroups is a reasonable guess. In the future, it could be a parameter or + * tunable, but that is strictly not necessary. + * + * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get + * this constant directly from cgroup, but it is understandable that this is + * better kept as an internal representation in cgroup.c. In any case, the + * cgrp_id space is not getting any smaller, and we don't have to necessarily + * increase ours as well if it increases. + */ +#define MEMCG_CACHES_MIN_SIZE 4 +#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX + +/* + * A lot of the calls to the cache allocation functions are expected to be + * inlined by the compiler. 
Since the calls to memcg_kmem_get_cache are
+ * conditional to this static branch, we'll have to allow modules that do
+ * kmem_cache_alloc and the like to see this symbol as well.
+ */
+struct static_key memcg_kmem_enabled_key;
+EXPORT_SYMBOL(memcg_kmem_enabled_key);
+
+#endif /* CONFIG_MEMCG_KMEM */
+
+/* Look up @memcg's per-zone info using the node id and zone index of @zone. */
+static struct mem_cgroup_per_zone *
+mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
+{
+	int nid = zone_to_nid(zone);
+	int zid = zone_idx(zone);
+
+	return &memcg->nodeinfo[nid]->zoneinfo[zid];
+}
+
+/* Return the cgroup_subsys_state embedded in @memcg. */
+struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
+{
+	return &memcg->css;
+}
+
+/* Look up @memcg's per-zone info using the node id and zone number of @page. */
+static struct mem_cgroup_per_zone *
+mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &memcg->nodeinfo[nid]->zoneinfo[zid];
+}
+
+/* Return the soft limit tree for node @nid, zone @zid. */
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+/* Return the soft limit tree for the node and zone that @page sits in. */
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+/*
+ * Insert @mz into the rbtree of @mctz, ordered by usage_in_excess.
+ *
+ * Does nothing if @mz is already on the tree.  Otherwise records
+ * @new_usage_in_excess in @mz and, if it is non-zero, links the node.
+ * The callers visible in this file invoke it with mctz->lock held.
+ */
+static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
+					 struct mem_cgroup_tree_per_zone *mctz,
+					 unsigned long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount, so equal keys go to the right.
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+/* Erase @mz from the rbtree of @mctz if it is on it; takes no lock itself. */
+static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
+					 struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+/* Locked wrapper: remove @mz from @mctz under mctz->lock with irqs saved. */
+static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
+				       struct mem_cgroup_tree_per_zone *mctz)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&mctz->lock, flags);
+	__mem_cgroup_remove_exceeded(mz, mctz);
+	spin_unlock_irqrestore(&mctz->lock, flags);
+}
+
+/*
+ * Return by how much the memory counter of @memcg exceeds its soft limit,
+ * or 0 if it does not exceed it.
+ */
+static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
+{
+	unsigned long nr_pages = page_counter_read(&memcg->memory);
+	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
+	unsigned long excess = 0;
+
+	if (nr_pages > soft_limit)
+		excess = nr_pages - soft_limit;
+
+	return excess;
+}
+
+/*
+ * Re-sort @memcg and all of its ancestors in the soft limit tree that
+ * corresponds to the node/zone of @page, based on their current excess.
+ */
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+	unsigned long excess;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	mctz = soft_limit_tree_from_page(page);
+	/*
+	 * It is necessary to update all ancestors when hierarchy is used,
+	 * because their event counters are not touched.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = mem_cgroup_page_zoneinfo(memcg, page);
+		excess = soft_limit_excess(memcg);
+		/*
+		 * We have to update the tree if mz is on the RB-tree or
+		 * the memcg is over its soft limit.
+		 */
+		if (excess || mz->on_tree) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&mctz->lock, flags);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(mz, mctz, excess);
+			spin_unlock_irqrestore(&mctz->lock, flags);
+		}
+	}
+}
+
+/* Remove every per-zone entry of @memcg from the soft limit trees. */
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_tree_per_zone *mctz;
+	struct mem_cgroup_per_zone *mz;
+	int nid, zid;
+
+	for_each_node(nid) {
+		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+			mctz = soft_limit_tree_node_zone(nid, zid);
+			mem_cgroup_remove_exceeded(mz, mctz);
+		}
+	}
+}
+
+/*
+ * Pick the rightmost (largest excess) entry of @mctz, remove it from the
+ * tree and return it with a css reference held on its memcg.  Entries that
+ * no longer exceed their soft limit, or whose css cannot be pinned, are
+ * dropped and the search is retried.  Returns NULL if the tree is empty.
+ * The wrapper below calls this with mctz->lock held.
+ */
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz, mctz);
+	if (!soft_limit_excess(mz->memcg) ||
+	    !css_tryget_online(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+/* Locked wrapper around __mem_cgroup_largest_soft_limit_node(). */
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock_irq(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock_irq(&mctz->lock);
+	return mz;
+}
+
+/*
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both vmstat[] and percpu_counter have a threshold and do periodic
+ * synchronization to implement "quick" read. There is a trade-off between
+ * reading cost and precision of value. Then, we may have a chance to implement
+ * a periodic synchronization of counter in memcg's counter.
+ *
+ * But this _read() function is used for user interface now.
The user accounts
+ * memory usage by memory cgroup and he _always_ requires exact value because
+ * he accounts memory. Even if we provide quick-and-fuzzy read, we always
+ * have to visit all online cpus and make sum. So, for now, unnecessary
+ * synchronization is not implemented. (just implemented for cpu hotplug)
+ *
+ * If there are kernel internal actions which can make use of some not-exact
+ * value, and reading all cpu value can be performance bottleneck in some
+ * common workload, threshold and synchronization as vmstat[] should be
+ * implemented.
+ */
+static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
+				 enum mem_cgroup_stat_index idx)
+{
+	long val = 0;
+	int cpu;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		val += per_cpu(memcg->stat->count[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+	/* add the base kept in nocpu_base, read under pcp_counter_lock */
+	spin_lock(&memcg->pcp_counter_lock);
+	val += memcg->nocpu_base.count[idx];
+	spin_unlock(&memcg->pcp_counter_lock);
+#endif
+	put_online_cpus();
+	return val;
+}
+
+/*
+ * Sum the per-cpu event counter @idx of @memcg over all online cpus (plus
+ * nocpu_base when CONFIG_HOTPLUG_CPU is set); same scheme as
+ * mem_cgroup_read_stat().
+ */
+static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
+					    enum mem_cgroup_events_index idx)
+{
+	unsigned long val = 0;
+	int cpu;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		val += per_cpu(memcg->stat->events[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+	spin_lock(&memcg->pcp_counter_lock);
+	val += memcg->nocpu_base.events[idx];
+	spin_unlock(&memcg->pcp_counter_lock);
+#endif
+	put_online_cpus();
+	return val;
+}
+
+/*
+ * Update this cpu's RSS/CACHE/RSS_HUGE counters of @memcg by @nr_pages
+ * (negative on uncharge) and count one PGPGIN or PGPGOUT event.
+ */
+static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
+					 struct page *page,
+					 int nr_pages)
+{
+	/*
+	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
+	 * counted as CACHE even if it's on ANON LRU.
+	 */
+	if (PageAnon(page))
+		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+				nr_pages);
+	else
+		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+				nr_pages);
+
+	if (PageTransHuge(page))
+		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+				nr_pages);
+
+	/* pagein of a big page is an event. So, ignore page size */
+	if (nr_pages > 0)
+		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
+	else {
+		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
+		nr_pages = -nr_pages; /* for event */
+	}
+
+	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+}
+
+/* Return the size recorded for list @lru in the per-zone info holding @lruvec. */
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+	return mz->lru_size[lru];
+}
+
+/*
+ * Sum, over all zones of node @nid, the lru sizes of @memcg for every lru
+ * list whose bit is set in @lru_mask.
+ */
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+						  int nid,
+						  unsigned int lru_mask)
+{
+	unsigned long nr = 0;
+	int zid;
+
+	VM_BUG_ON((unsigned)nid >= nr_node_ids);
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		struct mem_cgroup_per_zone *mz;
+		enum lru_list lru;
+
+		for_each_lru(lru) {
+			if (!(BIT(lru) & lru_mask))
+				continue;
+			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+			nr += mz->lru_size[lru];
+		}
+	}
+	return nr;
+}
+
+/* Same as mem_cgroup_node_nr_lru_pages(), summed over all N_MEMORY nodes. */
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
+			unsigned int lru_mask)
+{
+	unsigned long nr = 0;
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY)
+		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
+	return nr;
+}
+
+/*
+ * Return true, and advance the per-cpu target for @target, once this cpu's
+ * nr_page_events counter has passed the previously stored target.
+ */
+static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
+				       enum mem_cgroup_events_target target)
+{
+	unsigned long val, next;
+
+	val = __this_cpu_read(memcg->stat->nr_page_events);
+	next = __this_cpu_read(memcg->stat->targets[target]);
+	/* from time_after() in jiffies.h */
+	if ((long)next - (long)val < 0) {
+		switch (target) {
+		case MEM_CGROUP_TARGET_THRESH:
+			next = val + THRESHOLDS_EVENTS_TARGET;
+			break;
+		case MEM_CGROUP_TARGET_SOFTLIMIT:
+			next = val + SOFTLIMIT_EVENTS_TARGET;
+			break;
+		case MEM_CGROUP_TARGET_NUMAINFO:
+			next = val + NUMAINFO_EVENTS_TARGET;
+			break;
+		default:
+			break;
+		}
+		__this_cpu_write(memcg->stat->targets[target], next);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Check events in order.
+ *
+ */
+static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
+{
+	/* threshold event is triggered in finer grain than soft limit */
+	if (unlikely(mem_cgroup_event_ratelimit(memcg,
+						MEM_CGROUP_TARGET_THRESH))) {
+		bool do_softlimit;
+		bool do_numainfo __maybe_unused;
+
+		/* sample the coarser targets first, then act on them below */
+		do_softlimit = mem_cgroup_event_ratelimit(memcg,
+						MEM_CGROUP_TARGET_SOFTLIMIT);
+#if MAX_NUMNODES > 1
+		do_numainfo = mem_cgroup_event_ratelimit(memcg,
+						MEM_CGROUP_TARGET_NUMAINFO);
+#endif
+		mem_cgroup_threshold(memcg);
+		if (unlikely(do_softlimit))
+			mem_cgroup_update_tree(memcg, page);
+#if MAX_NUMNODES > 1
+		if (unlikely(do_numainfo))
+			atomic_inc(&memcg->numainfo_events);
+#endif
+	}
+}
+
+/* Return the memcg of task @p through its memory css, or NULL if @p is NULL. */
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+{
+	/*
+	 * mm_update_next_owner() may clear mm->owner to NULL
+	 * if it races with swapoff, page migration, etc.
+	 * So this can be called with p == NULL.
+	 */
+	if (unlikely(!p))
+		return NULL;
+
+	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
+}
+
+/*
+ * Return the memcg of the owner of @mm with a css reference held.
+ * root_mem_cgroup is used when @mm is NULL or no memcg is found for the
+ * owner.  The lookup is repeated until css_tryget_online() succeeds.
+ */
+static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+	struct mem_cgroup *memcg = NULL;
+
+	rcu_read_lock();
+	do {
+		/*
+		 * Page cache insertions can happen without an
+		 * actual mm context, e.g. during disk probing
+		 * on boot, loopback IO, acct() writes etc.
+		 */
+		if (unlikely(!mm))
+			memcg = root_mem_cgroup;
+		else {
+			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+			if (unlikely(!memcg))
+				memcg = root_mem_cgroup;
+		}
+	} while (!css_tryget_online(&memcg->css));
+	rcu_read_unlock();
+	return memcg;
+}
+
+/**
+ * mem_cgroup_iter - iterate over memory cgroup hierarchy
+ * @root: hierarchy root
+ * @prev: previously returned memcg, NULL on first invocation
+ * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ *
+ * Returns references to children of the hierarchy below @root, or
+ * @root itself, or %NULL after a full round-trip.
+ *
+ * Caller must pass the return value in @prev on subsequent
+ * invocations for reference counting, or use mem_cgroup_iter_break()
+ * to cancel a hierarchy walk before the round-trip is complete.
+ *
+ * Reclaimers can specify a zone and a priority level in @reclaim to
+ * divide up the memcgs in the hierarchy among all concurrent
+ * reclaimers operating on the same zone and priority.
+ */
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+				   struct mem_cgroup *prev,
+				   struct mem_cgroup_reclaim_cookie *reclaim)
+{
+	struct reclaim_iter *uninitialized_var(iter);
+	struct cgroup_subsys_state *css = NULL;
+	struct mem_cgroup *memcg = NULL;
+	struct mem_cgroup *pos = NULL;
+
+	if (mem_cgroup_disabled())
+		return NULL;
+
+	if (!root)
+		root = root_mem_cgroup;
+
+	if (prev && !reclaim)
+		pos = prev;
+
+	/* without hierarchy the walk consists of @root alone */
+	if (!root->use_hierarchy && root != root_mem_cgroup) {
+		if (prev)
+			goto out;
+		return root;
+	}
+
+	rcu_read_lock();
+
+	if (reclaim) {
+		struct mem_cgroup_per_zone *mz;
+
+		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+		iter = &mz->iter[reclaim->priority];
+
+		/* the shared walk moved on to a new generation: stop here */
+		if (prev && reclaim->generation != iter->generation)
+			goto out_unlock;
+
+		do {
+			pos = READ_ONCE(iter->position);
+			/*
+			 * A racing update may change the position and
+			 * put the last reference, hence css_tryget(),
+			 * or retry to see the updated position.
+			 */
+		} while (pos && !css_tryget(&pos->css));
+	}
+
+	if (pos)
+		css = &pos->css;
+
+	for (;;) {
+		css = css_next_descendant_pre(css, &root->css);
+		if (!css) {
+			/*
+			 * Reclaimers share the hierarchy walk, and a
+			 * new one might jump in right at the end of
+			 * the hierarchy - make sure they see at least
+			 * one group and restart from the beginning.
+			 */
+			if (!prev)
+				continue;
+			break;
+		}
+
+		/*
+		 * Verify the css and acquire a reference.  The root
+		 * is provided by the caller, so we know it's alive
+		 * and kicking, and don't take an extra reference.
+		 */
+		memcg = mem_cgroup_from_css(css);
+
+		if (css == &root->css)
+			break;
+
+		if (css_tryget(css)) {
+			/*
+			 * Make sure the memcg is initialized:
+			 * mem_cgroup_css_online() orders the
+			 * initialization against setting the flag.
+			 */
+			if (smp_load_acquire(&memcg->initialized))
+				break;
+
+			css_put(css);
+		}
+
+		memcg = NULL;
+	}
+
+	if (reclaim) {
+		/*
+		 * Publish the new position only if nobody changed it since
+		 * we read it; on success the reference held for the old
+		 * position moves over to the new one.
+		 */
+		if (cmpxchg(&iter->position, pos, memcg) == pos) {
+			if (memcg)
+				css_get(&memcg->css);
+			if (pos)
+				css_put(&pos->css);
+		}
+
+		/*
+		 * pairs with css_tryget when dereferencing iter->position
+		 * above.
+		 */
+		if (pos)
+			css_put(&pos->css);
+
+		if (!memcg)
+			iter->generation++;
+		else if (!prev)
+			reclaim->generation = iter->generation;
+	}
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	/* drop the reference the previous invocation handed to the caller */
+	if (prev && prev != root)
+		css_put(&prev->css);
+
+	return memcg;
+}
+
+/**
+ * mem_cgroup_iter_break - abort a hierarchy walk prematurely
+ * @root: hierarchy root
+ * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
+ */
+void mem_cgroup_iter_break(struct mem_cgroup *root,
+			   struct mem_cgroup *prev)
+{
+	if (!root)
+		root = root_mem_cgroup;
+	if (prev && prev != root)
+		css_put(&prev->css);
+}
+
+/*
+ * Iteration constructs for visiting all cgroups (under a tree).  If
+ * loops are exited prematurely (break), mem_cgroup_iter_break() must
+ * be used for reference counting.
+ */
+#define for_each_mem_cgroup_tree(iter, root)		\
+	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
+	     iter != NULL;				\
+	     iter = mem_cgroup_iter(root, iter, NULL))
+
+#define for_each_mem_cgroup(iter)			\
+	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
+	     iter != NULL;				\
+	     iter = mem_cgroup_iter(NULL, iter, NULL))
+
+/*
+ * Count a PGFAULT or PGMAJFAULT event against the memcg of the owner of
+ * @mm.  Any other @idx is a BUG().  Nothing is counted when no memcg is
+ * found for the owner.
+ */
+void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	if (unlikely(!memcg))
+		goto out;
+
+	switch (idx) {
+	case PGFAULT:
+		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
+		break;
+	case PGMAJFAULT:
+		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+		break;
+	default:
+		BUG();
+	}
+out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
+
+/**
+ * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
+ * @zone: zone of the wanted lruvec
+ * @memcg: memcg of the wanted lruvec
+ *
+ * Returns the lru list vector holding pages for the given @zone and
+ * @memcg.  This can be the global zone lruvec, if the memory controller
+ * is disabled.
+ */
+struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
+				      struct mem_cgroup *memcg)
+{
+	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
+
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
+
+	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
+}
+
+/**
+ * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
+ * @page: the page
+ * @zone: zone of the page
+ *
+ * This function is only safe when following the LRU page isolation
+ * and putback protocol: the LRU lock must be held, and the page must
+ * either be PageLRU() or the caller must have isolated/allocated it.
+ */
+struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
+{
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
+
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
+
+	memcg = page->mem_cgroup;
+	/*
+	 * Swapcache readahead pages are added to the LRU - and
+	 * possibly migrated - before they are charged.
+	 */
+	if (!memcg)
+		memcg = root_mem_cgroup;
+
+	mz = mem_cgroup_page_zoneinfo(memcg, page);
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
+}
+
+/**
+ * mem_cgroup_update_lru_size - account for adding or removing an lru page
+ * @lruvec: mem_cgroup per zone lru vector
+ * @lru: index of lru list the page is sitting on
+ * @nr_pages: positive when adding or negative when removing
+ *
+ * This function must be called when a page is added to or removed from an
+ * lru list.
+ */ +void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, + int nr_pages) +{ + struct mem_cgroup_per_zone *mz; + unsigned long *lru_size; + + if (mem_cgroup_disabled()) + return; + + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + lru_size = mz->lru_size + lru; + *lru_size += nr_pages; + VM_BUG_ON((long)(*lru_size) < 0); +} + +bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) +{ + if (root == memcg) + return true; + if (!root->use_hierarchy) + return false; + return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); +} + +bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) +{ + struct mem_cgroup *task_memcg; + struct task_struct *p; + bool ret; + + p = find_lock_task_mm(task); + if (p) { + task_memcg = get_mem_cgroup_from_mm(p->mm); + task_unlock(p); + } else { + /* + * All threads may have already detached their mm's, but the oom + * killer still needs to detect if they have already been oom + * killed to prevent needlessly killing additional tasks. 
+ */ + rcu_read_lock(); + task_memcg = mem_cgroup_from_task(task); + css_get(&task_memcg->css); + rcu_read_unlock(); + } + ret = mem_cgroup_is_descendant(task_memcg, memcg); + css_put(&task_memcg->css); + return ret; +} + +int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) +{ + unsigned long inactive_ratio; + unsigned long inactive; + unsigned long active; + unsigned long gb; + + inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); + active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); + + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; + + return inactive * inactive_ratio < active; +} + +bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +{ + struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return true; + + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + memcg = mz->memcg; + + return !!(memcg->css.flags & CSS_ONLINE); +} + +#define mem_cgroup_from_counter(counter, member) \ + container_of(counter, struct mem_cgroup, member) + +/** + * mem_cgroup_margin - calculate chargeable space of a memory cgroup + * @memcg: the memory cgroup + * + * Returns the maximum amount of memory @mem can be charged with, in + * pages. + */ +static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) +{ + unsigned long margin = 0; + unsigned long count; + unsigned long limit; + + count = page_counter_read(&memcg->memory); + limit = READ_ONCE(memcg->memory.limit); + if (count < limit) + margin = limit - count; + + if (do_swap_account) { + count = page_counter_read(&memcg->memsw); + limit = READ_ONCE(memcg->memsw.limit); + if (count <= limit) + margin = min(margin, limit - count); + } + + return margin; +} + +int mem_cgroup_swappiness(struct mem_cgroup *memcg) +{ + /* root ? 
*/ + if (mem_cgroup_disabled() || !memcg->css.parent) + return vm_swappiness; + + return memcg->swappiness; +} + +/* + * A routine for checking "mem" is under move_account() or not. + * + * Checking a cgroup is mc.from or mc.to or under hierarchy of + * moving cgroups. This is for waiting at high-memory pressure + * caused by "move". + */ +static bool mem_cgroup_under_move(struct mem_cgroup *memcg) +{ + struct mem_cgroup *from; + struct mem_cgroup *to; + bool ret = false; + /* + * Unlike task_move routines, we access mc.to, mc.from not under + * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. + */ + spin_lock(&mc.lock); + from = mc.from; + to = mc.to; + if (!from) + goto unlock; + + ret = mem_cgroup_is_descendant(from, memcg) || + mem_cgroup_is_descendant(to, memcg); +unlock: + spin_unlock(&mc.lock); + return ret; +} + +static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) +{ + if (mc.moving_task && current != mc.moving_task) { + if (mem_cgroup_under_move(memcg)) { + DEFINE_WAIT(wait); + prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); + /* moving charge context might have finished. */ + if (mc.moving_task) + schedule(); + finish_wait(&mc.waitq, &wait); + return true; + } + } + return false; +} + +#define K(x) ((x) << (PAGE_SHIFT-10)) +/** + * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 
+ * @memcg: The memory cgroup that went over limit
+ * @p: Task that is going to be killed
+ *
+ * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
+ * enabled
+ */
+void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
+{
+	/* oom_info_lock ensures that parallel ooms do not interleave */
+	static DEFINE_MUTEX(oom_info_lock);
+	struct mem_cgroup *iter;
+	unsigned int i;
+
+	mutex_lock(&oom_info_lock);
+	rcu_read_lock();
+
+	if (p) {
+		pr_info("Task in ");
+		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+		pr_cont(" killed as a result of limit of ");
+	} else {
+		pr_info("Memory limit reached of cgroup ");
+	}
+
+	pr_cont_cgroup_path(memcg->css.cgroup);
+	pr_cont("\n");
+
+	rcu_read_unlock();
+
+	/* usage, limit and failcnt of the memory, memsw and kmem counters */
+	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
+		K((u64)page_counter_read(&memcg->memory)),
+		K((u64)memcg->memory.limit), memcg->memory.failcnt);
+	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+		K((u64)page_counter_read(&memcg->memsw)),
+		K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
+	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+		K((u64)page_counter_read(&memcg->kmem)),
+		K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
+
+	/* one line of statistics and lru sizes per memcg in the subtree */
+	for_each_mem_cgroup_tree(iter, memcg) {
+		pr_info("Memory cgroup stats for ");
+		pr_cont_cgroup_path(iter->css.cgroup);
+		pr_cont(":");
+
+		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+				continue;
+			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
+				K(mem_cgroup_read_stat(iter, i)));
+		}
+
+		for (i = 0; i < NR_LRU_LISTS; i++)
+			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
+				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+
+		pr_cont("\n");
+	}
+	mutex_unlock(&oom_info_lock);
+}
+
+/*
+ * This function returns the number of memcgs in the hierarchy tree. Returns
+ * 1 (self count) if there are no children.
+ */
+static int mem_cgroup_count_children(struct mem_cgroup *memcg)
+{
+	int num = 0;
+	struct mem_cgroup *iter;
+
+	for_each_mem_cgroup_tree(iter, memcg)
+		num++;
+	return num;
+}
+
+/*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+	unsigned long limit;
+
+	limit = memcg->memory.limit;
+	if (mem_cgroup_swappiness(memcg)) {
+		unsigned long memsw_limit;
+
+		memsw_limit = memcg->memsw.limit;
+		limit = min(limit + total_swap_pages, memsw_limit);
+	}
+	return limit;
+}
+
+/*
+ * Select a victim among the tasks of @memcg's subtree and hand it to
+ * oom_kill_process().  Returns without killing if current is already
+ * dying, if the scan is aborted, or if no eligible task is found.
+ */
+static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+				     int order)
+{
+	struct mem_cgroup *iter;
+	unsigned long chosen_points = 0;
+	unsigned long totalpages;
+	unsigned int points = 0;
+	struct task_struct *chosen = NULL;
+
+	/*
+	 * If current has a pending SIGKILL or is exiting, then automatically
+	 * select it.  The goal is to allow it to allocate so that it may
+	 * quickly exit and free its memory.
+	 */
+	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
+		mark_tsk_oom_victim(current);
+		return;
+	}
+
+	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+	/* never 0: totalpages is used as a divisor below */
+	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
+	for_each_mem_cgroup_tree(iter, memcg) {
+		struct css_task_iter it;
+		struct task_struct *task;
+
+		css_task_iter_start(&iter->css, &it);
+		while ((task = css_task_iter_next(&it))) {
+			switch (oom_scan_process_thread(task, totalpages, NULL,
+							false)) {
+			case OOM_SCAN_SELECT:
+				if (chosen)
+					put_task_struct(chosen);
+				chosen = task;
+				chosen_points = ULONG_MAX;
+				get_task_struct(chosen);
+				/* fall through */
+			case OOM_SCAN_CONTINUE:
+				continue;
+			case OOM_SCAN_ABORT:
+				/* end both iterators and drop the held task ref */
+				css_task_iter_end(&it);
+				mem_cgroup_iter_break(memcg, iter);
+				if (chosen)
+					put_task_struct(chosen);
+				return;
+			case OOM_SCAN_OK:
+				break;
+			};
+			points = oom_badness(task, memcg, NULL, totalpages);
+			if (!points || points < chosen_points)
+				continue;
+			/* Prefer thread group leaders for display purposes */
+			if (points == chosen_points &&
+			    thread_group_leader(chosen))
+				continue;
+
+			if (chosen)
+				put_task_struct(chosen);
+			chosen = task;
+			chosen_points = points;
+			get_task_struct(chosen);
+		}
+		css_task_iter_end(&it);
+	}
+
+	if (!chosen)
+		return;
+	points = chosen_points * 1000 / totalpages;
+	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
+			 NULL, "Memory cgroup out of memory");
+}
+
+#if MAX_NUMNODES > 1
+
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @memcg: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap : specify true here if the user wants file only information.
+ *
+ * This function returns whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
+		int nid, bool noswap)
+{
+	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
+		return true;
+	/* anon pages only count when swap may be used and exists */
+	if (noswap || !total_swap_pages)
+		return false;
+	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
+		return true;
+	return false;
+
+}
+
+/*
+ * Always updating the nodemask is not very good - even if we have an empty
+ * list or the wrong list here, we can start from some node and traverse all
+ * nodes based on the zonelist. So update the list loosely once per 10 secs.
+ * NOTE(review): the code below is driven by numainfo_events rather than by
+ * a timer; the "10 secs" wording looks stale - confirm.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
+{
+	int nid;
+	/*
+	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
+	 * pagein/pageout changes since the last update.
+	 */
+	if (!atomic_read(&memcg->numainfo_events))
+		return;
+	/* only the first caller to raise numainfo_updating does the update */
+	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
+		return;
+
+	/* make a nodemask where this memcg uses memory from */
+	memcg->scan_nodes = node_states[N_MEMORY];
+
+	for_each_node_mask(nid, node_states[N_MEMORY]) {
+
+		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
+			node_clear(nid, memcg->scan_nodes);
+	}
+
+	atomic_set(&memcg->numainfo_events, 0);
+	atomic_set(&memcg->numainfo_updating, 0);
+}
+
+/*
+ * Selecting a node where we start reclaim from. Because what we need is just
+ * reducing usage counter, starting from anywhere is OK. Considering
+ * memory reclaim from current node, there are pros. and cons.
+ *
+ * Freeing memory from current node means freeing memory from a node which
+ * we'll use or we've used. So, it may make LRU bad. And if several threads
+ * hit limits, it will see a contention on a node. But freeing from remote
+ * node means more costs for memory reclaim because of memory latency.
+ *
+ * Now, we use round-robin. Better algorithm is welcomed.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
+{
+	int node;
+
+	mem_cgroup_may_update_nodemask(memcg);
+	node = memcg->last_scanned_node;
+
+	node = next_node(node, memcg->scan_nodes);
+	if (node == MAX_NUMNODES)
+		node = first_node(memcg->scan_nodes);
+	/*
+	 * We call this when we hit limit, not when pages are added to LRU.
+	 * No LRU may hold pages because all pages are UNEVICTABLE or
+	 * memcg is too small and all pages are not on LRU. In that case,
+	 * we use the current node.
+	 */
+	if (unlikely(node == MAX_NUMNODES))
+		node = numa_node_id();
+
+	memcg->last_scanned_node = node;
+	return node;
+}
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Walk the hierarchy below @root_memcg, shrinking each victim's share of
+ * @zone, until @root_memcg no longer exceeds its soft limit or the loop
+ * limits below are hit.  Returns the sum of the values returned by
+ * mem_cgroup_shrink_node_zone() and adds the scanned count to
+ * *@total_scanned.
+ */
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   struct zone *zone,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.zone = zone,
+		.priority = 0,
+	};
+
+	excess = soft_limit_excess(root_memcg);
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			/* NULL marks the end of one round-trip of the walk */
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might be because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not so excessive as to
+				 * reclaim too much, nor so little that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+						     zone, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!soft_limit_excess(root_memcg))
+			break;
+	}
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
+}
+
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+	.name = "memcg_oom_lock",
+};
+#endif
+
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter, *failed = NULL;
+
+	spin_lock(&memcg_oom_lock);
+
+	for_each_mem_cgroup_tree(iter, memcg) {
+		if (iter->oom_lock) {
+			/*
+			 * this subtree of our hierarchy is already locked
+			 * so we cannot give a lock.
+			 */
+			failed = iter;
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		} else
+			iter->oom_lock = true;
+	}
+
+	if (failed) {
+		/*
+		 * OK, we failed to lock the whole subtree so we have
+		 * to clean up what we set up to the failing subtree
+		 */
+		for_each_mem_cgroup_tree(iter, memcg) {
+			if (iter == failed) {
+				mem_cgroup_iter_break(memcg, iter);
+				break;
+			}
+			iter->oom_lock = false;
+		}
+	} else
+		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
+
+	spin_unlock(&memcg_oom_lock);
+
+	return !failed;
+}
+
+/* Clear oom_lock on every memcg in the subtree of @memcg. */
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	spin_lock(&memcg_oom_lock);
+	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
+	for_each_mem_cgroup_tree(iter, memcg)
+		iter->oom_lock = false;
+	spin_unlock(&memcg_oom_lock);
+}
+
+/* Increment under_oom on every memcg in the subtree of @memcg. */
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	for_each_mem_cgroup_tree(iter, memcg)
+		atomic_inc(&iter->under_oom);
+}
+
+/* Decrement under_oom, unless it is 0, on every memcg in the subtree. */
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	/*
+	 * When a new child is created while the hierarchy is under oom,
+	 * mem_cgroup_oom_lock() may not be called. We have to use
+	 * atomic_add_unless() here.
+	 */
+	for_each_mem_cgroup_tree(iter, memcg)
+		atomic_add_unless(&iter->under_oom, -1, 0);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+struct oom_wait_info {
+	struct mem_cgroup *memcg;
+	wait_queue_t	wait;
+};
+
+/*
+ * Wake function for memcg_oom_waitq: wake the waiter only if its memcg and
+ * the memcg passed in @arg are hierarchically related (either direction).
+ */
+static int memcg_oom_wake_function(wait_queue_t *wait,
+	unsigned mode, int sync, void *arg)
+{
+	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
+	struct mem_cgroup *oom_wait_memcg;
+	struct oom_wait_info *oom_wait_info;
+
+	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+	oom_wait_memcg = oom_wait_info->memcg;
+
+	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
+	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, arg);
+}
+
+/* Bump oom_wakeups of @memcg and wake the waiters related to it. */
+static void memcg_wakeup_oom(struct mem_cgroup *memcg)
+{
+	atomic_inc(&memcg->oom_wakeups);
+	/* for filtering, pass "memcg" as argument. */
+	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
+}
+
+/* Wake OOM waiters if @memcg is non-NULL and currently marked under_oom. */
+static void memcg_oom_recover(struct mem_cgroup *memcg)
+{
+	if (memcg && atomic_read(&memcg->under_oom))
+		memcg_wakeup_oom(memcg);
+}
+
+/*
+ * Record a memcg OOM in current->memcg_oom (memcg, gfp mask and order),
+ * taking a css reference, for mem_cgroup_oom_synchronize() to handle.
+ * Does nothing unless current->memcg_oom.may_oom is set.
+ */
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+{
+	if (!current->memcg_oom.may_oom)
+		return;
+	/*
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * Also, the caller may handle a failed allocation gracefully
+	 * (like optional page cache readahead) and so an OOM killer
+	 * invocation might not even be necessary.
+	 *
+	 * That's why we don't do anything here except remember the
+	 * OOM context and then deal with it at the end of the page
+	 * fault when the stack is unwound, the locks are released,
+	 * and when we know whether the fault was overall successful.
+	 */
+	css_get(&memcg->css);
+	current->memcg_oom.memcg = memcg;
+	current->memcg_oom.gfp_mask = mask;
+	current->memcg_oom.order = order;
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
+ *
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
+ *
+ * Memcg supports userspace OOM handling where failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation.  Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to complete the OOM handling.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * completed, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(bool handle)
+{
+	struct mem_cgroup *memcg = current->memcg_oom.memcg;
+	struct oom_wait_info owait;
+	bool locked;
+
+	/* OOM is global, do not handle */
+	if (!memcg)
+		return false;
+
+	if (!handle || oom_killer_disabled)
+		goto cleanup;
+
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.task_list);
+
+	/* queue ourselves before marking/locking so no wakeup is missed */
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+					 current->memcg_oom.order);
+	} else {
+		schedule();
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+	}
+
+	if (locked) {
+		mem_cgroup_oom_unlock(memcg);
+		/*
+		 * There is no guarantee that an OOM-lock contender
+		 * sees the wakeups triggered by the OOM kill
+		 * uncharges.  Wake any sleepers explicitly.
+		 */
+		memcg_oom_recover(memcg);
+	}
+cleanup:
+	/* drop the OOM state and the reference taken in mem_cgroup_oom() */
+	current->memcg_oom.memcg = NULL;
+	css_put(&memcg->css);
+	return true;
+}
+
+/**
+ * mem_cgroup_begin_page_stat - begin a page state statistics transaction
+ * @page: page that is going to change accounted state
+ *
+ * This function must mark the beginning of an accounted page state
+ * change to prevent double accounting when the page is concurrently
+ * being moved to another memcg:
+ *
+ *   memcg = mem_cgroup_begin_page_stat(page);
+ *   if (TestClearPageState(page))
+ *     mem_cgroup_update_page_stat(memcg, state, -1);
+ *   mem_cgroup_end_page_stat(memcg);
+ */
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+{
+	struct mem_cgroup *memcg;
+	unsigned long flags;
+
+	/*
+	 * The RCU lock is held throughout the transaction.  The fast
+	 * path can get away without acquiring the memcg->move_lock
+	 * because page moving starts with an RCU grace period.
+	 *
+	 * The RCU lock also protects the memcg from being freed when
+	 * the page state that is going to change is the only thing
+	 * preventing the page from being uncharged.
+	 * E.g. end-writeback clearing PageWriteback(), which allows
+	 * migration to go ahead and uncharge the page before the
+	 * account transaction might be complete.
+	 */
+	rcu_read_lock();
+
+	/*
+	 * The NULL returns below leave the RCU read lock held; it is
+	 * dropped by mem_cgroup_end_page_stat().
+	 */
+	if (mem_cgroup_disabled())
+		return NULL;
+again:
+	memcg = page->mem_cgroup;
+	if (unlikely(!memcg))
+		return NULL;
+
+	if (atomic_read(&memcg->moving_account) <= 0)
+		return memcg;
+
+	spin_lock_irqsave(&memcg->move_lock, flags);
+	/* the page changed memcg while we waited for the lock: retry */
+	if (memcg != page->mem_cgroup) {
+		spin_unlock_irqrestore(&memcg->move_lock, flags);
+		goto again;
+	}
+
+	/*
+	 * When charge migration first begins, we can have locked and
+	 * unlocked page stat updates happening concurrently.  Track
+	 * the task who has the lock for mem_cgroup_end_page_stat().
+	 */
+	memcg->move_lock_task = current;
+	memcg->move_lock_flags = flags;
+
+	return memcg;
+}
+
+/**
+ * mem_cgroup_end_page_stat - finish a page state statistics transaction
+ * @memcg: the memcg that was accounted against
+ */
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+{
+	/* release move_lock only if this task took it in begin_page_stat */
+	if (memcg && memcg->move_lock_task == current) {
+		unsigned long flags = memcg->move_lock_flags;
+
+		memcg->move_lock_task = NULL;
+		memcg->move_lock_flags = 0;
+
+		spin_unlock_irqrestore(&memcg->move_lock, flags);
+	}
+
+	rcu_read_unlock();
+}
+
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+				 enum mem_cgroup_stat_index idx, int val)
+{
+	VM_BUG_ON(!rcu_read_lock_held());
+
+	if (memcg)
+		this_cpu_add(memcg->stat->count[idx], val);
+}
+
+/*
+ * size of first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: maybe necessary to use big numbers in big irons.
+ */
+#define CHARGE_BATCH	32U
+struct memcg_stock_pcp {
+	struct mem_cgroup *cached; /* this is never the root cgroup */
+	unsigned int nr_pages;
+	struct work_struct work;
+	unsigned long flags;
+#define FLUSHING_CACHED_CHARGE	0
+};
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static DEFINE_MUTEX(percpu_charge_mutex);
+
+/**
+ * consume_stock: Try to consume stocked charge on this cpu.
+ * @memcg: memcg to consume from.
+ * @nr_pages: how many pages to charge.
+ *
+ * The charges will only happen if @memcg matches the current cpu's memcg
+ * stock, and at least @nr_pages are available in that stock.  Failure to
+ * service an allocation will refill the stock.
+ *
+ * returns true if successful, false otherwise.
+ */ +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + struct memcg_stock_pcp *stock; + bool ret = false; + + if (nr_pages > CHARGE_BATCH) + return ret; + + stock = &get_cpu_var(memcg_stock); + if (memcg == stock->cached && stock->nr_pages >= nr_pages) { + stock->nr_pages -= nr_pages; + ret = true; + } + put_cpu_var(memcg_stock); + return ret; +} + +/* + * Returns stocks cached in percpu and reset cached information. + */ +static void drain_stock(struct memcg_stock_pcp *stock) +{ + struct mem_cgroup *old = stock->cached; + + if (stock->nr_pages) { + page_counter_uncharge(&old->memory, stock->nr_pages); + if (do_swap_account) + page_counter_uncharge(&old->memsw, stock->nr_pages); + css_put_many(&old->css, stock->nr_pages); + stock->nr_pages = 0; + } + stock->cached = NULL; +} + +/* + * This must be called under preempt disabled or must be called by + * a thread which is pinned to local cpu. + */ +static void drain_local_stock(struct work_struct *dummy) +{ + struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); + drain_stock(stock); + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); +} + +/* + * Cache charges(val) to local per_cpu area. + * This will be consumed by consume_stock() function, later. + */ +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + struct memcg_stock_pcp *stock; + int cpu = get_cpu_light(); + + stock = &per_cpu(memcg_stock, cpu); + + if (stock->cached != memcg) { /* reset if necessary */ + drain_stock(stock); + stock->cached = memcg; + } + stock->nr_pages += nr_pages; + put_cpu_light(); +} + +/* + * Drains all per-CPU charge caches for given root_memcg resp. subtree + * of the hierarchy under it. + */ +static void drain_all_stock(struct mem_cgroup *root_memcg) +{ + int cpu, curcpu; + + /* If someone's already draining, avoid adding running more workers. 
*/ + if (!mutex_trylock(&percpu_charge_mutex)) + return; + /* Notify other cpus that system-wide "drain" is running */ + get_online_cpus(); + curcpu = get_cpu_light(); + for_each_online_cpu(cpu) { + struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); + struct mem_cgroup *memcg; + + memcg = stock->cached; + if (!memcg || !stock->nr_pages) + continue; + if (!mem_cgroup_is_descendant(memcg, root_memcg)) + continue; + if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { + if (cpu == curcpu) + drain_local_stock(&stock->work); + else + schedule_work_on(cpu, &stock->work); + } + } + put_cpu_light(); + put_online_cpus(); + mutex_unlock(&percpu_charge_mutex); +} + +/* + * This function drains percpu counter value from DEAD cpu and + * move it to local cpu. Note that this function can be preempted. + */ +static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) +{ + int i; + + spin_lock(&memcg->pcp_counter_lock); + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + long x = per_cpu(memcg->stat->count[i], cpu); + + per_cpu(memcg->stat->count[i], cpu) = 0; + memcg->nocpu_base.count[i] += x; + } + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { + unsigned long x = per_cpu(memcg->stat->events[i], cpu); + + per_cpu(memcg->stat->events[i], cpu) = 0; + memcg->nocpu_base.events[i] += x; + } + spin_unlock(&memcg->pcp_counter_lock); +} + +static int memcg_cpu_hotplug_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + struct memcg_stock_pcp *stock; + struct mem_cgroup *iter; + + if (action == CPU_ONLINE) + return NOTIFY_OK; + + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) + return NOTIFY_OK; + + for_each_mem_cgroup(iter) + mem_cgroup_drain_pcp_counter(iter, cpu); + + stock = &per_cpu(memcg_stock, cpu); + drain_stock(stock); + return NOTIFY_OK; +} + +static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) +{ + unsigned int batch = max(CHARGE_BATCH, 
nr_pages); + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + struct mem_cgroup *mem_over_limit; + struct page_counter *counter; + unsigned long nr_reclaimed; + bool may_swap = true; + bool drained = false; + int ret = 0; + + if (mem_cgroup_is_root(memcg)) + goto done; +retry: + if (consume_stock(memcg, nr_pages)) + goto done; + + if (!do_swap_account || + !page_counter_try_charge(&memcg->memsw, batch, &counter)) { + if (!page_counter_try_charge(&memcg->memory, batch, &counter)) + goto done_restock; + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, batch); + mem_over_limit = mem_cgroup_from_counter(counter, memory); + } else { + mem_over_limit = mem_cgroup_from_counter(counter, memsw); + may_swap = false; + } + + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } + + /* + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem; + + if (!(gfp_mask & __GFP_WAIT)) + goto nomem; + + mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); + + nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, + gfp_mask, may_swap); + + if (mem_cgroup_margin(mem_over_limit) >= nr_pages) + goto retry; + + if (!drained) { + drain_all_stock(mem_over_limit); + drained = true; + goto retry; + } + + if (gfp_mask & __GFP_NORETRY) + goto nomem; + /* + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure. 
+ */ + if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; + /* + * At task move, charge accounts can be doubly counted. So, it's + * better to wait until the end of task_move if something is going on. + */ + if (mem_cgroup_wait_acct_move(mem_over_limit)) + goto retry; + + if (nr_retries--) + goto retry; + + if (gfp_mask & __GFP_NOFAIL) + goto bypass; + + if (fatal_signal_pending(current)) + goto bypass; + + mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); + + /* + * get_order() takes a size in bytes, not a page count, so scale + * nr_pages by PAGE_SIZE to record the real allocation order. + */ + mem_cgroup_oom(mem_over_limit, gfp_mask, + get_order(nr_pages * PAGE_SIZE)); +nomem: + /* A __GFP_NOFAIL request falls through to bypass instead of failing. */ + if (!(gfp_mask & __GFP_NOFAIL)) + return -ENOMEM; +bypass: + return -EINTR; + +done_restock: + /* The charge of @batch pages succeeded: take @batch css references. */ + css_get_many(&memcg->css, batch); + /* Hand the pages charged beyond the request to refill_stock(). */ + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); + if (!(gfp_mask & __GFP_WAIT)) + goto done; + /* + * If the hierarchy is above the normal consumption range, + * make the charging task trim their excess contribution. + */ + do { + if (page_counter_read(&memcg->memory) <= memcg->high) + continue; + mem_cgroup_events(memcg, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + } while ((memcg = parent_mem_cgroup(memcg))); +done: + return ret; +} + +/* + * cancel_charge - undo a charge of @nr_pages against @memcg + * + * Uncharges the memory counter (and the memsw counter when + * do_swap_account is set) and drops @nr_pages css references. + * Does nothing for the root cgroup. + */ +static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + if (mem_cgroup_is_root(memcg)) + return; + + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); + + css_put_many(&memcg->css, nr_pages); +} + +/* + * try_get_mem_cgroup_from_page - look up page's memcg association + * @page: the page + * + * Look up, get a css reference, and return the memcg that owns @page. + * + * The page must be locked to prevent racing with swap-in and page + * cache charges. If coming from an unlocked page table, the caller + * must ensure the page is on the LRU or this can race with charging. 
+ */ +struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) +{ + struct mem_cgroup *memcg; + unsigned short id; + swp_entry_t ent; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + + memcg = page->mem_cgroup; + if (memcg) { + if (!css_tryget_online(&memcg->css)) + memcg = NULL; + } else if (PageSwapCache(page)) { + ent.val = page_private(page); + id = lookup_swap_cgroup_id(ent); + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (memcg && !css_tryget_online(&memcg->css)) + memcg = NULL; + rcu_read_unlock(); + } + return memcg; +} + +static void lock_page_lru(struct page *page, int *isolated) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, page_lru(page)); + *isolated = 1; + } else + *isolated = 0; +} + +static void unlock_page_lru(struct page *page, int isolated) +{ + struct zone *zone = page_zone(page); + + if (isolated) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_page_lruvec(page, zone); + VM_BUG_ON_PAGE(PageLRU(page), page); + SetPageLRU(page); + add_page_to_lru_list(page, lruvec, page_lru(page)); + } + spin_unlock_irq(&zone->lru_lock); +} + +static void commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare) +{ + int isolated; + + VM_BUG_ON_PAGE(page->mem_cgroup, page); + + /* + * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page + * may already be on some other mem_cgroup's LRU. Take care of it. 
+ */ + if (lrucare) + lock_page_lru(page, &isolated); + + /* + * Nobody should be changing or seriously looking at + * page->mem_cgroup at this point: + * + * - the page is uncharged + * + * - the page is off-LRU + * + * - an anonymous fault has exclusive page access, except for + * a locked page table + * + * - a page cache insertion, a swapin fault, or a migration + * have the page locked + */ + page->mem_cgroup = memcg; + + if (lrucare) + unlock_page_lru(page, isolated); +} + +#ifdef CONFIG_MEMCG_KMEM +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages) +{ + struct page_counter *counter; + int ret = 0; + + ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); + if (ret < 0) + return ret; + + ret = try_charge(memcg, gfp, nr_pages); + if (ret == -EINTR) { + /* + * try_charge() chose to bypass to root due to OOM kill or + * fatal signal. Since our only options are to either fail + * the allocation or charge it to this cgroup, do it as a + * temporary condition. But we can't fail. From a kmem/slab + * perspective, the cache has already been selected, by + * mem_cgroup_kmem_get_cache(), so it is too late to change + * our minds. + * + * This condition will only trigger if the task entered + * memcg_charge_kmem in a sane state, but was OOM-killed + * during try_charge() above. 
Tasks that were already dying + * when the allocation triggers should have been already + * directed to the root cgroup in memcontrol.h + */ + page_counter_charge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); + ret = 0; + } else if (ret) + page_counter_uncharge(&memcg->kmem, nr_pages); + + return ret; +} + +void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) +{ + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); + + page_counter_uncharge(&memcg->kmem, nr_pages); + + css_put_many(&memcg->css, nr_pages); +} + +/* + * Helper for accessing a memcg's index. It will be used as an index in the + * child cache array in kmem_cache, and also to derive its name. This function + * returns -1 for a NULL @memcg and memcg->kmemcg_id otherwise (presumably + * negative while this is not a kmem-limited memcg). + */ +int memcg_cache_id(struct mem_cgroup *memcg) +{ + return memcg ? memcg->kmemcg_id : -1; +} + +static int memcg_alloc_cache_id(void) +{ + int id, size; + int err; + + id = ida_simple_get(&memcg_cache_ida, + 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); + if (id < 0) + return id; + + if (id < memcg_nr_cache_ids) + return id; + + /* + * There's no space for the new id in memcg_caches arrays, + * so we have to grow them. 
+ */ + down_write(&memcg_cache_ids_sem); + + size = 2 * (id + 1); + if (size < MEMCG_CACHES_MIN_SIZE) + size = MEMCG_CACHES_MIN_SIZE; + else if (size > MEMCG_CACHES_MAX_SIZE) + size = MEMCG_CACHES_MAX_SIZE; + + err = memcg_update_all_caches(size); + if (!err) + err = memcg_update_all_list_lrus(size); + if (!err) + memcg_nr_cache_ids = size; + + up_write(&memcg_cache_ids_sem); + + if (err) { + ida_simple_remove(&memcg_cache_ida, id); + return err; + } + return id; +} + +static void memcg_free_cache_id(int id) +{ + ida_simple_remove(&memcg_cache_ida, id); +} + +struct memcg_kmem_cache_create_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + +static void memcg_kmem_cache_create_func(struct work_struct *w) +{ + struct memcg_kmem_cache_create_work *cw = + container_of(w, struct memcg_kmem_cache_create_work, work); + struct mem_cgroup *memcg = cw->memcg; + struct kmem_cache *cachep = cw->cachep; + + memcg_create_kmem_cache(memcg, cachep); + + css_put(&memcg->css); + kfree(cw); +} + +/* + * Enqueue the creation of a per-memcg kmem_cache. + */ +static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + struct memcg_kmem_cache_create_work *cw; + + cw = kmalloc(sizeof(*cw), GFP_NOWAIT); + if (!cw) + return; + + css_get(&memcg->css); + + cw->memcg = memcg; + cw->cachep = cachep; + INIT_WORK(&cw->work, memcg_kmem_cache_create_func); + + schedule_work(&cw->work); +} + +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + /* + * We need to stop accounting when we kmalloc, because if the + * corresponding kmalloc cache is not yet created, the first allocation + * in __memcg_schedule_kmem_cache_create will recurse. + * + * However, it is better to enclose the whole function. Depending on + * the debugging options enabled, INIT_WORK(), for instance, can + * trigger an allocation. This too, will make us recurse. 
Because at + * this point we can't allow ourselves back into memcg_kmem_get_cache, + * the safest choice is to do it like this, wrapping the whole function. + */ + current->memcg_kmem_skip_account = 1; + __memcg_schedule_kmem_cache_create(memcg, cachep); + current->memcg_kmem_skip_account = 0; +} + +/* + * Return the kmem_cache we're supposed to use for a slab allocation. + * We try to use the current memcg's version of the cache. + * + * If the cache does not exist yet, if we are the first user of it, + * we either create it immediately, if possible, or create it asynchronously + * in a workqueue. + * In the latter case, we will let the current allocation go through with + * the original cache. + * + * Can't be called in interrupt context or from kernel threads. + * This function needs to be called with rcu_read_lock() held. + */ +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) +{ + struct mem_cgroup *memcg; + struct kmem_cache *memcg_cachep; + int kmemcg_id; + + VM_BUG_ON(!is_root_cache(cachep)); + + if (current->memcg_kmem_skip_account) + return cachep; + + memcg = get_mem_cgroup_from_mm(current->mm); + kmemcg_id = READ_ONCE(memcg->kmemcg_id); + if (kmemcg_id < 0) + goto out; + + memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); + if (likely(memcg_cachep)) + return memcg_cachep; + + /* + * If we are in a safe context (can wait, and not in interrupt + * context), we could be be predictable and return right away. + * This would guarantee that the allocation being performed + * already belongs in the new cache. + * + * However, there are some clashes that can arrive from locking. + * For instance, because we acquire the slab_mutex while doing + * memcg_create_kmem_cache, this means no further allocation + * could happen with the slab_mutex held. So it's better to + * defer everything. 
+ */ + memcg_schedule_kmem_cache_create(memcg, cachep); +out: + css_put(&memcg->css); + return cachep; +} + +void __memcg_kmem_put_cache(struct kmem_cache *cachep) +{ + if (!is_root_cache(cachep)) + css_put(&cachep->memcg_params.memcg->css); +} + +/* + * We need to verify if the allocation against current->mm->owner's memcg is + * possible for the given order. But the page is not allocated yet, so we'll + * need a further commit step to do the final arrangements. + * + * It is possible for the task to switch cgroups in this mean time, so at + * commit time, we can't rely on task conversion any longer. We'll then use + * the handle argument to return to the caller which cgroup we should commit + * against. We could also return the memcg directly and avoid the pointer + * passing, but a boolean return value gives better semantics considering + * the compiled-out case as well. + * + * Returning true means the allocation is possible. + */ +bool +__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +{ + struct mem_cgroup *memcg; + int ret; + + *_memcg = NULL; + + memcg = get_mem_cgroup_from_mm(current->mm); + + if (!memcg_kmem_is_active(memcg)) { + css_put(&memcg->css); + return true; + } + + ret = memcg_charge_kmem(memcg, gfp, 1 << order); + if (!ret) + *_memcg = memcg; + + css_put(&memcg->css); + return (ret == 0); +} + +void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, + int order) +{ + VM_BUG_ON(mem_cgroup_is_root(memcg)); + + /* The page allocation failed. 
Revert */ + if (!page) { + memcg_uncharge_kmem(memcg, 1 << order); + return; + } + page->mem_cgroup = memcg; +} + +void __memcg_kmem_uncharge_pages(struct page *page, int order) +{ + struct mem_cgroup *memcg = page->mem_cgroup; + + if (!memcg) + return; + + VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); + + memcg_uncharge_kmem(memcg, 1 << order); + page->mem_cgroup = NULL; +} + +struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) +{ + struct mem_cgroup *memcg = NULL; + struct kmem_cache *cachep; + struct page *page; + + page = virt_to_head_page(ptr); + if (PageSlab(page)) { + cachep = page->slab_cache; + if (!is_root_cache(cachep)) + memcg = cachep->memcg_params.memcg; + } else + /* page allocated by alloc_kmem_pages */ + memcg = page->mem_cgroup; + + return memcg; +} +#endif /* CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +/* + * Copy the head page's memcg to each of its HPAGE_PMD_NR - 1 tail pages + * and subtract HPAGE_PMD_NR from the memcg's RSS_HUGE counter. We're under + * zone->lru_lock, 'splitting on pmd' and compound_lock. + * charge/uncharge will never happen and move_account() is done under + * compound_lock(), so we don't have to take care of races. + */ +void mem_cgroup_split_huge_fixup(struct page *head) +{ + int i; + + if (mem_cgroup_disabled()) + return; + + for (i = 1; i < HPAGE_PMD_NR; i++) + head[i].mem_cgroup = head->mem_cgroup; + + __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + HPAGE_PMD_NR); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_MEMCG_SWAP +static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, + bool charge) +{ + int val = (charge) ? 1 : -1; + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); +} + +/** + * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. + * @entry: swap entry to be moved + * @from: mem_cgroup which the entry is moved from + * @to: mem_cgroup which the entry is moved to + * + * It succeeds only when the swap_cgroup's record for this entry is the same + * as the mem_cgroup's id of @from. 
+ * + * Returns 0 on success, -EINVAL on failure. + * + * The caller must have charged to @to, IOW, called page_counter_charge() about + * both res and memsw, and called css_get(). + */ +static int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to) +{ + unsigned short old_id, new_id; + + old_id = mem_cgroup_id(from); + new_id = mem_cgroup_id(to); + + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { + mem_cgroup_swap_statistics(from, false); + mem_cgroup_swap_statistics(to, true); + return 0; + } + return -EINVAL; +} +#else +static inline int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to) +{ + return -EINVAL; +} +#endif + +static DEFINE_MUTEX(memcg_limit_mutex); + +static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, + unsigned long limit) +{ + unsigned long curusage; + unsigned long oldusage; + bool enlarge = false; + int retry_count; + int ret; + + /* + * To keep hierarchical reclaim simple, how long we should retry + * depends on the caller. We set our retry count to be a function + * of the number of children which we should visit in this loop. + */ + retry_count = MEM_CGROUP_RECLAIM_RETRIES * + mem_cgroup_count_children(memcg); + + oldusage = page_counter_read(&memcg->memory); + + do { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + mutex_lock(&memcg_limit_mutex); + if (limit > memcg->memsw.limit) { + mutex_unlock(&memcg_limit_mutex); + ret = -EINVAL; + break; + } + if (limit > memcg->memory.limit) + enlarge = true; + ret = page_counter_limit(&memcg->memory, limit); + mutex_unlock(&memcg_limit_mutex); + + if (!ret) + break; + + try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); + + curusage = page_counter_read(&memcg->memory); + /* Usage is reduced ? 
*/ + if (curusage >= oldusage) + retry_count--; + else + oldusage = curusage; + } while (retry_count); + + if (!ret && enlarge) + memcg_oom_recover(memcg); + + return ret; +} + +static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, + unsigned long limit) +{ + unsigned long curusage; + unsigned long oldusage; + bool enlarge = false; + int retry_count; + int ret; + + /* see mem_cgroup_resize_limit() for how the retry count is chosen */ + retry_count = MEM_CGROUP_RECLAIM_RETRIES * + mem_cgroup_count_children(memcg); + + oldusage = page_counter_read(&memcg->memsw); + + do { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + mutex_lock(&memcg_limit_mutex); + if (limit < memcg->memory.limit) { + mutex_unlock(&memcg_limit_mutex); + ret = -EINVAL; + break; + } + if (limit > memcg->memsw.limit) + enlarge = true; + ret = page_counter_limit(&memcg->memsw, limit); + mutex_unlock(&memcg_limit_mutex); + + if (!ret) + break; + + try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); + + curusage = page_counter_read(&memcg->memsw); + /* Usage is reduced ? 
*/ + if (curusage >= oldusage) + retry_count--; + else + oldusage = curusage; + } while (retry_count); + + if (!ret && enlarge) + memcg_oom_recover(memcg); + + return ret; +} + +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + unsigned long nr_reclaimed = 0; + struct mem_cgroup_per_zone *mz, *next_mz = NULL; + unsigned long reclaimed; + int loop = 0; + struct mem_cgroup_tree_per_zone *mctz; + unsigned long excess; + unsigned long nr_scanned; + + if (order > 0) + return 0; + + mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); + /* + * This loop can run a while, specially if mem_cgroup's continuously + * keep exceeding their soft limit and putting the system under + * pressure + */ + do { + if (next_mz) + mz = next_mz; + else + mz = mem_cgroup_largest_soft_limit_node(mctz); + if (!mz) + break; + + nr_scanned = 0; + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, + gfp_mask, &nr_scanned); + nr_reclaimed += reclaimed; + *total_scanned += nr_scanned; + spin_lock_irq(&mctz->lock); + __mem_cgroup_remove_exceeded(mz, mctz); + + /* + * If we failed to reclaim anything from this memory cgroup + * it is time to move on to the next cgroup + */ + next_mz = NULL; + if (!reclaimed) + next_mz = __mem_cgroup_largest_soft_limit_node(mctz); + + excess = soft_limit_excess(mz->memcg); + /* + * One school of thought says that we should not add + * back the node to the tree if reclaim returns 0. + * But our reclaim could return 0, simply because due + * to priority we are exposing a smaller subset of + * memory to reclaim from. Consider this as a longer + * term TODO. + */ + /* If excess == 0, no tree ops */ + __mem_cgroup_insert_exceeded(mz, mctz, excess); + spin_unlock_irq(&mctz->lock); + css_put(&mz->memcg->css); + loop++; + /* + * Could not reclaim anything and there are no more + * mem cgroups to try or we seem to be looping without + * reclaiming anything. 
+ */ + if (!nr_reclaimed && + (next_mz == NULL || + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) + break; + } while (!nr_reclaimed); + if (next_mz) + css_put(&next_mz->memcg->css); + return nr_reclaimed; +} + +/* + * Test whether @memcg has children, dead or alive. Note that this + * function doesn't care whether @memcg has use_hierarchy enabled and + * returns %true if there are child csses according to the cgroup + * hierarchy. Testing use_hierarchy is the caller's responsibility. + */ +static inline bool memcg_has_children(struct mem_cgroup *memcg) +{ + bool ret; + + /* + * The lock does not prevent addition or deletion of children, but + * it prevents a new child from being initialized based on this + * parent in css_online(), so it's enough to decide whether + * hierarchically inherited attributes can still be changed or not. + */ + lockdep_assert_held(&memcg_create_mutex); + + rcu_read_lock(); + ret = css_next_child(NULL, &memcg->css); + rcu_read_unlock(); + return ret; +} + +/* + * Reclaims as many pages from the given memcg as possible. Returns + * -EINTR if a signal is pending, 0 otherwise (even when pages remain + * charged after the retries are exhausted). + * + * Caller is responsible for holding css reference for memcg. 
+ */ +static int mem_cgroup_force_empty(struct mem_cgroup *memcg) +{ + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + + /* we call try-to-free pages to make this cgroup empty */ + lru_add_drain_all(); + /* try to free all pages in this cgroup */ + while (nr_retries && page_counter_read(&memcg->memory)) { + int progress; + + if (signal_pending(current)) + return -EINTR; + + progress = try_to_free_mem_cgroup_pages(memcg, 1, + GFP_KERNEL, true); + if (!progress) { + nr_retries--; + /* maybe some writeback is necessary */ + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + } + + return 0; +} + +static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + if (mem_cgroup_is_root(memcg)) + return -EINVAL; + return mem_cgroup_force_empty(memcg) ?: nbytes; +} + +static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return mem_cgroup_from_css(css)->use_hierarchy; +} + +static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + int retval = 0; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); + + mutex_lock(&memcg_create_mutex); + + if (memcg->use_hierarchy == val) + goto out; + + /* + * If parent's use_hierarchy is set, we can't make any modifications + * in the child subtrees. If it is unset, then the change can + * occur, provided the current cgroup has no children. + * + * For the root cgroup, parent_mem is NULL, we allow value to be + * set if there are no children. 
+ */ + if ((!parent_memcg || !parent_memcg->use_hierarchy) && + (val == 1 || val == 0)) { + if (!memcg_has_children(memcg)) + memcg->use_hierarchy = val; + else + retval = -EBUSY; + } else + retval = -EINVAL; + +out: + mutex_unlock(&memcg_create_mutex); + + return retval; +} + +static unsigned long tree_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx) +{ + struct mem_cgroup *iter; + long val = 0; + + /* Per-cpu values can be negative, use a signed accumulator */ + for_each_mem_cgroup_tree(iter, memcg) + val += mem_cgroup_read_stat(iter, idx); + + if (val < 0) /* race ? */ + val = 0; + return val; +} + +static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +{ + u64 val; + + if (mem_cgroup_is_root(memcg)) { + val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); + val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); + if (swap) + val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); + } else { + if (!swap) + val = page_counter_read(&memcg->memory); + else + val = page_counter_read(&memcg->memsw); + } + return val << PAGE_SHIFT; +} + +enum { + RES_USAGE, + RES_LIMIT, + RES_MAX_USAGE, + RES_FAILCNT, + RES_SOFT_LIMIT, +}; + +static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct page_counter *counter; + + switch (MEMFILE_TYPE(cft->private)) { + case _MEM: + counter = &memcg->memory; + break; + case _MEMSWAP: + counter = &memcg->memsw; + break; + case _KMEM: + counter = &memcg->kmem; + break; + default: + BUG(); + } + + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + if (counter == &memcg->memory) + return mem_cgroup_usage(memcg, false); + if (counter == &memcg->memsw) + return mem_cgroup_usage(memcg, true); + return (u64)page_counter_read(counter) * PAGE_SIZE; + case RES_LIMIT: + return (u64)counter->limit * PAGE_SIZE; + case RES_MAX_USAGE: + return (u64)counter->watermark * PAGE_SIZE; + case RES_FAILCNT: + return counter->failcnt; + case 
RES_SOFT_LIMIT: + return (u64)memcg->soft_limit * PAGE_SIZE; + default: + BUG(); + } +} + +#ifdef CONFIG_MEMCG_KMEM +static int memcg_activate_kmem(struct mem_cgroup *memcg, + unsigned long nr_pages) +{ + int err = 0; + int memcg_id; + + BUG_ON(memcg->kmemcg_id >= 0); + BUG_ON(memcg->kmem_acct_activated); + BUG_ON(memcg->kmem_acct_active); + + /* + * For simplicity, we won't allow this to be disabled. It also can't + * be changed if the cgroup has children already, or if tasks had + * already joined. + * + * If tasks join before we set the limit, a person looking at + * kmem.usage_in_bytes will have no way to determine when it took + * place, which makes the value quite meaningless. + * + * After it first became limited, changes in the value of the limit are + * of course permitted. + */ + mutex_lock(&memcg_create_mutex); + if (cgroup_has_tasks(memcg->css.cgroup) || + (memcg->use_hierarchy && memcg_has_children(memcg))) + err = -EBUSY; + mutex_unlock(&memcg_create_mutex); + if (err) + goto out; + + memcg_id = memcg_alloc_cache_id(); + if (memcg_id < 0) { + err = memcg_id; + goto out; + } + + /* + * We couldn't have accounted to this cgroup, because it hasn't got + * activated yet, so this should succeed. + */ + err = page_counter_limit(&memcg->kmem, nr_pages); + VM_BUG_ON(err); + + static_key_slow_inc(&memcg_kmem_enabled_key); + /* + * A memory cgroup is considered kmem-active as soon as it gets + * kmemcg_id. Setting the id after enabling static branching will + * guarantee no one starts accounting before all call sites are + * patched. 
+ */ + memcg->kmemcg_id = memcg_id; + memcg->kmem_acct_activated = true; + memcg->kmem_acct_active = true; +out: + return err; +} + +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, + unsigned long limit) +{ + int ret; + + mutex_lock(&memcg_limit_mutex); + if (!memcg_kmem_is_active(memcg)) + ret = memcg_activate_kmem(memcg, limit); + else + ret = page_counter_limit(&memcg->kmem, limit); + mutex_unlock(&memcg_limit_mutex); + return ret; +} + +static int memcg_propagate_kmem(struct mem_cgroup *memcg) +{ + int ret = 0; + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + + if (!parent) + return 0; + + mutex_lock(&memcg_limit_mutex); + /* + * If the parent cgroup is not kmem-active now, it cannot be activated + * after this point, because it has at least one child already. + */ + if (memcg_kmem_is_active(parent)) + ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); + mutex_unlock(&memcg_limit_mutex); + return ret; +} +#else +static int memcg_update_kmem_limit(struct mem_cgroup *memcg, + unsigned long limit) +{ + return -EINVAL; +} +#endif /* CONFIG_MEMCG_KMEM */ + +/* + * The user of this function is... + * RES_LIMIT. 
+ */ +static ssize_t mem_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long nr_pages; + int ret; + + buf = strstrip(buf); + ret = page_counter_memparse(buf, "-1", &nr_pages); + if (ret) + return ret; + + switch (MEMFILE_ATTR(of_cft(of)->private)) { + case RES_LIMIT: + if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ + ret = -EINVAL; + break; + } + switch (MEMFILE_TYPE(of_cft(of)->private)) { + case _MEM: + ret = mem_cgroup_resize_limit(memcg, nr_pages); + break; + case _MEMSWAP: + ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); + break; + case _KMEM: + ret = memcg_update_kmem_limit(memcg, nr_pages); + break; + } + break; + case RES_SOFT_LIMIT: + memcg->soft_limit = nr_pages; + ret = 0; + break; + } + return ret ?: nbytes; +} + +static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct page_counter *counter; + + switch (MEMFILE_TYPE(of_cft(of)->private)) { + case _MEM: + counter = &memcg->memory; + break; + case _MEMSWAP: + counter = &memcg->memsw; + break; + case _KMEM: + counter = &memcg->kmem; + break; + default: + BUG(); + } + + switch (MEMFILE_ATTR(of_cft(of)->private)) { + case RES_MAX_USAGE: + page_counter_reset_watermark(counter); + break; + case RES_FAILCNT: + counter->failcnt = 0; + break; + default: + BUG(); + } + + return nbytes; +} + +static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return mem_cgroup_from_css(css)->move_charge_at_immigrate; +} + +#ifdef CONFIG_MMU +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val & ~MOVE_MASK) + return -EINVAL; + + /* + * No kind of locking is needed in here, because ->can_attach() will + * check this 
value once in the beginning of the process, and then carry + * on with stale data. This means that changes to this value will only + * affect task migrations starting after the change. + */ + memcg->move_charge_at_immigrate = val; + return 0; +} +#else +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + return -ENOSYS; +} +#endif + +#ifdef CONFIG_NUMA +static int memcg_numa_stat_show(struct seq_file *m, void *v) +{ + struct numa_stat { + const char *name; + unsigned int lru_mask; + }; + + static const struct numa_stat stats[] = { + { "total", LRU_ALL }, + { "file", LRU_ALL_FILE }, + { "anon", LRU_ALL_ANON }, + { "unevictable", BIT(LRU_UNEVICTABLE) }, + }; + const struct numa_stat *stat; + int nid; + unsigned long nr; + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); + seq_printf(m, "%s=%lu", stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); + } + + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + struct mem_cgroup *iter; + + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); + seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_node_nr_lru_pages( + iter, nid, stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); + } + + return 0; +} +#endif /* CONFIG_NUMA */ + +static int memcg_stat_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long memory, memsw; + struct mem_cgroup *mi; + unsigned int i; + + BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != + MEM_CGROUP_STAT_NSTATS); + 
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != + MEM_CGROUP_EVENTS_NSTATS); + BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); + + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + continue; + seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], + mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); + } + + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) + seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], + mem_cgroup_read_events(memcg, i)); + + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], + mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); + + /* Hierarchical information */ + memory = memsw = PAGE_COUNTER_MAX; + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { + memory = min(memory, mi->memory.limit); + memsw = min(memsw, mi->memsw.limit); + } + seq_printf(m, "hierarchical_memory_limit %llu\n", + (u64)memory * PAGE_SIZE); + if (do_swap_account) + seq_printf(m, "hierarchical_memsw_limit %llu\n", + (u64)memsw * PAGE_SIZE); + + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + long long val = 0; + + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + continue; + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; + seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); + } + + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { + unsigned long long val = 0; + + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_read_events(mi, i); + seq_printf(m, "total_%s %llu\n", + mem_cgroup_events_names[i], val); + } + + for (i = 0; i < NR_LRU_LISTS; i++) { + unsigned long long val = 0; + + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; + seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); + } + +#ifdef CONFIG_DEBUG_VM + { + int nid, zid; + struct mem_cgroup_per_zone *mz; + struct zone_reclaim_stat *rstat; + unsigned long recent_rotated[2] = {0, 0}; + unsigned long 
recent_scanned[2] = {0, 0}; + + for_each_online_node(nid) + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + rstat = &mz->lruvec.reclaim_stat; + + recent_rotated[0] += rstat->recent_rotated[0]; + recent_rotated[1] += rstat->recent_rotated[1]; + recent_scanned[0] += rstat->recent_scanned[0]; + recent_scanned[1] += rstat->recent_scanned[1]; + } + seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); + seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); + seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); + seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); + } +#endif + + return 0; +} + +static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return mem_cgroup_swappiness(memcg); +} + +static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val > 100) + return -EINVAL; + + if (css->parent) + memcg->swappiness = val; + else + vm_swappiness = val; + + return 0; +} + +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) +{ + struct mem_cgroup_threshold_ary *t; + unsigned long usage; + int i; + + rcu_read_lock(); + if (!swap) + t = rcu_dereference(memcg->thresholds.primary); + else + t = rcu_dereference(memcg->memsw_thresholds.primary); + + if (!t) + goto unlock; + + usage = mem_cgroup_usage(memcg, swap); + + /* + * current_threshold points to threshold just below or equal to usage. + * If it's not true, a threshold was crossed after last + * call of __mem_cgroup_threshold(). + */ + i = t->current_threshold; + + /* + * Iterate backward over array of thresholds starting from + * current_threshold and check if a threshold is crossed. + * If none of thresholds below usage is crossed, we read + * only one element of the array here. 
+ */ + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) + eventfd_signal(t->entries[i].eventfd, 1); + + /* i = current_threshold + 1 */ + i++; + + /* + * Iterate forward over array of thresholds starting from + * current_threshold+1 and check if a threshold is crossed. + * If none of thresholds above usage is crossed, we read + * only one element of the array here. + */ + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) + eventfd_signal(t->entries[i].eventfd, 1); + + /* Update current_threshold */ + t->current_threshold = i - 1; +unlock: + rcu_read_unlock(); +} + +static void mem_cgroup_threshold(struct mem_cgroup *memcg) +{ + while (memcg) { + __mem_cgroup_threshold(memcg, false); + if (do_swap_account) + __mem_cgroup_threshold(memcg, true); + + memcg = parent_mem_cgroup(memcg); + } +} + +static int compare_thresholds(const void *a, const void *b) +{ + const struct mem_cgroup_threshold *_a = a; + const struct mem_cgroup_threshold *_b = b; + + if (_a->threshold > _b->threshold) + return 1; + + if (_a->threshold < _b->threshold) + return -1; + + return 0; +} + +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) +{ + struct mem_cgroup_eventfd_list *ev; + + spin_lock(&memcg_oom_lock); + + list_for_each_entry(ev, &memcg->oom_notify, list) + eventfd_signal(ev->eventfd, 1); + + spin_unlock(&memcg_oom_lock); + return 0; +} + +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) + mem_cgroup_oom_notify_cb(iter); +} + +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args, enum res_type type) +{ + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + unsigned long threshold; + unsigned long usage; + int i, size, ret; + + ret = page_counter_memparse(args, "-1", &threshold); + if (ret) + return ret; + + mutex_lock(&memcg->thresholds_lock); + + if (type == 
_MEM) { + thresholds = &memcg->thresholds; + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { + thresholds = &memcg->memsw_thresholds; + usage = mem_cgroup_usage(memcg, true); + } else + BUG(); + + /* Check if a threshold crossed before adding a new one */ + if (thresholds->primary) + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + size = thresholds->primary ? thresholds->primary->size + 1 : 1; + + /* Allocate memory for new array of thresholds */ + new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), + GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto unlock; + } + new->size = size; + + /* Copy thresholds (if any) to new array */ + if (thresholds->primary) { + memcpy(new->entries, thresholds->primary->entries, (size - 1) * + sizeof(struct mem_cgroup_threshold)); + } + + /* Add new threshold */ + new->entries[size - 1].eventfd = eventfd; + new->entries[size - 1].threshold = threshold; + + /* Sort thresholds. Registering of new threshold isn't time-critical */ + sort(new->entries, size, sizeof(struct mem_cgroup_threshold), + compare_thresholds, NULL); + + /* Find current threshold */ + new->current_threshold = -1; + for (i = 0; i < size; i++) { + if (new->entries[i].threshold <= usage) { + /* + * new->current_threshold will not be used until + * rcu_assign_pointer(), so it's safe to increment + * it here. 
+ */ + ++new->current_threshold; + } else + break; + } + + /* Free old spare buffer and save old primary buffer as spare */ + kfree(thresholds->spare); + thresholds->spare = thresholds->primary; + + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); + +unlock: + mutex_unlock(&memcg->thresholds_lock); + + return ret; +} + +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); +} + +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); +} + +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, enum res_type type) +{ + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + unsigned long usage; + int i, j, size; + + mutex_lock(&memcg->thresholds_lock); + + if (type == _MEM) { + thresholds = &memcg->thresholds; + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { + thresholds = &memcg->memsw_thresholds; + usage = mem_cgroup_usage(memcg, true); + } else + BUG(); + + if (!thresholds->primary) + goto unlock; + + /* Check if a threshold crossed before removing */ + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + /* Calculate new number of threshold */ + size = 0; + for (i = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd != eventfd) + size++; + } + + new = thresholds->spare; + + /* Set thresholds array to NULL if we don't have thresholds */ + if (!size) { + kfree(new); + new = NULL; + goto swap_buffers; + } + + new->size = size; + + /* Copy thresholds and find current threshold */ + new->current_threshold = -1; + for (i = 0, j = 0; i < thresholds->primary->size; i++) { + if 
(thresholds->primary->entries[i].eventfd == eventfd) + continue; + + new->entries[j] = thresholds->primary->entries[i]; + if (new->entries[j].threshold <= usage) { + /* + * new->current_threshold will not be used + * until rcu_assign_pointer(), so it's safe to increment + * it here. + */ + ++new->current_threshold; + } + j++; + } + +swap_buffers: + /* Swap primary and spare array */ + thresholds->spare = thresholds->primary; + /* If all events are unregistered, free the spare array */ + if (!new) { + kfree(thresholds->spare); + thresholds->spare = NULL; + } + + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); +unlock: + mutex_unlock(&memcg->thresholds_lock); +} + +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); +} + +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); +} + +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup_eventfd_list *event; + + event = kmalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + spin_lock(&memcg_oom_lock); + + event->eventfd = eventfd; + list_add(&event->list, &memcg->oom_notify); + + /* already in OOM ? 
*/ + if (atomic_read(&memcg->under_oom)) + eventfd_signal(eventfd, 1); + spin_unlock(&memcg_oom_lock); + + return 0; +} + +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + struct mem_cgroup_eventfd_list *ev, *tmp; + + spin_lock(&memcg_oom_lock); + + list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { + if (ev->eventfd == eventfd) { + list_del(&ev->list); + kfree(ev); + } + } + + spin_unlock(&memcg_oom_lock); +} + +static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); + + seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); + seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); + return 0; +} + +static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + /* cannot set to root cgroup and only 0 and 1 are allowed */ + if (!css->parent || !((val == 0) || (val == 1))) + return -EINVAL; + + memcg->oom_kill_disable = val; + if (!val) + memcg_oom_recover(memcg); + + return 0; +} + +#ifdef CONFIG_MEMCG_KMEM +static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) +{ + int ret; + + ret = memcg_propagate_kmem(memcg); + if (ret) + return ret; + + return mem_cgroup_sockets_init(memcg, ss); +} + +static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +{ + struct cgroup_subsys_state *css; + struct mem_cgroup *parent, *child; + int kmemcg_id; + + if (!memcg->kmem_acct_active) + return; + + /* + * Clear the 'active' flag before clearing memcg_caches arrays entries. + * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it + * guarantees no cache will be created for this cgroup after we are + * done (see memcg_create_kmem_cache()). 
+ */ + memcg->kmem_acct_active = false; + + memcg_deactivate_kmem_caches(memcg); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* + * Change kmemcg_id of this cgroup and all its descendants to the + * parent's id, and then move all entries from this cgroup's list_lrus + * to ones of the parent. After we have finished, all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. The + * ordering is imposed by list_lru_node->lock taken by + * memcg_drain_all_list_lrus(). + */ + css_for_each_descendant_pre(css, &memcg->css) { + child = mem_cgroup_from_css(css); + BUG_ON(child->kmemcg_id != kmemcg_id); + child->kmemcg_id = parent->kmemcg_id; + if (!memcg->use_hierarchy) + break; + } + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + + memcg_free_cache_id(kmemcg_id); +} + +static void memcg_destroy_kmem(struct mem_cgroup *memcg) +{ + if (memcg->kmem_acct_activated) { + memcg_destroy_kmem_caches(memcg); + static_key_slow_dec(&memcg_kmem_enabled_key); + WARN_ON(page_counter_read(&memcg->kmem)); + } + mem_cgroup_sockets_destroy(memcg); +} +#else +static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) +{ + return 0; +} + +static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +{ +} + +static void memcg_destroy_kmem(struct mem_cgroup *memcg) +{ +} +#endif + +/* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered. It tries to support fully configurable + * events for each user. Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + +/* + * Unregister event and free resources. + * + * Gets called from workqueue. 
+ */ +static void memcg_event_remove(struct work_struct *work) +{ + struct mem_cgroup_event *event = + container_of(work, struct mem_cgroup_event, remove); + struct mem_cgroup *memcg = event->memcg; + + remove_wait_queue(event->wqh, &event->wait); + + event->unregister_event(memcg, event->eventfd); + + /* Notify userspace the event is going away. */ + eventfd_signal(event->eventfd, 1); + + eventfd_ctx_put(event->eventfd); + kfree(event); + css_put(&memcg->css); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int memcg_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct mem_cgroup_event *event = + container_of(wait, struct mem_cgroup_event, wait); + struct mem_cgroup *memcg = event->memcg; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + /* + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. + */ + spin_lock(&memcg->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&memcg->event_list_lock); + } + + return 0; +} + +static void memcg_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct mem_cgroup_event *event = + container_of(pt, struct mem_cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * DO NOT USE IN NEW FILES. + * + * Parse input and register new cgroup event handler. + * + * Input must be in format ' '. + * Interpretation of args is defined by control file implementation. 
+ */ +static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event; + struct cgroup_subsys_state *cfile_css; + unsigned int efd, cfd; + struct fd efile; + struct fd cfile; + const char *name; + char *endp; + int ret; + + buf = strstrip(buf); + + efd = simple_strtoul(buf, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buf = endp + 1; + + cfd = simple_strtoul(buf, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buf = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + event->memcg = memcg; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, memcg_event_wake); + INIT_WORK(&event->remove, memcg_event_remove); + + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out_kfree; + } + + event->eventfd = eventfd_ctx_fileget(efile.file); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto out_put_efile; + } + + cfile = fdget(cfd); + if (!cfile.file) { + ret = -EBADF; + goto out_put_eventfd; + } + + /* the process need read permission on control file */ + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = inode_permission(file_inode(cfile.file), MAY_READ); + if (ret < 0) + goto out_put_cfile; + + /* + * Determine the event callbacks and set them in @event. This used + * to be done via struct cftype but cgroup core no longer knows + * about these events. The following is crude but the whole thing + * is for compatibility anyway. + * + * DO NOT ADD NEW FILES. 
+ */ + name = cfile.file->f_path.dentry->d_name.name; + + if (!strcmp(name, "memory.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else if (!strcmp(name, "memory.oom_control")) { + event->register_event = mem_cgroup_oom_register_event; + event->unregister_event = mem_cgroup_oom_unregister_event; + } else if (!strcmp(name, "memory.pressure_level")) { + event->register_event = vmpressure_register_event; + event->unregister_event = vmpressure_unregister_event; + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { + event->register_event = memsw_cgroup_usage_register_event; + event->unregister_event = memsw_cgroup_usage_unregister_event; + } else { + ret = -EINVAL; + goto out_put_cfile; + } + + /* + * Verify @cfile should belong to @css. Also, remaining events are + * automatically removed on cgroup destruction but the removal is + * asynchronous, so take an extra ref on @css. + */ + cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, + &memory_cgrp_subsys); + ret = -EINVAL; + if (IS_ERR(cfile_css)) + goto out_put_cfile; + if (cfile_css != css) { + css_put(cfile_css); + goto out_put_cfile; + } + + ret = event->register_event(memcg, event->eventfd, buf); + if (ret) + goto out_put_css; + + efile.file->f_op->poll(efile.file, &event->pt); + + spin_lock(&memcg->event_list_lock); + list_add(&event->list, &memcg->event_list); + spin_unlock(&memcg->event_list_lock); + + fdput(cfile); + fdput(efile); + + return nbytes; + +out_put_css: + css_put(css); +out_put_cfile: + fdput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fdput(efile); +out_kfree: + kfree(event); + + return ret; +} + +static struct cftype mem_cgroup_legacy_files[] = { + { + .name = "usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, 
RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "soft_limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "failcnt", + .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "stat", + .seq_show = memcg_stat_show, + }, + { + .name = "force_empty", + .write = mem_cgroup_force_empty_write, + }, + { + .name = "use_hierarchy", + .write_u64 = mem_cgroup_hierarchy_write, + .read_u64 = mem_cgroup_hierarchy_read, + }, + { + .name = "cgroup.event_control", /* XXX: for compat */ + .write = memcg_write_event_control, + .flags = CFTYPE_NO_PREFIX, + .mode = S_IWUGO, + }, + { + .name = "swappiness", + .read_u64 = mem_cgroup_swappiness_read, + .write_u64 = mem_cgroup_swappiness_write, + }, + { + .name = "move_charge_at_immigrate", + .read_u64 = mem_cgroup_move_charge_read, + .write_u64 = mem_cgroup_move_charge_write, + }, + { + .name = "oom_control", + .seq_show = mem_cgroup_oom_control_read, + .write_u64 = mem_cgroup_oom_control_write, + .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), + }, + { + .name = "pressure_level", + }, +#ifdef CONFIG_NUMA + { + .name = "numa_stat", + .seq_show = memcg_numa_stat_show, + }, +#endif +#ifdef CONFIG_MEMCG_KMEM + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.failcnt", + .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.max_usage_in_bytes", + 
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, +#ifdef CONFIG_SLABINFO + { + .name = "kmem.slabinfo", + .seq_start = slab_start, + .seq_next = slab_next, + .seq_stop = slab_stop, + .seq_show = memcg_slab_show, + }, +#endif +#endif + { }, /* terminate */ +}; + +static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) +{ + struct mem_cgroup_per_node *pn; + struct mem_cgroup_per_zone *mz; + int zone, tmp = node; + /* + * This routine is called against possible nodes. + * But it's BUG to call kmalloc() against offline node. + * + * TODO: this routine can waste much memory for nodes which will + * never be onlined. It's better to use memory hotplug callback + * function. + */ + if (!node_state(node, N_NORMAL_MEMORY)) + tmp = -1; + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + if (!pn) + return 1; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + mz = &pn->zoneinfo[zone]; + lruvec_init(&mz->lruvec); + mz->usage_in_excess = 0; + mz->on_tree = false; + mz->memcg = memcg; + } + memcg->nodeinfo[node] = pn; + return 0; +} + +static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) +{ + kfree(memcg->nodeinfo[node]); +} + +static struct mem_cgroup *mem_cgroup_alloc(void) +{ + struct mem_cgroup *memcg; + size_t size; + + size = sizeof(struct mem_cgroup); + size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); + + memcg = kzalloc(size, GFP_KERNEL); + if (!memcg) + return NULL; + + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat) + goto out_free; + spin_lock_init(&memcg->pcp_counter_lock); + return memcg; + +out_free: + kfree(memcg); + return NULL; +} + +/* + * At destroying mem_cgroup, references from swap_cgroup can remain. + * (scanning all at force_empty is too costly...) 
+ *
+ * Instead of clearing all references at force_empty, we remember
+ * the number of references from swap_cgroup and free mem_cgroup when
+ * it goes down to 0.
+ *
+ * Removal of cgroup itself succeeds regardless of refs from swap.
+ */
+
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
+{
+	int node;
+
+	mem_cgroup_remove_from_trees(memcg);
+
+	for_each_node(node)
+		free_mem_cgroup_per_zone_info(memcg, node);
+
+	free_percpu(memcg->stat);
+	kfree(memcg);
+}
+
+/*
+ * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled, or NULL if memcg->memory has no parent counter.
+ */
+struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
+{
+	if (!memcg->memory.parent)
+		return NULL;
+	return mem_cgroup_from_counter(memcg->memory.parent, memory);
+}
+EXPORT_SYMBOL(parent_mem_cgroup);
+
+static struct cgroup_subsys_state * __ref
+mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct mem_cgroup *memcg;
+	long error = -ENOMEM;
+	int node;
+
+	memcg = mem_cgroup_alloc();
+	if (!memcg)
+		return ERR_PTR(error);
+
+	for_each_node(node)
+		if (alloc_mem_cgroup_per_zone_info(memcg, node))
+			goto free_out;
+
+	/* root ? (no parent css): becomes root_mem_cgroup, counters get no parent */
+	if (parent_css == NULL) {
+		root_mem_cgroup = memcg;
+		page_counter_init(&memcg->memory, NULL);
+		memcg->high = PAGE_COUNTER_MAX;
+		memcg->soft_limit = PAGE_COUNTER_MAX;
+		page_counter_init(&memcg->memsw, NULL);
+		page_counter_init(&memcg->kmem, NULL);
+	}
+
+	memcg->last_scanned_node = MAX_NUMNODES;
+	INIT_LIST_HEAD(&memcg->oom_notify);
+	memcg->move_charge_at_immigrate = 0;
+	mutex_init(&memcg->thresholds_lock);
+	spin_lock_init(&memcg->move_lock);
+	vmpressure_init(&memcg->vmpressure);
+	INIT_LIST_HEAD(&memcg->event_list);
+	spin_lock_init(&memcg->event_list_lock);
+#ifdef CONFIG_MEMCG_KMEM
+	memcg->kmemcg_id = -1;
+#endif
+
+	return &memcg->css;
+
+free_out:
+	__mem_cgroup_free(memcg);
+	return ERR_PTR(error);
+}
+
+static int
+mem_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
+	int ret;
+
+	if (css->id > MEM_CGROUP_ID_MAX)
+		return -ENOSPC;
+
+	if (!parent)	/* nothing to inherit without a parent memcg */
+		return 0;
+
+	mutex_lock(&memcg_create_mutex);
+
+	memcg->use_hierarchy = parent->use_hierarchy;
+	memcg->oom_kill_disable = parent->oom_kill_disable;
+	memcg->swappiness = mem_cgroup_swappiness(parent);
+
+	if (parent->use_hierarchy) {
+		page_counter_init(&memcg->memory, &parent->memory);
+		memcg->high = PAGE_COUNTER_MAX;
+		memcg->soft_limit = PAGE_COUNTER_MAX;
+		page_counter_init(&memcg->memsw, &parent->memsw);
+		page_counter_init(&memcg->kmem, &parent->kmem);
+
+		/*
+		 * No need to take a reference to the parent because cgroup
+		 * core guarantees its existence.
+		 */
+	} else {
+		page_counter_init(&memcg->memory, NULL);
+		memcg->high = PAGE_COUNTER_MAX;
+		memcg->soft_limit = PAGE_COUNTER_MAX;
+		page_counter_init(&memcg->memsw, NULL);
+		page_counter_init(&memcg->kmem, NULL);
+		/*
+		 * Deeper hierarchy with use_hierarchy == false doesn't make
+		 * much sense so let cgroup subsystem know about this
+		 * unfortunate state in our controller.
+		 */
+		if (parent != root_mem_cgroup)
+			memory_cgrp_subsys.broken_hierarchy = true;
+	}
+	mutex_unlock(&memcg_create_mutex);
+
+	ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
+	if (ret)
+		return ret;
+
+	/*
+	 * Make sure the memcg is initialized: mem_cgroup_iter()
+	 * orders reading memcg->initialized against its callers
+	 * reading the memcg members.
+	 */
+	smp_store_release(&memcg->initialized, 1);
+
+	return 0;
+}
+
+static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	struct mem_cgroup_event *event, *tmp;
+
+	/*
+	 * Unregister events and notify userspace.
+	 * Notify userspace about cgroup removal only after rmdir of the cgroup
+	 * directory to avoid a race between userspace and kernelspace.
+	 */
+	spin_lock(&memcg->event_list_lock);
+	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+		list_del_init(&event->list);
+		schedule_work(&event->remove);
+	}
+	spin_unlock(&memcg->event_list_lock);
+
+	vmpressure_cleanup(&memcg->vmpressure);
+
+	memcg_deactivate_kmem(memcg);
+}
+
+static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	memcg_destroy_kmem(memcg);
+	__mem_cgroup_free(memcg);
+}
+
+/**
+ * mem_cgroup_css_reset - reset the states of a mem_cgroup
+ * @css: the target css
+ *
+ * Reset the states of the mem_cgroup associated with @css.  This is
+ * invoked when the userland requests disabling on the default hierarchy
+ * but the memcg is pinned through dependency.  The memcg should stop
+ * applying policies and should revert to the vanilla state as it may be
+ * made visible again.
+ *
+ * The current implementation only resets the essential configurations.
+ * This needs to be expanded to cover all the visible parts.
+ */
+static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
+	mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
+	memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+	memcg->low = 0;
+	memcg->high = PAGE_COUNTER_MAX;
+	memcg->soft_limit = PAGE_COUNTER_MAX;
+}
+
+#ifdef CONFIG_MMU
+/* Handlers for move charge at task migration. */
+static int mem_cgroup_do_precharge(unsigned long count)
+{
+	int ret;
+
+	/* Try a single bulk charge without reclaim first */
+	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+	if (!ret) {
+		mc.precharge += count;
+		return ret;
+	}
+	if (ret == -EINTR) {	/* charge was bypassed to root: undo it there */
+		cancel_charge(root_mem_cgroup, count);
+		return ret;
+	}
+
+	/* Try charges one by one with reclaim */
+	while (count--) {
+		ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
+		/*
+		 * In case of failure, any residual charges against
+		 * mc.to will be dropped by mem_cgroup_clear_mc()
+		 * later on.  However, cancel any charges that are
+		 * bypassed to root right away or they'll be lost.
+		 */
+		if (ret == -EINTR)
+			cancel_charge(root_mem_cgroup, 1);
+		if (ret)
+			return ret;
+		mc.precharge++;
+		cond_resched();
+	}
+	return 0;
+}
+
+/**
+ * get_mctgt_type - get target type of moving charge
+ * @vma: the vma the pte to be checked belongs to
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: where the target page or swap entry will be stored (can be NULL)
+ *
+ * Returns
+ *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ *     move charge. If @target is not NULL, the page is stored in target->page
+ *     with an extra refcount taken (callers should handle it).
+ *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ *     target for charge migration.
if @target is not NULL, the entry is stored + * in target->ent. + * + * Called with pte lock held. + */ +union mc_target { + struct page *page; + swp_entry_t ent; +}; + +enum mc_target_type { + MC_TARGET_NONE = 0, + MC_TARGET_PAGE, + MC_TARGET_SWAP, +}; + +static struct page *mc_handle_present_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent) +{ + struct page *page = vm_normal_page(vma, addr, ptent); + + if (!page || !page_mapped(page)) + return NULL; + if (PageAnon(page)) { + if (!(mc.flags & MOVE_ANON)) + return NULL; + } else { + if (!(mc.flags & MOVE_FILE)) + return NULL; + } + if (!get_page_unless_zero(page)) + return NULL; + + return page; +} + +#ifdef CONFIG_SWAP +static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + struct page *page = NULL; + swp_entry_t ent = pte_to_swp_entry(ptent); + + if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) + return NULL; + /* + * Because lookup_swap_cache() updates some statistics counter, + * we call find_get_page() with swapper_space directly. + */ + page = find_get_page(swap_address_space(ent), ent.val); + if (do_swap_account) + entry->val = ent.val; + + return page; +} +#else +static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + return NULL; +} +#endif + +static struct page *mc_handle_file_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + struct page *page = NULL; + struct address_space *mapping; + pgoff_t pgoff; + + if (!vma->vm_file) /* anonymous vma */ + return NULL; + if (!(mc.flags & MOVE_FILE)) + return NULL; + + mapping = vma->vm_file->f_mapping; + pgoff = linear_page_index(vma, addr); + + /* page is moved even if it's not RSS of this task(page-faulted). */ +#ifdef CONFIG_SWAP + /* shmem/tmpfs may report page out on swap: account for that too. 
*/ + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + if (do_swap_account) + *entry = swp; + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); +#endif + return page; +} + +/** + * mem_cgroup_move_account - move account of the page + * @page: the page + * @nr_pages: number of regular pages (>1 for huge pages) + * @from: mem_cgroup which the page is moved from. + * @to: mem_cgroup which the page is moved to. @from != @to. + * + * The caller must confirm following. + * - page is not on LRU (isolate_page() is useful.) + * - compound_lock is held when nr_pages > 1 + * + * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" + * from old cgroup. + */ +static int mem_cgroup_move_account(struct page *page, + unsigned int nr_pages, + struct mem_cgroup *from, + struct mem_cgroup *to) +{ + unsigned long flags; + int ret; + + VM_BUG_ON(from == to); + VM_BUG_ON_PAGE(PageLRU(page), page); + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + ret = -EBUSY; + if (nr_pages > 1 && !PageTransHuge(page)) + goto out; + + /* + * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup + * of its source page while we change it: page migration takes + * both pages off the LRU, but page cache replacement doesn't. 
+ */ + if (!trylock_page(page)) + goto out; + + ret = -EINVAL; + if (page->mem_cgroup != from) + goto out_unlock; + + spin_lock_irqsave(&from->move_lock, flags); + + if (!PageAnon(page) && page_mapped(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + } + + if (PageWriteback(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + } + + /* + * It is safe to change page->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */ + + /* caller should have done css_get */ + page->mem_cgroup = to; + spin_unlock_irqrestore(&from->move_lock, flags); + + ret = 0; + + local_lock_irq(event_lock); + mem_cgroup_charge_statistics(to, page, nr_pages); + memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); + memcg_check_events(from, page); + local_unlock_irq(event_lock); +out_unlock: + unlock_page(page); +out: + return ret; +} + +static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, union mc_target *target) +{ + struct page *page = NULL; + enum mc_target_type ret = MC_TARGET_NONE; + swp_entry_t ent = { .val = 0 }; + + if (pte_present(ptent)) + page = mc_handle_present_pte(vma, addr, ptent); + else if (is_swap_pte(ptent)) + page = mc_handle_swap_pte(vma, addr, ptent, &ent); + else if (pte_none(ptent)) + page = mc_handle_file_pte(vma, addr, ptent, &ent); + + if (!page && !ent.val) + return ret; + if (page) { + /* + * Do only loose check w/o serialization. + * mem_cgroup_move_account() checks the page is valid or + * not under LRU exclusion. 
+ */ + if (page->mem_cgroup == mc.from) { + ret = MC_TARGET_PAGE; + if (target) + target->page = page; + } + if (!ret || !target) + put_page(page); + } + /* There is a swap entry and a page doesn't exist or isn't charged */ + if (ent.val && !ret && + mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { + ret = MC_TARGET_SWAP; + if (target) + target->ent = ent; + } + return ret; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * We don't consider swapping or file mapped pages because THP does not + * support them for now. + * Caller should make sure that pmd_trans_huge(pmd) is true. + */ +static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, union mc_target *target) +{ + struct page *page = NULL; + enum mc_target_type ret = MC_TARGET_NONE; + + page = pmd_page(pmd); + VM_BUG_ON_PAGE(!page || !PageHead(page), page); + if (!(mc.flags & MOVE_ANON)) + return ret; + if (page->mem_cgroup == mc.from) { + ret = MC_TARGET_PAGE; + if (target) { + get_page(page); + target->page = page; + } + } + return ret; +} +#else +static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, union mc_target *target) +{ + return MC_TARGET_NONE; +} +#endif + +static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + pte_t *pte; + spinlock_t *ptl; + + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) + mc.precharge += HPAGE_PMD_NR; + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + if (get_mctgt_type(vma, addr, *pte, NULL)) + mc.precharge++; /* increment precharge temporarily */ + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + return 0; +} + +static unsigned long 
mem_cgroup_count_precharge(struct mm_struct *mm) +{ + unsigned long precharge; + + struct mm_walk mem_cgroup_count_precharge_walk = { + .pmd_entry = mem_cgroup_count_precharge_pte_range, + .mm = mm, + }; + down_read(&mm->mmap_sem); + walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); + up_read(&mm->mmap_sem); + + precharge = mc.precharge; + mc.precharge = 0; + + return precharge; +} + +static int mem_cgroup_precharge_mc(struct mm_struct *mm) +{ + unsigned long precharge = mem_cgroup_count_precharge(mm); + + VM_BUG_ON(mc.moving_task); + mc.moving_task = current; + return mem_cgroup_do_precharge(precharge); +} + +/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ +static void __mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + struct mem_cgroup *to = mc.to; + + /* we must uncharge all the leftover precharges from mc.to */ + if (mc.precharge) { + cancel_charge(mc.to, mc.precharge); + mc.precharge = 0; + } + /* + * we didn't uncharge from mc.from at mem_cgroup_move_account(), so + * we must uncharge here. + */ + if (mc.moved_charge) { + cancel_charge(mc.from, mc.moved_charge); + mc.moved_charge = 0; + } + /* we must fixup refcnts and charges */ + if (mc.moved_swap) { + /* uncharge swap account from the old cgroup */ + if (!mem_cgroup_is_root(mc.from)) + page_counter_uncharge(&mc.from->memsw, mc.moved_swap); + + /* + * we charged both to->memory and to->memsw, so we + * should uncharge to->memory. + */ + if (!mem_cgroup_is_root(mc.to)) + page_counter_uncharge(&mc.to->memory, mc.moved_swap); + + css_put_many(&mc.from->css, mc.moved_swap); + + /* we've already done css_get(mc.to) */ + mc.moved_swap = 0; + } + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static void mem_cgroup_clear_mc(void) +{ + /* + * we must clear moving_task before waking up waiters at the end of + * task migration. 
+ */ + mc.moving_task = NULL; + __mem_cgroup_clear_mc(); + spin_lock(&mc.lock); + mc.from = NULL; + mc.to = NULL; + spin_unlock(&mc.lock); +} + +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct task_struct *p = cgroup_taskset_first(tset); + int ret = 0; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + unsigned long move_flags; + + /* + * We are now commited to this value whatever it is. Changes in this + * tunable will only affect upcoming migrations, not the current one. + * So we need to save it, and keep it going. + */ + move_flags = READ_ONCE(memcg->move_charge_at_immigrate); + if (move_flags) { + struct mm_struct *mm; + struct mem_cgroup *from = mem_cgroup_from_task(p); + + VM_BUG_ON(from == memcg); + + mm = get_task_mm(p); + if (!mm) + return 0; + /* We move charges only when we move a owner of the mm */ + if (mm->owner == p) { + VM_BUG_ON(mc.from); + VM_BUG_ON(mc.to); + VM_BUG_ON(mc.precharge); + VM_BUG_ON(mc.moved_charge); + VM_BUG_ON(mc.moved_swap); + + spin_lock(&mc.lock); + mc.from = from; + mc.to = memcg; + mc.flags = move_flags; + spin_unlock(&mc.lock); + /* We set mc.moving_task later */ + + ret = mem_cgroup_precharge_mc(mm); + if (ret) + mem_cgroup_clear_mc(); + } + mmput(mm); + } + return ret; +} + +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + if (mc.to) + mem_cgroup_clear_mc(); +} + +static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + int ret = 0; + struct vm_area_struct *vma = walk->vma; + pte_t *pte; + spinlock_t *ptl; + enum mc_target_type target_type; + union mc_target target; + struct page *page; + + /* + * We don't take compound_lock() here but no race with splitting thp + * happens because: + * - if pmd_trans_huge_lock() returns 1, the relevant thp is not + * under splitting, which means there's no concurrent thp split, + * - if another 
thread runs into split_huge_page() just after we + * entered this if-block, the thread must wait for page table lock + * to be unlocked in __split_huge_page_splitting(), where the main + * part of thp split is not executed yet. + */ + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (mc.precharge < HPAGE_PMD_NR) { + spin_unlock(ptl); + return 0; + } + target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); + if (target_type == MC_TARGET_PAGE) { + page = target.page; + if (!isolate_lru_page(page)) { + if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, + mc.from, mc.to)) { + mc.precharge -= HPAGE_PMD_NR; + mc.moved_charge += HPAGE_PMD_NR; + } + putback_lru_page(page); + } + put_page(page); + } + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; +retry: + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; addr += PAGE_SIZE) { + pte_t ptent = *(pte++); + swp_entry_t ent; + + if (!mc.precharge) + break; + + switch (get_mctgt_type(vma, addr, ptent, &target)) { + case MC_TARGET_PAGE: + page = target.page; + if (isolate_lru_page(page)) + goto put; + if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { + mc.precharge--; + /* we uncharge from mc.from later. */ + mc.moved_charge++; + } + putback_lru_page(page); +put: /* get_mctgt_type() gets the page */ + put_page(page); + break; + case MC_TARGET_SWAP: + ent = target.ent; + if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { + mc.precharge--; + /* we fixup refcnts and charges later. */ + mc.moved_swap++; + } + break; + default: + break; + } + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + if (addr != end) { + /* + * We have consumed all precharges we got in can_attach(). + * We try charge one by one, but don't do any additional + * charges to mc.to if we have failed in charge once in attach() + * phase. 
+ */ + ret = mem_cgroup_do_precharge(1); + if (!ret) + goto retry; + } + + return ret; +} + +static void mem_cgroup_move_charge(struct mm_struct *mm) +{ + struct mm_walk mem_cgroup_move_charge_walk = { + .pmd_entry = mem_cgroup_move_charge_pte_range, + .mm = mm, + }; + + lru_add_drain_all(); + /* + * Signal mem_cgroup_begin_page_stat() to take the memcg's + * move_lock while we're moving its pages to another memcg. + * Then wait for already started RCU-only updates to finish. + */ + atomic_inc(&mc.from->moving_account); + synchronize_rcu(); +retry: + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + /* + * Someone who are holding the mmap_sem might be waiting in + * waitq. So we cancel all extra charges, wake up all waiters, + * and retry. Because we cancel precharges, we might not be able + * to move enough charges, but moving charge is a best-effort + * feature anyway, so it wouldn't be a big problem. + */ + __mem_cgroup_clear_mc(); + cond_resched(); + goto retry; + } + /* + * When we have consumed all precharges and failed in doing + * additional charge, the page walk just aborts. 
+ */ + walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); + up_read(&mm->mmap_sem); + atomic_dec(&mc.from->moving_account); +} + +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct task_struct *p = cgroup_taskset_first(tset); + struct mm_struct *mm = get_task_mm(p); + + if (mm) { + if (mc.to) + mem_cgroup_move_charge(mm); + mmput(mm); + } + if (mc.to) + mem_cgroup_clear_mc(); +} +#else /* !CONFIG_MMU */ +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + return 0; +} +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ +} +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ +} +#endif + +/* + * Cgroup retains root cgroups across [un]mount cycles making it necessary + * to verify whether we're attached to the default hierarchy on each mount + * attempt. + */ +static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) +{ + /* + * use_hierarchy is forced on the default hierarchy. cgroup core + * guarantees that @root doesn't have any children, so turning it + * on for the root memcg is enough. 
+ */ + if (cgroup_on_dfl(root_css->cgroup)) + root_mem_cgroup->use_hierarchy = true; + else + root_mem_cgroup->use_hierarchy = false; +} + +static u64 memory_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return mem_cgroup_usage(mem_cgroup_from_css(css), false); +} + +static int memory_low_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long low = READ_ONCE(memcg->low); + + if (low == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_low_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long low; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &low); + if (err) + return err; + + memcg->low = low; + + return nbytes; +} + +static int memory_high_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long high = READ_ONCE(memcg->high); + + if (high == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + memcg->high = high; + + return nbytes; +} + +static int memory_max_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long max = READ_ONCE(memcg->memory.limit); + + if (max == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) 
+{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + err = mem_cgroup_resize_limit(memcg, max); + if (err) + return err; + + return nbytes; +} + +static int memory_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); + seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); + seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); + seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); + + return 0; +} + +static struct cftype memory_files[] = { + { + .name = "current", + .read_u64 = memory_current_read, + }, + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_low_show, + .write = memory_low_write, + }, + { + .name = "high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_high_show, + .write = memory_high_write, + }, + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_max_show, + .write = memory_max_write, + }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_events_show, + }, + { } /* terminate */ +}; + +struct cgroup_subsys memory_cgrp_subsys = { + .css_alloc = mem_cgroup_css_alloc, + .css_online = mem_cgroup_css_online, + .css_offline = mem_cgroup_css_offline, + .css_free = mem_cgroup_css_free, + .css_reset = mem_cgroup_css_reset, + .can_attach = mem_cgroup_can_attach, + .cancel_attach = mem_cgroup_cancel_attach, + .attach = mem_cgroup_move_task, + .bind = mem_cgroup_bind, + .dfl_cftypes = memory_files, + .legacy_cftypes = mem_cgroup_legacy_files, + .early_init = 0, +}; + +/** + * mem_cgroup_events - count memory events against a cgroup + * @memcg: the memory cgroup + * @idx: the event index + * @nr: the number of events to account for + */ +void mem_cgroup_events(struct 
mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr) +{ + this_cpu_add(memcg->stat->events[idx], nr); +} + +/** + * mem_cgroup_low - check if memory consumption is below the normal range + * @root: the highest ancestor to consider + * @memcg: the memory cgroup to check + * + * Returns %true if memory consumption of @memcg, and that of all + * configurable ancestors up to @root, is below the normal range. + */ +bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return false; + + /* + * The toplevel group doesn't have a configurable range, so + * it's never low when looked at directly, and it is not + * considered an ancestor when assessing the hierarchy. + */ + + if (memcg == root_mem_cgroup) + return false; + + if (page_counter_read(&memcg->memory) >= memcg->low) + return false; + + while (memcg != root) { + memcg = parent_mem_cgroup(memcg); + + if (memcg == root_mem_cgroup) + break; + + if (page_counter_read(&memcg->memory) >= memcg->low) + return false; + } + return true; +} + +/** + * mem_cgroup_try_charge - try charging a page + * @page: page to charge + * @mm: mm context of the victim + * @gfp_mask: reclaim mode + * @memcgp: charged memcg return + * + * Try to charge @page to the memcg that @mm belongs to, reclaiming + * pages according to @gfp_mask if necessary. + * + * Returns 0 on success, with *@memcgp pointing to the charged memcg. + * Otherwise, an error code is returned. + * + * After page->mapping has been set up, the caller must finalize the + * charge with mem_cgroup_commit_charge(). Or abort the transaction + * with mem_cgroup_cancel_charge() in case page instantiation fails. 
+ */
+int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
+			  gfp_t gfp_mask, struct mem_cgroup **memcgp)
+{
+	struct mem_cgroup *memcg = NULL;
+	unsigned int nr_pages = 1;
+	int ret = 0;
+
+	if (mem_cgroup_disabled())
+		goto out;
+
+	if (PageSwapCache(page)) {
+		/*
+		 * Every swap fault against a single page tries to charge the
+		 * page, bail as early as possible.  shmem_unuse() encounters
+		 * already charged pages, too.  The USED bit is protected by
+		 * the page lock, which serializes swap cache removal, which
+		 * in turn serializes uncharging.
+		 */
+		if (page->mem_cgroup)
+			goto out;	/* already charged: returns 0 with *memcgp == NULL */
+	}
+
+	if (PageTransHuge(page)) {
+		nr_pages <<= compound_order(page);
+		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+	}
+
+	if (do_swap_account && PageSwapCache(page))
+		memcg = try_get_mem_cgroup_from_page(page);
+	if (!memcg)
+		memcg = get_mem_cgroup_from_mm(mm);
+
+	ret = try_charge(memcg, gfp_mask, nr_pages);
+
+	css_put(&memcg->css);
+
+	if (ret == -EINTR) {	/* NOTE(review): presumably the charge was bypassed to root - confirm in try_charge() */
+		memcg = root_mem_cgroup;
+		ret = 0;
+	}
+out:
+	*memcgp = memcg;
+	return ret;
+}
+
+/**
+ * mem_cgroup_commit_charge - commit a page charge
+ * @page: page to charge
+ * @memcg: memcg to charge the page to
+ * @lrucare: page might be on LRU already
+ *
+ * Finalize a charge transaction started by mem_cgroup_try_charge(),
+ * after page->mapping has been set up.  This must happen atomically
+ * as part of the page instantiation, i.e. under the page table lock
+ * for anonymous pages, under the page lock for page and swap cache.
+ *
+ * In addition, the page must not be on the LRU during the commit, to
+ * prevent racing with task migration.  If it might be, use @lrucare.
+ *
+ * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
+ */
+void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
+			      bool lrucare)
+{
+	unsigned int nr_pages = 1;
+
+	VM_BUG_ON_PAGE(!page->mapping, page);
+	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
+
+	if (mem_cgroup_disabled())
+		return;
+	/*
+	 * Swap faults will attempt to charge the same page multiple
+	 * times.  But reuse_swap_page() might have removed the page
+	 * from swapcache already, so we can't check PageSwapCache().
+	 */
+	if (!memcg)
+		return;
+
+	commit_charge(page, memcg, lrucare);
+
+	if (PageTransHuge(page)) {
+		nr_pages <<= compound_order(page);
+		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+	}
+
+	local_lock_irq(event_lock);
+	mem_cgroup_charge_statistics(memcg, page, nr_pages);
+	memcg_check_events(memcg, page);
+	local_unlock_irq(event_lock);
+
+	if (do_swap_account && PageSwapCache(page)) {
+		swp_entry_t entry = { .val = page_private(page) };
+		/*
+		 * The swap entry might not get freed for a long time,
+		 * let's not wait for it.  The page already received a
+		 * memory+swap charge, drop the swap entry duplicate.
+		 */
+		mem_cgroup_uncharge_swap(entry);
+	}
+}
+
+/**
+ * mem_cgroup_cancel_charge - cancel a page charge
+ * @page: page whose pending charge is cancelled
+ * @memcg: memcg the page was going to be charged to
+ *
+ * Cancel a charge transaction started by mem_cgroup_try_charge().
+ */
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+{
+	unsigned int nr_pages = 1;
+
+	if (mem_cgroup_disabled())
+		return;
+	/*
+	 * Swap faults will attempt to charge the same page multiple
+	 * times.  But reuse_swap_page() might have removed the page
+	 * from swapcache already, so we can't check PageSwapCache().
+	 */
+	if (!memcg)
+		return;
+
+	if (PageTransHuge(page)) {
+		nr_pages <<= compound_order(page);
+		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+	}
+
+	cancel_charge(memcg, nr_pages);
+}
+
+static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
+			   unsigned long nr_anon, unsigned long nr_file,
+			   unsigned long nr_huge, struct page *dummy_page)
+{
+	unsigned long nr_pages = nr_anon + nr_file;
+	unsigned long flags;
+
+	if (!mem_cgroup_is_root(memcg)) {
+		page_counter_uncharge(&memcg->memory, nr_pages);
+		if (do_swap_account)
+			page_counter_uncharge(&memcg->memsw, nr_pages);
+		memcg_oom_recover(memcg);
+	}
+
+	local_lock_irqsave(event_lock, flags);
+	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
+	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
+	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
+	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
+	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+	memcg_check_events(memcg, dummy_page);
+	local_unlock_irqrestore(event_lock, flags);
+
+	if (!mem_cgroup_is_root(memcg))
+		css_put_many(&memcg->css, nr_pages);
+}
+
+static void uncharge_list(struct list_head *page_list)
+{
+	struct mem_cgroup *memcg = NULL;
+	unsigned long nr_anon = 0;
+	unsigned long nr_file = 0;
+	unsigned long nr_huge = 0;
+	unsigned long pgpgout = 0;
+	struct list_head *next;
+	struct page *page;
+
+	next = page_list->next;
+	do {
+		unsigned int nr_pages = 1;
+
+		page = list_entry(next, struct page, lru);
+		next = page->lru.next;
+
+		VM_BUG_ON_PAGE(PageLRU(page), page);
+		VM_BUG_ON_PAGE(page_count(page), page);
+
+		if (!page->mem_cgroup)
+			continue;
+
+		/*
+		 * Nobody should be changing or seriously looking at
+		 * page->mem_cgroup at this point, we have fully
+		 * exclusive access to the page.
+		 */
+
+		if (memcg != page->mem_cgroup) {
+			if (memcg) {	/* owner changed: flush the counts batched so far */
+				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
+					       nr_huge, page);
+				pgpgout = nr_anon = nr_file = nr_huge = 0;
+			}
+			memcg = page->mem_cgroup;
+		}
+
+		if (PageTransHuge(page)) {
+			nr_pages <<= compound_order(page);
+			VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+			nr_huge += nr_pages;
+		}
+
+		if (PageAnon(page))
+			nr_anon += nr_pages;
+		else
+			nr_file += nr_pages;
+
+		page->mem_cgroup = NULL;
+
+		pgpgout++;
+	} while (next != page_list);
+
+	if (memcg)
+		uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
+			       nr_huge, page);
+}
+
+/**
+ * mem_cgroup_uncharge - uncharge a page
+ * @page: page to uncharge
+ *
+ * Uncharge a page previously charged with mem_cgroup_try_charge() and
+ * mem_cgroup_commit_charge().
+ */
+void mem_cgroup_uncharge(struct page *page)
+{
+	if (mem_cgroup_disabled())
+		return;
+
+	/* Don't touch page->lru of any random page, pre-check: */
+	if (!page->mem_cgroup)
+		return;
+
+	INIT_LIST_HEAD(&page->lru);	/* page->lru becomes a one-entry list for uncharge_list() */
+	uncharge_list(&page->lru);
+}
+
+/**
+ * mem_cgroup_uncharge_list - uncharge a list of pages
+ * @page_list: list of pages to uncharge
+ *
+ * Uncharge a list of pages previously charged with
+ * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
+ */
+void mem_cgroup_uncharge_list(struct list_head *page_list)
+{
+	if (mem_cgroup_disabled())
+		return;
+
+	if (!list_empty(page_list))
+		uncharge_list(page_list);
+}
+
+/**
+ * mem_cgroup_migrate - migrate a charge to another page
+ * @oldpage: currently charged page
+ * @newpage: page to transfer the charge to
+ * @lrucare: either or both pages might be on the LRU already
+ *
+ * Migrate the charge from @oldpage to @newpage.
+ *
+ * Both pages must be locked, @newpage->mapping must be set up.
+ */ +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, + bool lrucare) +{ + struct mem_cgroup *memcg; + int isolated; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); + VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); + VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); + VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), + newpage); + + if (mem_cgroup_disabled()) + return; + + /* Page cache replacement: new page already charged? */ + if (newpage->mem_cgroup) + return; + + /* + * Swapcache readahead pages can get migrated before being + * charged, and migration from compaction can happen to an + * uncharged page when the PFN walker finds a page that + * reclaim just put back on the LRU but has not released yet. + */ + memcg = oldpage->mem_cgroup; + if (!memcg) + return; + + if (lrucare) + lock_page_lru(oldpage, &isolated); + + /* Detach the charge from @oldpage; it is committed to @newpage below. */ + oldpage->mem_cgroup = NULL; + + if (lrucare) + unlock_page_lru(oldpage, isolated); + + commit_charge(newpage, memcg, lrucare); +} + +/* + * subsys_initcall() for memory controller. + * + * Some parts like hotcpu_notifier() have to be initialized from this context + * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically + * everything that doesn't depend on a specific mem_cgroup structure should + * be initialized from here. + * + * Always returns 0. + */ +static int __init mem_cgroup_init(void) +{ + int cpu, node; + + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); + + /* Point every possible CPU's memcg_stock work item at drain_local_stock(). */ + for_each_possible_cpu(cpu) + INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, + drain_local_stock); + + for_each_node(node) { + struct mem_cgroup_tree_per_node *rtpn; + int zone; + + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, + node_online(node) ? 
node : NUMA_NO_NODE); + /* NOTE(review): the kzalloc_node() result is used below without a NULL check. */ + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + struct mem_cgroup_tree_per_zone *rtpz; + + rtpz = &rtpn->rb_tree_per_zone[zone]; + rtpz->rb_root = RB_ROOT; + spin_lock_init(&rtpz->lock); + } + soft_limit_tree.rb_tree_per_node[node] = rtpn; + } + + return 0; +} +subsys_initcall(mem_cgroup_init); + +#ifdef CONFIG_MEMCG_SWAP +/** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @page: page whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @page to @entry. + * + * Does nothing when swap accounting is off (do_swap_account is zero) or + * when @page carries no charge. + */ +void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +{ + struct mem_cgroup *memcg; + unsigned short oldid; + unsigned long flags; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (!do_swap_account) + return; + + memcg = page->mem_cgroup; + + /* Readahead page, never charged */ + if (!memcg) + return; + + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); + + page->mem_cgroup = NULL; + + /* The root group's memory counter is not uncharged here. */ + if (!mem_cgroup_is_root(memcg)) + page_counter_uncharge(&memcg->memory, 1); + + local_lock_irqsave(event_lock, flags); + /* Caller disabled preemption with mapping->tree_lock */ + mem_cgroup_charge_statistics(memcg, page, -1); + memcg_check_events(memcg, page); + local_unlock_irqrestore(event_lock, flags); +} + +/** + * mem_cgroup_uncharge_swap - uncharge a swap entry + * @entry: swap entry to uncharge + * + * Drop the memsw charge associated with @entry. 
+ */ +void mem_cgroup_uncharge_swap(swp_entry_t entry) +{ + struct mem_cgroup *memcg; + unsigned short id; + + if (!do_swap_account) + return; + + id = swap_cgroup_record(entry, 0); + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (memcg) { + if (!mem_cgroup_is_root(memcg)) + page_counter_uncharge(&memcg->memsw, 1); + mem_cgroup_swap_statistics(memcg, false); + /* NOTE(review): presumably drops a reference taken when the entry was recorded - confirm. */ + css_put(&memcg->css); + } + rcu_read_unlock(); +} + +/* + * Remembers the "swapaccount=" boot option until mem_cgroup_swap_init() + * reads it. Starts as 1 when CONFIG_MEMCG_SWAP_ENABLED is set, otherwise 0. + */ +#ifdef CONFIG_MEMCG_SWAP_ENABLED +static int really_do_swap_account __initdata = 1; +#else +static int really_do_swap_account __initdata; +#endif + +/* + * Parse the "swapaccount=" boot parameter: "1" enables and "0" disables + * swap accounting; any other value leaves the default untouched. + * Always returns 1. + */ +static int __init enable_swap_account(char *s) +{ + if (!strcmp(s, "1")) + really_do_swap_account = 1; + else if (!strcmp(s, "0")) + really_do_swap_account = 0; + return 1; +} +__setup("swapaccount=", enable_swap_account); + +/* memsw.* control files, registered by mem_cgroup_swap_init(). */ +static struct cftype memsw_cgroup_files[] = { + { + .name = "memsw.usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "memsw.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "memsw.limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "memsw.failcnt", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { }, /* terminate */ +}; + +/* + * Turn on swap accounting and register the memsw.* files, unless the memory + * controller is disabled or swap accounting was switched off. + * A registration failure only triggers WARN_ON(). Always returns 0. + */ +static int __init mem_cgroup_swap_init(void) +{ + if (!mem_cgroup_disabled() && really_do_swap_account) { + do_swap_account = 1; + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, + memsw_cgroup_files)); + } + return 0; +} +subsys_initcall(mem_cgroup_swap_init); + +#endif /* CONFIG_MEMCG_SWAP */ diff --git a/kernel/mm/memory-failure.c b/kernel/mm/memory-failure.c new file mode 100644 index 000000000..501820c81 --- /dev/null +++ b/kernel/mm/memory-failure.c @@ -0,0 +1,1790 @@ 
+/* + * Copyright (C) 2008, 2009 Intel Corporation + * Authors: Andi Kleen, Fengguang Wu + * + * This software may be redistributed and/or modified under the terms of + * the GNU General Public License ("GPL") version 2 only as published by the + * Free Software Foundation. + * + * High level machine check handler. Handles pages reported by the + * hardware as being corrupted usually due to a multi-bit ECC memory or cache + * failure. + * + * In addition there is a "soft offline" entry point that allows one to stop + * using not-yet-corrupted-but-suspicious pages without killing anything. + * + * Handles page cache pages in various states. The tricky part + * here is that we can access any page asynchronously with respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means + * the error handling takes potentially a long time. + * + * There are several operations here with exponential complexity because + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and + * has non-linear complexity in the number of processes. But since memory + * corruptions are rare we hope to get away with this. This avoids impacting + * the core VM. 
+ */ + +/* + * Notebook: + * - hugetlb needs more code + * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages + * - pass bad pages to kdump next kernel + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +int sysctl_memory_failure_early_kill __read_mostly = 0; + +int sysctl_memory_failure_recovery __read_mostly = 1; + +atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); + +#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) + +u32 hwpoison_filter_enable = 0; +u32 hwpoison_filter_dev_major = ~0U; +u32 hwpoison_filter_dev_minor = ~0U; +u64 hwpoison_filter_flags_mask; +u64 hwpoison_filter_flags_value; +EXPORT_SYMBOL_GPL(hwpoison_filter_enable); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); + +/* + * Device filter. Returns 0 when @p passes: either no major/minor filter is + * configured (both are ~0U, which acts as a wildcard) or the device backing + * the page's mapping matches. Returns -EINVAL for slab pages, pages without + * a mapping or host inode, and pages on a non-matching device. + */ +static int hwpoison_filter_dev(struct page *p) +{ + struct address_space *mapping; + dev_t dev; + + if (hwpoison_filter_dev_major == ~0U && + hwpoison_filter_dev_minor == ~0U) + return 0; + + /* + * page_mapping() does not accept slab pages. 
+ */ + if (PageSlab(p)) + return -EINVAL; + + mapping = page_mapping(p); + if (mapping == NULL || mapping->host == NULL) + return -EINVAL; + + dev = mapping->host->i_sb->s_dev; + if (hwpoison_filter_dev_major != ~0U && + hwpoison_filter_dev_major != MAJOR(dev)) + return -EINVAL; + if (hwpoison_filter_dev_minor != ~0U && + hwpoison_filter_dev_minor != MINOR(dev)) + return -EINVAL; + + return 0; +} + +/* + * Page-flags filter. Returns 0 when no mask is configured or when the page's + * stable flags, masked with hwpoison_filter_flags_mask, equal + * hwpoison_filter_flags_value; returns -EINVAL otherwise. + */ +static int hwpoison_filter_flags(struct page *p) +{ + if (!hwpoison_filter_flags_mask) + return 0; + + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == + hwpoison_filter_flags_value) + return 0; + else + return -EINVAL; +} + +/* + * This allows stress tests to limit test scope to a collection of tasks + * by putting them under some memcg. This prevents killing unrelated/important + * processes such as /sbin/init. Note that the target task may share clean + * pages with init (eg. libc text), which is harmless. If the target task + * share _dirty_ pages with another task B, the test scheme must make sure B + * is also included in the memcg. At last, due to race conditions this filter + * can only guarantee that the page either belongs to the memcg tasks, or is + * a freed page. 
+ */ +#ifdef CONFIG_MEMCG_SWAP +u64 hwpoison_filter_memcg; +EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); +/* + * Returns 0 when no memcg filter is set, or when the cgroup inode number of + * the mem_cgroup obtained from @p equals hwpoison_filter_memcg; + * returns -EINVAL otherwise. + */ +static int hwpoison_filter_task(struct page *p) +{ + struct mem_cgroup *mem; + struct cgroup_subsys_state *css; + unsigned long ino; + + if (!hwpoison_filter_memcg) + return 0; + + mem = try_get_mem_cgroup_from_page(p); + if (!mem) + return -EINVAL; + + css = mem_cgroup_css(mem); + ino = cgroup_ino(css->cgroup); + css_put(css); + + if (ino != hwpoison_filter_memcg) + return -EINVAL; + + return 0; +} +#else +static int hwpoison_filter_task(struct page *p) { return 0; } +#endif + +/* + * Apply the configured injection filters to @p. + * + * Returns 0 when filtering is disabled or @p passes the device, page-flags + * and memcg filters; returns -EINVAL as soon as one of them rejects the page. + */ +int hwpoison_filter(struct page *p) +{ + if (!hwpoison_filter_enable) + return 0; + + if (hwpoison_filter_dev(p)) + return -EINVAL; + + if (hwpoison_filter_flags(p)) + return -EINVAL; + + if (hwpoison_filter_task(p)) + return -EINVAL; + + return 0; +} +#else +int hwpoison_filter(struct page *p) +{ + return 0; +} +#endif + +EXPORT_SYMBOL_GPL(hwpoison_filter); + +/* + * Send all the processes who have the page mapped a signal. + * ``action optional'' if they are not immediately affected by the error + * ``action required'' if error happened in current execution context + * + * Returns the result of the signal delivery call; a negative value is + * also logged. + */ +static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, + unsigned long pfn, struct page *page, int flags) +{ + /* NOTE(review): si is not zero-initialised; only the fields assigned below are defined. */ + struct siginfo si; + int ret; + + printk(KERN_ERR + "MCE %#lx: Killing %s:%d due to hardware memory corruption\n", + pfn, t->comm, t->pid); + si.si_signo = SIGBUS; + si.si_errno = 0; + si.si_addr = (void *)addr; +#ifdef __ARCH_SI_TRAPNO + si.si_trapno = trapno; +#endif + /* Order of the compound page containing @page, plus PAGE_SHIFT. */ + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + + /* Action required and @t shares current's mm: force the signal on current. */ + if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { + si.si_code = BUS_MCEERR_AR; + ret = force_sig_info(SIGBUS, &si, current); + } else { + /* + * Don't use force here, it's convenient if the signal + * can be temporarily blocked. + * This could cause a loop when the user sets SIGBUS + * to SIG_IGN, but hopefully no one will do that? 
+ */ + si.si_code = BUS_MCEERR_AO; + ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ + } + if (ret < 0) + printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", + t->comm, t->pid, ret); + return ret; +} + +/* + * When an unknown page type is encountered drain as many buffers as possible + * in the hope to turn the page into a LRU or free page, which we can handle. + */ +void shake_page(struct page *p, int access) +{ + if (!PageSlab(p)) { + lru_add_drain_all(); + if (PageLRU(p)) + return; + drain_all_pages(page_zone(p)); + if (PageLRU(p) || is_free_buddy_page(p)) + return; + } + + /* + * Only call drop_slab_node() here (which would also shrink + * other caches) if access is not potentially fatal. + */ + if (access) + drop_slab_node(page_to_nid(p)); +} +EXPORT_SYMBOL_GPL(shake_page); + +/* + * Kill all processes that have a poisoned page mapped and then isolate + * the page. + * + * General strategy: + * Find all processes having the page mapped and kill them. + * But we keep a page reference around so that the page is not + * actually freed yet. + * Then stash the page away + * + * There's no convenient way to get back to mapped processes + * from the VMAs. So do a brute-force search over all + * running processes. + * + * Remember that machine checks are not common (or rather + * if they are common you have other problems), so this shouldn't + * be a performance issue. + * + * Also there are some races possible while we get from the + * error detection to actually handle it. + */ + +/* + * One entry per task collected for signalling: list linkage (nd), the task, + * the address of the poisoned page in that task's VMA, and whether that + * address lookup succeeded (addr_valid). + */ +struct to_kill { + struct list_head nd; + struct task_struct *tsk; + unsigned long addr; + char addr_valid; +}; + +/* + * Failure handling: if we can't find or can't kill a process there's + * not much we can do. We just print a message and ignore otherwise. + */ + +/* + * Schedule a process for later kill. + * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. + * TBD would GFP_NOIO be enough? 
+ */ +static void add_to_kill(struct task_struct *tsk, struct page *p, + struct vm_area_struct *vma, + struct list_head *to_kill, + struct to_kill **tkc) +{ + struct to_kill *tk; + + /* Use the caller's preallocated entry first, then fall back to an atomic allocation. */ + if (*tkc) { + tk = *tkc; + *tkc = NULL; + } else { + tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); + if (!tk) { + printk(KERN_ERR + "MCE: Out of memory while machine check handling\n"); + return; + } + } + tk->addr = page_address_in_vma(p, vma); + tk->addr_valid = 1; + + /* + * In theory we don't have to kill when the page was + * munmapped. But it could be also a mremap. Since that's + * likely very rare kill anyways just out of paranoia, but use + * a SIGKILL because the error is not contained anymore. + */ + if (tk->addr == -EFAULT) { + /* NOTE(review): the value printed as "address" is the page frame number. */ + pr_info("MCE: Unable to find user space address %lx in %s\n", + page_to_pfn(p), tsk->comm); + tk->addr_valid = 0; + } + /* Task reference is dropped again in kill_procs(). */ + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Kill the processes that have been collected earlier. + * + * Only signal anything when @forcekill is set, otherwise just free the list + * (this is used for clean pages which do not need killing) + * Also when @fail is set, or an entry's address is not valid, send SIGKILL + * instead of SIGBUS because something went wrong earlier. + * + * Every entry is released (task reference dropped, memory freed) either way. + */ +static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, + int fail, struct page *page, unsigned long pfn, + int flags) +{ + struct to_kill *tk, *next; + + list_for_each_entry_safe (tk, next, to_kill, nd) { + if (forcekill) { + /* + * In case something went wrong with munmapping + * make sure the process doesn't catch the + * signal and then access the memory. Just kill it. + */ + if (fail || tk->addr_valid == 0) { + printk(KERN_ERR + "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pfn, tk->tsk->comm, tk->tsk->pid); + force_sig(SIGKILL, tk->tsk); + } + + /* + * In theory the process could have mapped + * something else on the address in-between. We could + * check for that, but we need to tell the + * process anyways. 
+ */ + else if (kill_proc(tk->tsk, tk->addr, trapno, + pfn, page, flags) < 0) + printk(KERN_ERR + "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pfn, tk->tsk->comm, tk->tsk->pid); + } + put_task_struct(tk->tsk); + kfree(tk); + } +} + +/* + * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) + * on behalf of the thread group. Return task_struct of the (first found) + * dedicated thread if found, and return NULL otherwise. + * + * A thread qualifies when both PF_MCE_PROCESS and PF_MCE_EARLY are set in + * its flags. + * + * We already hold read_lock(&tasklist_lock) in the caller, so we don't + * have to call rcu_read_lock/unlock() in this function. + */ +static struct task_struct *find_early_kill_thread(struct task_struct *tsk) +{ + struct task_struct *t; + + for_each_thread(tsk, t) + if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) + return t; + return NULL; +} + +/* + * Determine whether a given process is "early kill" process which expects + * to be signaled when some page under the process is hwpoisoned. + * Return task_struct of the dedicated thread (main thread unless explicitly + * specified) if the process is "early kill," and otherwise returns NULL. + * + * Order of checks: tasks without an mm are never selected; @force_early + * selects @tsk itself; then a dedicated thread is looked up; finally the + * sysctl_memory_failure_early_kill setting selects @tsk. + */ +static struct task_struct *task_early_kill(struct task_struct *tsk, + int force_early) +{ + struct task_struct *t; + if (!tsk->mm) + return NULL; + if (force_early) + return tsk; + t = find_early_kill_thread(tsk); + if (t) + return t; + if (sysctl_memory_failure_early_kill) + return tsk; + return NULL; +} + +/* + * Collect processes when the error hit an anonymous page. 
+ */ +static void collect_procs_anon(struct page *page, struct list_head *to_kill, + struct to_kill **tkc, int force_early) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct anon_vma *av; + pgoff_t pgoff; + + av = page_lock_anon_vma_read(page); + if (av == NULL) /* Not actually mapped anymore */ + return; + + pgoff = page_to_pgoff(page); + read_lock(&tasklist_lock); + for_each_process (tsk) { + struct anon_vma_chain *vmac; + /* Only tasks selected by task_early_kill() are considered. */ + struct task_struct *t = task_early_kill(tsk, force_early); + + if (!t) + continue; + anon_vma_interval_tree_foreach(vmac, &av->rb_root, + pgoff, pgoff) { + vma = vmac->vma; + /* Unlike the file case, require the page to be mapped in this VMA. */ + if (!page_mapped_in_vma(page, vma)) + continue; + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); + } + } + read_unlock(&tasklist_lock); + page_unlock_anon_vma_read(av); +} + +/* + * Collect processes when the error hit a file mapped page. + */ +static void collect_procs_file(struct page *page, struct list_head *to_kill, + struct to_kill **tkc, int force_early) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct address_space *mapping = page->mapping; + + i_mmap_lock_read(mapping); + read_lock(&tasklist_lock); + for_each_process(tsk) { + pgoff_t pgoff = page_to_pgoff(page); + struct task_struct *t = task_early_kill(tsk, force_early); + + if (!t) + continue; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, + pgoff) { + /* + * Send early kill signal to tasks where a vma covers + * the page but the corrupted page is not necessarily + * mapped in its pte. + * Assume applications who requested early kill want + * to be informed of all such data corruptions. + */ + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); + } + } + read_unlock(&tasklist_lock); + i_mmap_unlock_read(mapping); +} + +/* + * Collect the processes who have the corrupted page mapped to kill. + * This is done in two steps for locking reasons. 
+ * First preallocate one tokill structure outside the spin locks, + * so that we can kill at least one process reasonably reliable. + */ +static void collect_procs(struct page *page, struct list_head *tokill, + int force_early) +{ + struct to_kill *tk; + + if (!page->mapping) + return; + + tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); + if (!tk) + return; + if (PageAnon(page)) + collect_procs_anon(page, tokill, &tk, force_early); + else + collect_procs_file(page, tokill, &tk, force_early); + /* add_to_kill() clears tk when it consumes the entry, so this frees only an unused one. */ + kfree(tk); +} + +/* + * Error handlers for various types of pages. + */ + +enum outcome { + IGNORED, /* Error: cannot be handled */ + FAILED, /* Error: handling failed */ + DELAYED, /* Will be handled later */ + RECOVERED, /* Successfully recovered */ +}; + +/* Printable names for enum outcome, indexed by outcome value. */ +static const char *action_name[] = { + [IGNORED] = "Ignored", + [FAILED] = "Failed", + [DELAYED] = "Delayed", + [RECOVERED] = "Recovered", +}; + +/* Page categories reported in log messages; indexes action_page_types[]. */ +enum action_page_type { + MSG_KERNEL, + MSG_KERNEL_HIGH_ORDER, + MSG_SLAB, + MSG_DIFFERENT_COMPOUND, + MSG_POISONED_HUGE, + MSG_HUGE, + MSG_FREE_HUGE, + MSG_UNMAP_FAILED, + MSG_DIRTY_SWAPCACHE, + MSG_CLEAN_SWAPCACHE, + MSG_DIRTY_MLOCKED_LRU, + MSG_CLEAN_MLOCKED_LRU, + MSG_DIRTY_UNEVICTABLE_LRU, + MSG_CLEAN_UNEVICTABLE_LRU, + MSG_DIRTY_LRU, + MSG_CLEAN_LRU, + MSG_TRUNCATED_LRU, + MSG_BUDDY, + MSG_BUDDY_2ND, + MSG_UNKNOWN, +}; + +/* Printable descriptions for enum action_page_type, used by action_result(). */ +static const char * const action_page_types[] = { + [MSG_KERNEL] = "reserved kernel page", + [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", + [MSG_SLAB] = "kernel slab page", + [MSG_DIFFERENT_COMPOUND] = "different compound page after locking", + [MSG_POISONED_HUGE] = "huge page already hardware poisoned", + [MSG_HUGE] = "huge page", + [MSG_FREE_HUGE] = "free huge page", + [MSG_UNMAP_FAILED] = "unmapping failed page", + [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", + [MSG_CLEAN_SWAPCACHE] = "clean swapcache page", + [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", + [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", + 
[MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", + [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", + [MSG_DIRTY_LRU] = "dirty LRU page", + [MSG_CLEAN_LRU] = "clean LRU page", + [MSG_TRUNCATED_LRU] = "already truncated LRU page", + [MSG_BUDDY] = "free buddy page", + [MSG_BUDDY_2ND] = "free buddy page (2nd try)", + [MSG_UNKNOWN] = "unknown page", +}; + +/* + * XXX: It is possible that a page is isolated from LRU cache, + * and then kept in swap cache or failed to remove from page cache. + * The page count will stop it from being freed by unpoison. + * Stress tests should be aware of this memory leak problem. + * + * Returns 0 when the page was isolated from the LRU, -EIO otherwise. + */ +static int delete_from_lru_cache(struct page *p) +{ + if (!isolate_lru_page(p)) { + /* + * Clear sensible page flags, so that the buddy system won't + * complain when the page is unpoison-and-freed. + */ + ClearPageActive(p); + ClearPageUnevictable(p); + /* + * drop the page count elevated by isolate_lru_page() + */ + page_cache_release(p); + return 0; + } + return -EIO; +} + +/* + * Error hit kernel page. + * Do nothing, try to be lucky and not touch this instead. For a few cases we + * could be more sophisticated. + */ +static int me_kernel(struct page *p, unsigned long pfn) +{ + return IGNORED; +} + +/* + * Page in unknown state. Do nothing. + */ +static int me_unknown(struct page *p, unsigned long pfn) +{ + printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); + return FAILED; +} + +/* + * Clean (or cleaned) page cache page. + */ +static int me_pagecache_clean(struct page *p, unsigned long pfn) +{ + int err; + int ret = FAILED; + struct address_space *mapping; + + /* NOTE(review): the result of delete_from_lru_cache() is ignored here. */ + delete_from_lru_cache(p); + + /* + * For anonymous pages we're done; the only reference left + * should be the one m_f() holds. + */ + if (PageAnon(p)) + return RECOVERED; + + /* + * Now truncate the page in the page cache. 
This is really + * more like a "temporary hole punch" + * Don't do this for block devices when someone else + * has a reference, because it could be file system metadata + * and that's not safe to truncate. + */ + mapping = page_mapping(p); + if (!mapping) { + /* + * Page has been torn down in the meanwhile + */ + return FAILED; + } + + /* + * Truncation is a bit tricky. Enable it per file system for now. + * + * Open: to take i_mutex or not for this? Right now we don't. + */ + if (mapping->a_ops->error_remove_page) { + err = mapping->a_ops->error_remove_page(mapping, p); + if (err != 0) { + printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", + pfn, err); + } else if (page_has_private(p) && + !try_to_release_page(p, GFP_NOIO)) { + pr_info("MCE %#lx: failed to release buffers\n", pfn); + } else { + ret = RECOVERED; + } + } else { + /* + * If the file system doesn't support it just invalidate + * This fails on dirty or anything with private pages + */ + if (invalidate_inode_page(p)) + ret = RECOVERED; + else + printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", + pfn); + } + return ret; +} + +/* + * Dirty pagecache page + * Issues: when the error hit a hole page the error is not properly + * propagated. + * + * Flags the page and, if there is one, its mapping with an error, then + * handles the page like a clean one and returns that result. + */ +static int me_pagecache_dirty(struct page *p, unsigned long pfn) +{ + struct address_space *mapping = page_mapping(p); + + SetPageError(p); + /* TBD: print more information about the file. */ + if (mapping) { + /* + * IO error will be reported by write(), fsync(), etc. + * who check the mapping. + * This way the application knows that something went + * wrong with its dirty file data. + * + * There's one open issue: + * + * The EIO will be only reported on the next IO + * operation and then cleared through the IO map. + * Normally Linux has two mechanisms to pass IO error + * first through the AS_EIO flag in the address space + * and then through the PageError flag in the page. 
+ * Since we drop pages on memory failure handling the + * only mechanism open to use is through AS_EIO. + * + * This has the disadvantage that it gets cleared on + * the first operation that returns an error, while + * the PageError bit is more sticky and only cleared + * when the page is reread or dropped. If an + * application assumes it will always get error on + * fsync, but does other operations on the fd before + * and the page is dropped between then the error + * will not be properly reported. + * + * This can already happen even without hwpoisoned + * pages: first on metadata IO errors (which only + * report through AS_EIO) or when the page is dropped + * at the wrong time. + * + * So right now we assume that the application DTRT on + * the first EIO, but we're not worse than other parts + * of the kernel. + * + * NOTE(review): EIO is passed as a positive value; confirm + * the sign mapping_set_error() expects. + */ + mapping_set_error(mapping, EIO); + } + + return me_pagecache_clean(p, pfn); +} + +/* + * Clean and dirty swap cache. + * + * Dirty swap cache page is tricky to handle. The page could live both in page + * cache and swap cache(ie. page is freshly swapped in). So it could be + * referenced concurrently by 2 types of PTEs: + * normal PTEs and swap PTEs. We try to handle them consistently by calling + * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, + * and then + * - clear dirty bit to prevent IO + * - remove from LRU + * - but keep in the swap cache, so that when we return to it on + * a later page fault, we know the application is accessing + * corrupted data and shall be killed (we installed simple + * interception code in do_swap_page to catch it). + * + * Clean swap cache pages can be directly isolated. A later page fault will + * bring in the known good data from disk. 
+ */ +static int me_swapcache_dirty(struct page *p, unsigned long pfn) +{ + ClearPageDirty(p); + /* Trigger EIO in shmem: */ + ClearPageUptodate(p); + + /* DELAYED when the page came off the LRU, FAILED otherwise. */ + if (!delete_from_lru_cache(p)) + return DELAYED; + else + return FAILED; +} + +/* + * Clean swap cache page: remove it from the swap cache, then from the LRU. + * Returns RECOVERED when the LRU removal succeeded, FAILED otherwise. + */ +static int me_swapcache_clean(struct page *p, unsigned long pfn) +{ + delete_from_swap_cache(p); + + if (!delete_from_lru_cache(p)) + return RECOVERED; + else + return FAILED; +} + +/* + * Huge pages. Needs work. + * Issues: + * - Error on hugepage is contained in hugepage unit (not in raw page unit.) + * To narrow down kill region to one page, we need to break up pmd. + * + * Returns RECOVERED only when the huge page has neither a mapping nor is + * anonymous and dequeuing it succeeded; DELAYED in every other case. + */ +static int me_huge_page(struct page *p, unsigned long pfn) +{ + int res = 0; + struct page *hpage = compound_head(p); + /* + * We can safely recover from error on free or reserved (i.e. + * not in-use) hugepage by dequeuing it from freelist. + * To check whether a hugepage is in-use or not, we can't use + * page->lru because it can be used in other hugepage operations, + * such as __unmap_hugepage_range() and gather_surplus_pages(). + * So instead we use page_mapping() and PageAnon(). + * We assume that this function is called with page lock held, + * so there is no race between isolation and mapping/unmapping. + */ + if (!(page_mapping(hpage) || PageAnon(hpage))) { + res = dequeue_hwpoisoned_huge_page(hpage); + if (!res) + return RECOVERED; + } + return DELAYED; +} + +/* + * Various page states we can handle. + * + * A page state is defined by its current page->flags bits. + * The table matches them in order and calls the right handler. + * + * This is quite tricky because we can access page at any time + * in its live cycle, so all accesses have to be extremely careful. + * + * This is not complete. More states could be added. + * For any missing state don't attempt recovery. 
+ */ + +/* Short aliases for single page-flag bits; used only to build error_states[] and undefined right after it. */ +#define dirty (1UL << PG_dirty) +#define sc (1UL << PG_swapcache) +#define unevict (1UL << PG_unevictable) +#define mlock (1UL << PG_mlocked) +#define writeback (1UL << PG_writeback) +#define lru (1UL << PG_lru) +#define swapbacked (1UL << PG_swapbacked) +#define head (1UL << PG_head) +#define tail (1UL << PG_tail) +#define compound (1UL << PG_compound) +#define slab (1UL << PG_slab) +#define reserved (1UL << PG_reserved) + +/* + * One table row: flag bits to look at (mask), the value they must have (res), + * the page type used in log messages, and the handler to run. + * NOTE(review): the lookup itself is outside this excerpt; presumably a row + * matches when (page flags & mask) == res - confirm against the caller. + */ +static struct page_state { + unsigned long mask; + unsigned long res; + enum action_page_type type; + int (*action)(struct page *p, unsigned long pfn); +} error_states[] = { + { reserved, reserved, MSG_KERNEL, me_kernel }, + /* + * free pages are specially detected outside this table: + * PG_buddy pages only make a small fraction of all free pages. + */ + + /* + * Could in theory check if slab page is free or if we can drop + * currently unused objects without touching them. But just + * treat it as standard kernel for now. + */ + { slab, slab, MSG_SLAB, me_kernel }, + +#ifdef CONFIG_PAGEFLAGS_EXTENDED + { head, head, MSG_HUGE, me_huge_page }, + { tail, tail, MSG_HUGE, me_huge_page }, +#else + { compound, compound, MSG_HUGE, me_huge_page }, +#endif + + { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, + { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, + + { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, + { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, + + { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, + { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, + + { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty }, + { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean }, + + /* + * Catchall entry: must be at end. 
+ */ + { 0, 0, MSG_UNKNOWN, me_unknown }, +}; + +#undef dirty +#undef sc +#undef unevict +#undef mlock +#undef writeback +#undef lru +#undef swapbacked +#undef head +#undef tail +#undef compound +#undef slab +#undef reserved + +/* + * Log the outcome of a recovery action for page frame @pfn. + * + * "Dirty/Clean" indication is not 100% accurate due to the possibility of + * setting PG_dirty outside page lock. See also comment above set_page_dirty(). + */ +static void action_result(unsigned long pfn, enum action_page_type type, int result) +{ + pr_err("MCE %#lx: recovery action for %s: %s\n", + pfn, action_page_types[type], action_name[result]); +} + +/* + * Run the handler of @ps on @p and log the outcome. + * + * After the handler ran, one page reference is expected to remain (two for a + * DELAYED me_swapcache_dirty result); any further reference turns the result + * into FAILED. + * + * Returns 0 for RECOVERED or DELAYED, -EBUSY otherwise. + */ +static int page_action(struct page_state *ps, struct page *p, + unsigned long pfn) +{ + int result; + int count; + + result = ps->action(p, pfn); + + count = page_count(p) - 1; + if (ps->action == me_swapcache_dirty && result == DELAYED) + count--; + if (count != 0) { + printk(KERN_ERR + "MCE %#lx: %s still referenced by %d users\n", + pfn, action_page_types[ps->type], count); + result = FAILED; + } + action_result(pfn, ps->type, result); + + /* Could do more checks here if page looks ok */ + /* + * Could adjust zone counters here to correct for the missing page. + */ + + return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; +} + +/* + * Do all that is necessary to remove user space mappings. Unmap + * the pages and send SIGBUS to the processes if the data was dirty. + */ +static int hwpoison_user_mappings(struct page *p, unsigned long pfn, + int trapno, int flags, struct page **hpagep) +{ + enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + struct address_space *mapping; + LIST_HEAD(tokill); + int ret; + int kill = 1, forcekill; + struct page *hpage = *hpagep; + struct page *ppage; + + /* + * Here we are interested only in user-mapped pages, so skip any + * other types of pages. 
+ */ + if (PageReserved(p) || PageSlab(p)) + return SWAP_SUCCESS; + if (!(PageLRU(hpage) || PageHuge(p))) + return SWAP_SUCCESS; + + /* + * This check implies we don't kill processes if their pages + * are in the swap cache early. Those are always late kills. + */ + if (!page_mapped(hpage)) + return SWAP_SUCCESS; + + if (PageKsm(p)) { + pr_err("MCE %#lx: can't handle KSM pages.\n", pfn); + return SWAP_FAIL; + } + + if (PageSwapCache(p)) { + printk(KERN_ERR + "MCE %#lx: keeping poisoned page in swap cache\n", pfn); + ttu |= TTU_IGNORE_HWPOISON; + } + + /* + * Propagate the dirty bit from PTEs to struct page first, because we + * need this to decide if we should kill or just drop the page. + * XXX: the dirty test could be racy: set_page_dirty() may not always + * be called inside page lock (it's recommended but not enforced). + */ + mapping = page_mapping(hpage); + if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && + mapping_cap_writeback_dirty(mapping)) { + if (page_mkclean(hpage)) { + SetPageDirty(hpage); + } else { + kill = 0; + ttu |= TTU_IGNORE_HWPOISON; + printk(KERN_INFO + "MCE %#lx: corrupted page was clean: dropped without side effects\n", + pfn); + } + } + + /* + * ppage: poisoned page + * if p is regular page(4k page) + * ppage == real poisoned page; + * else p is hugetlb or THP, ppage == head page. + */ + ppage = hpage; + + if (PageTransHuge(hpage)) { + /* + * Verify that this isn't a hugetlbfs head page, the check for + * PageAnon is just for avoid tripping a split_huge_page + * internal debug check, as split_huge_page refuses to deal with + * anything that isn't an anon page. PageAnon can't go away fro + * under us because we hold a refcount on the hpage, without a + * refcount on the hpage. split_huge_page can't be safely called + * in the first place, having a refcount on the tail isn't + * enough * to be safe. 
+ */ + if (!PageHuge(hpage) && PageAnon(hpage)) { + if (unlikely(split_huge_page(hpage))) { + /* + * FIXME: if splitting THP is failed, it is + * better to stop the following operation rather + * than causing panic by unmapping. System might + * survive if the page is freed later. + */ + printk(KERN_INFO + "MCE %#lx: failed to split THP\n", pfn); + + BUG_ON(!PageHWPoison(p)); + return SWAP_FAIL; + } + /* + * We pinned the head page for hwpoison handling, + * now we split the thp and we are interested in + * the hwpoisoned raw page, so move the refcount + * to it. Similarly, page lock is shifted. + */ + if (hpage != p) { + if (!(flags & MF_COUNT_INCREASED)) { + put_page(hpage); + get_page(p); + } + lock_page(p); + unlock_page(hpage); + *hpagep = p; + } + /* THP is split, so ppage should be the real poisoned page. */ + ppage = p; + } + } + + /* + * First collect all the processes that have the page + * mapped in dirty form. This has to be done before try_to_unmap, + * because ttu takes the rmap data structures down. + * + * Error handling: We ignore errors here because + * there's nothing that can be done. + */ + if (kill) + collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); + + ret = try_to_unmap(ppage, ttu); + if (ret != SWAP_SUCCESS) + printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(ppage)); + + /* + * Now that the dirty bit has been propagated to the + * struct page and all unmaps done we can decide if + * killing is needed or not. Only kill when the page + * was dirty or the process is not restartable, + * otherwise the tokill list is merely + * freed. When there was a problem unmapping earlier + * use a more force-full uncatchable kill to prevent + * any accesses to the poisoned memory. 
+ */ + forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); + kill_procs(&tokill, forcekill, trapno, + ret != SWAP_SUCCESS, p, pfn, flags); + + return ret; +} + +static void set_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_order(hpage); + for (i = 0; i < nr_pages; i++) + SetPageHWPoison(hpage + i); +} + +static void clear_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_order(hpage); + for (i = 0; i < nr_pages; i++) + ClearPageHWPoison(hpage + i); +} + +/** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * @flags: fine tune action taken + * + * This function is called by the low level machine check code + * of an architecture when it detects hardware memory corruption + * of a page. It tries its best to recover, which includes + * dropping pages, killing processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Must run in process context (e.g. a work queue) with interrupts + * enabled and no spinlocks held. + */ +int memory_failure(unsigned long pfn, int trapno, int flags) +{ + struct page_state *ps; + struct page *p; + struct page *hpage; + int res; + unsigned int nr_pages; + unsigned long page_flags; + + if (!sysctl_memory_failure_recovery) + panic("Memory failure from trap %d on page %lx", trapno, pfn); + + if (!pfn_valid(pfn)) { + printk(KERN_ERR + "MCE %#lx: memory outside kernel control\n", + pfn); + return -ENXIO; + } + + p = pfn_to_page(pfn); + hpage = compound_head(p); + if (TestSetPageHWPoison(p)) { + printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); + return 0; + } + + /* + * Currently errors on hugetlbfs pages are measured in hugepage units, + * so nr_pages should be 1 << compound_order. 
OTOH when errors are on + * transparent hugepages, they are supposed to be split and error + * measurement is done in normal page units. So nr_pages should be one + * in this case. + */ + if (PageHuge(p)) + nr_pages = 1 << compound_order(hpage); + else /* normal page or thp */ + nr_pages = 1; + atomic_long_add(nr_pages, &num_poisoned_pages); + + /* + * We need/can do nothing about count=0 pages. + * 1) it's a free page, and therefore in safe hand: + * prep_new_page() will be the gate keeper. + * 2) it's a free hugepage, which is also safe: + * an affected hugepage will be dequeued from hugepage freelist, + * so there's no concern about reusing it ever after. + * 3) it's part of a non-compound high order page. + * Implies some kernel user: cannot stop them from + * R/W the page; let's pray that the page has been + * used and will be freed some time later. + * In fact it's dangerous to directly bump up page count from 0, + * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. + */ + if (!(flags & MF_COUNT_INCREASED) && + !get_page_unless_zero(hpage)) { + if (is_free_buddy_page(p)) { + action_result(pfn, MSG_BUDDY, DELAYED); + return 0; + } else if (PageHuge(hpage)) { + /* + * Check "filter hit" and "race with other subpage." + */ + lock_page(hpage); + if (PageHWPoison(hpage)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); + return 0; + } + } + set_page_hwpoison_huge_page(hpage); + res = dequeue_hwpoisoned_huge_page(hpage); + action_result(pfn, MSG_FREE_HUGE, + res ? IGNORED : DELAYED); + unlock_page(hpage); + return res; + } else { + action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED); + return -EBUSY; + } + } + + /* + * We ignore non-LRU pages for good reasons. 
+ * - PG_locked is only well defined for LRU pages and a few others + * - to avoid races with __set_page_locked() + * - to avoid races with __SetPageSlab*() (and more non-atomic ops) + * The check (unnecessarily) ignores LRU pages being isolated and + * walked by the page reclaim code, however that's not a big loss. + */ + if (!PageHuge(p)) { + if (!PageLRU(hpage)) + shake_page(hpage, 0); + if (!PageLRU(hpage)) { + /* + * shake_page could have turned it free. + */ + if (is_free_buddy_page(p)) { + if (flags & MF_COUNT_INCREASED) + action_result(pfn, MSG_BUDDY, DELAYED); + else + action_result(pfn, MSG_BUDDY_2ND, + DELAYED); + return 0; + } + } + } + + lock_page(hpage); + + /* + * The page could have changed compound pages during the locking. + * If this happens just bail out. + */ + if (compound_head(p) != hpage) { + action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); + res = -EBUSY; + goto out; + } + + /* + * We use page flags to determine what action should be taken, but + * the flags can be modified by the error containment action. One + * example is an mlocked page, where PG_mlocked is cleared by + * page_remove_rmap() in try_to_unmap_one(). So to determine page status + * correctly, we save a copy of the page flags at this time. 
+ */ + page_flags = p->flags; + + /* + * unpoison always clear PG_hwpoison inside page lock + */ + if (!PageHWPoison(p)) { + printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + atomic_long_sub(nr_pages, &num_poisoned_pages); + put_page(hpage); + res = 0; + goto out; + } + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(p)) + atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); + put_page(hpage); + return 0; + } + + if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) + goto identify_page_state; + + /* + * For error on the tail page, we should set PG_hwpoison + * on the head page to show that the hugepage is hwpoisoned + */ + if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { + action_result(pfn, MSG_POISONED_HUGE, IGNORED); + unlock_page(hpage); + put_page(hpage); + return 0; + } + /* + * Set PG_hwpoison on all pages in an error hugepage, + * because containment is done in hugepage unit for now. + * Since we have done TestSetPageHWPoison() for the head page with + * page lock held, we can safely set PG_hwpoison bits on tail pages. + */ + if (PageHuge(p)) + set_page_hwpoison_huge_page(hpage); + + /* + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. + */ + wait_on_page_writeback(p); + + /* + * Now take care of user space mappings. + * Abort on fail: __delete_from_page_cache() assumes unmapped page. + * + * When the raw error page is thp tail page, hpage points to the raw + * page after thp split. + */ + if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) + != SWAP_SUCCESS) { + action_result(pfn, MSG_UNMAP_FAILED, IGNORED); + res = -EBUSY; + goto out; + } + + /* + * Torn down by someone else? 
+ */ + if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + action_result(pfn, MSG_TRUNCATED_LRU, IGNORED); + res = -EBUSY; + goto out; + } + +identify_page_state: + res = -EBUSY; + /* + * The first check uses the current page flags which may not have any + * relevant information. The second check with the saved page flags is + * carried out only if the first check can't determine the page status. + */ + for (ps = error_states;; ps++) + if ((p->flags & ps->mask) == ps->res) + break; + + page_flags |= (p->flags & (1UL << PG_dirty)); + + if (!ps->mask) + for (ps = error_states;; ps++) + if ((page_flags & ps->mask) == ps->res) + break; + res = page_action(ps, p, pfn); +out: + unlock_page(hpage); + return res; +} +EXPORT_SYMBOL_GPL(memory_failure); + +#define MEMORY_FAILURE_FIFO_ORDER 4 +#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) + +struct memory_failure_entry { + unsigned long pfn; + int trapno; + int flags; +}; + +struct memory_failure_cpu { + DECLARE_KFIFO(fifo, struct memory_failure_entry, + MEMORY_FAILURE_FIFO_SIZE); + spinlock_t lock; + struct work_struct work; +}; + +static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); + +/** + * memory_failure_queue - Schedule handling memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * @flags: Flags for memory failure handling + * + * This function is called by the low level hardware error handler + * when it detects hardware memory corruption of a page. It schedules + * the recovery of the error page, including dropping pages, killing + * processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Can run in IRQ context. 
+ */ +void memory_failure_queue(unsigned long pfn, int trapno, int flags) +{ + struct memory_failure_cpu *mf_cpu; + unsigned long proc_flags; + struct memory_failure_entry entry = { + .pfn = pfn, + .trapno = trapno, + .flags = flags, + }; + + mf_cpu = &get_cpu_var(memory_failure_cpu); + spin_lock_irqsave(&mf_cpu->lock, proc_flags); + if (kfifo_put(&mf_cpu->fifo, entry)) + schedule_work_on(smp_processor_id(), &mf_cpu->work); + else + pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", + pfn); + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + put_cpu_var(memory_failure_cpu); +} +EXPORT_SYMBOL_GPL(memory_failure_queue); + +static void memory_failure_work_func(struct work_struct *work) +{ + struct memory_failure_cpu *mf_cpu; + struct memory_failure_entry entry = { 0, }; + unsigned long proc_flags; + int gotten; + + mf_cpu = this_cpu_ptr(&memory_failure_cpu); + for (;;) { + spin_lock_irqsave(&mf_cpu->lock, proc_flags); + gotten = kfifo_get(&mf_cpu->fifo, &entry); + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + if (!gotten) + break; + if (entry.flags & MF_SOFT_OFFLINE) + soft_offline_page(pfn_to_page(entry.pfn), entry.flags); + else + memory_failure(entry.pfn, entry.trapno, entry.flags); + } +} + +static int __init memory_failure_init(void) +{ + struct memory_failure_cpu *mf_cpu; + int cpu; + + for_each_possible_cpu(cpu) { + mf_cpu = &per_cpu(memory_failure_cpu, cpu); + spin_lock_init(&mf_cpu->lock); + INIT_KFIFO(mf_cpu->fifo); + INIT_WORK(&mf_cpu->work, memory_failure_work_func); + } + + return 0; +} +core_initcall(memory_failure_init); + +/** + * unpoison_memory - Unpoison a previously poisoned page + * @pfn: Page number of the to be unpoisoned page + * + * Software-unpoison a page that has been poisoned by + * memory_failure() earlier. + * + * This is only done on the software-level, so it only works + * for linux injected failures, not real hardware failures + * + * Returns 0 for success, otherwise -errno. 
+ */ +int unpoison_memory(unsigned long pfn) +{ + struct page *page; + struct page *p; + int freeit = 0; + unsigned int nr_pages; + + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + page = compound_head(p); + + if (!PageHWPoison(p)) { + pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); + return 0; + } + + /* + * unpoison_memory() can encounter thp only when the thp is being + * worked by memory_failure() and the page lock is not held yet. + * In such case, we yield to memory_failure() and make unpoison fail. + */ + if (!PageHuge(page) && PageTransHuge(page)) { + pr_info("MCE: Memory failure is now running on %#lx\n", pfn); + return 0; + } + + nr_pages = 1 << compound_order(page); + + if (!get_page_unless_zero(page)) { + /* + * Since HWPoisoned hugepage should have non-zero refcount, + * race between memory failure and unpoison seems to happen. + * In such case unpoison fails and memory failure runs + * to the end. + */ + if (PageHuge(page)) { + pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + return 0; + } + if (TestClearPageHWPoison(p)) + atomic_long_dec(&num_poisoned_pages); + pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); + return 0; + } + + lock_page(page); + /* + * This test is racy because PG_hwpoison is set outside of page lock. + * That's acceptable because that won't trigger kernel panic. Instead, + * the PG_hwpoison page will be caught and isolated on the entrance to + * the free buddy page pool. 
+ */ + if (TestClearPageHWPoison(page)) { + pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); + atomic_long_sub(nr_pages, &num_poisoned_pages); + freeit = 1; + if (PageHuge(page)) + clear_page_hwpoison_huge_page(page); + } + unlock_page(page); + + put_page(page); + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) + put_page(page); + + return 0; +} +EXPORT_SYMBOL(unpoison_memory); + +static struct page *new_page(struct page *p, unsigned long private, int **x) +{ + int nid = page_to_nid(p); + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + nid); + else + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); +} + +/* + * Safely get reference count of an arbitrary page. + * Returns 0 for a free page, -EIO for a zero refcount page + * that is not free, and 1 for any other page type. + * For 1 the page is returned with increased page count, otherwise not. + */ +static int __get_any_page(struct page *p, unsigned long pfn, int flags) +{ + int ret; + + if (flags & MF_COUNT_INCREASED) + return 1; + + /* + * When the target page is a free hugepage, just remove it + * from free hugepage list. + */ + if (!get_page_unless_zero(compound_head(p))) { + if (PageHuge(p)) { + pr_info("%s: %#lx free huge page\n", __func__, pfn); + ret = 0; + } else if (is_free_buddy_page(p)) { + pr_info("%s: %#lx free buddy page\n", __func__, pfn); + ret = 0; + } else { + pr_info("%s: %#lx: unknown zero refcount page type %lx\n", + __func__, pfn, p->flags); + ret = -EIO; + } + } else { + /* Not a free page */ + ret = 1; + } + return ret; +} + +static int get_any_page(struct page *page, unsigned long pfn, int flags) +{ + int ret = __get_any_page(page, pfn, flags); + + if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { + /* + * Try to free it. + */ + put_page(page); + shake_page(page, 1); + + /* + * Did it turn free? 
+ */ + ret = __get_any_page(page, pfn, 0); + if (ret == 1 && !PageLRU(page)) { + /* Drop the page reference taken by __get_any_page() above. */ + put_page(page); + pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", + pfn, page->flags); + return -EIO; + } + } + return ret; +} + +static int soft_offline_huge_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); + + /* + * This double-check of PageHWPoison is to avoid the race with + * memory_failure(). See also comment in __soft_offline_page(). + */ + lock_page(hpage); + if (PageHWPoison(hpage)) { + unlock_page(hpage); + put_page(hpage); + pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); + return -EBUSY; + } + unlock_page(hpage); + + ret = isolate_huge_page(hpage, &pagelist); + /* + * get_any_page() took a refcount and a successful isolate_huge_page() + * takes another, so drop one here on both the success and failure paths. + */ + put_page(hpage); + if (!ret) { + pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); + return -EBUSY; + } + + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, + MIGRATE_SYNC, MR_MEMORY_FAILURE); + if (ret) { + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + /* + * We know that soft_offline_huge_page() tries to migrate + * only one hugepage pointed to by hpage, so we need not + * run through the pagelist here. 
+ */ + putback_active_hugepage(hpage); + if (ret > 0) + ret = -EIO; + } else { + /* overcommit hugetlb page will be freed to buddy */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + atomic_long_add(1 << compound_order(hpage), + &num_poisoned_pages); + } else { + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + } + } + return ret; +} + +static int __soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + + /* + * Check PageHWPoison again inside page lock because PageHWPoison + * is set by memory_failure() outside page lock. Note that + * memory_failure() also double-checks PageHWPoison inside page lock, + * so there's no race between soft_offline_page() and memory_failure(). + */ + lock_page(page); + wait_on_page_writeback(page); + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + pr_info("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + /* + * Try to invalidate first. This should work for + * non dirty unmapped page cache pages. + */ + ret = invalidate_inode_page(page); + unlock_page(page); + /* + * RED-PEN would be better to keep it isolated here, but we + * would need to fix isolation locking first. + */ + if (ret == 1) { + put_page(page); + pr_info("soft_offline: %#lx: invalidated\n", pfn); + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + return 0; + } + + /* + * Simple invalidation didn't work. + * Try to migrate to a new page instead. migrate.c + * handles a large number of cases for us. + */ + ret = isolate_lru_page(page); + /* + * Drop the page reference which came from get_any_page(); a + * successful isolate_lru_page() already took another one. 
+ */ + put_page(page); + if (!ret) { + LIST_HEAD(pagelist); + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + list_add(&page->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, + MIGRATE_SYNC, MR_MEMORY_FAILURE); + if (ret) { + if (!list_empty(&pagelist)) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } + + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + } else { + /* + * After page migration succeeds, the source page can + * be trapped in pagevec and actual freeing is delayed. + * Freeing code works differently based on PG_hwpoison, + * so there's a race. We need to make sure that the + * source page should be freed back to buddy before + * setting PG_hwpoison. + */ + if (!is_free_buddy_page(page)) + drain_all_pages(page_zone(page)); + SetPageHWPoison(page); + if (!is_free_buddy_page(page)) + pr_info("soft offline: %#lx: page leaked\n", + pfn); + atomic_long_inc(&num_poisoned_pages); + } + } else { + pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pfn, ret, page_count(page), page->flags); + } + return ret; +} + +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. 
+ * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory. + */ +int soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + + if (PageHWPoison(page)) { + pr_info("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + if (!PageHuge(page) && PageTransHuge(hpage)) { + if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { + pr_info("soft offline: %#lx: failed to split THP\n", + pfn); + return -EBUSY; + } + } + + get_online_mems(); + + /* + * Isolate the page, so that it doesn't get reallocated if it + * was free. This flag should be kept set until the source page + * is freed and PG_hwpoison on it is set. + */ + if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + set_migratetype_isolate(page, true); + + ret = get_any_page(page, pfn, flags); + put_online_mems(); + if (ret > 0) { /* for in-use pages */ + if (PageHuge(page)) + ret = soft_offline_huge_page(page, flags); + else + ret = __soft_offline_page(page, flags); + } else if (ret == 0) { /* for free pages */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + if (!dequeue_hwpoisoned_huge_page(hpage)) + atomic_long_add(1 << compound_order(hpage), + &num_poisoned_pages); + } else { + if (!TestSetPageHWPoison(page)) + atomic_long_inc(&num_poisoned_pages); + } + } + unset_migratetype_isolate(page, MIGRATE_MOVABLE); + return ret; +} diff --git a/kernel/mm/memory.c b/kernel/mm/memory.c new file mode 100644 index 000000000..17734c3c1 --- /dev/null +++ b/kernel/mm/memory.c @@ -0,0 +1,3857 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. 
Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + * + * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. +#endif + +#ifndef CONFIG_NEED_MULTIPLE_NODES +/* use the per-pgdat data instead for discontigmem - mbligh */ +unsigned long max_mapnr; +struct page *mem_map; + +EXPORT_SYMBOL(max_mapnr); +EXPORT_SYMBOL(mem_map); +#endif + +/* + * A number of key systems in x86 including ioremap() rely on the assumption + * that high_memory defines the upper bound on direct map memory, then end + * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and + * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL + * and ZONE_HIGHMEM. 
+ */ +void * high_memory; + +EXPORT_SYMBOL(high_memory); + +/* + * Randomize the address space (stacks, mmaps, brk, etc.). + * + * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, + * as ancient (libc5 based) binaries can segfault. ) + */ +int randomize_va_space __read_mostly = +#ifdef CONFIG_COMPAT_BRK + 1; +#else + 2; +#endif + +static int __init disable_randmaps(char *s) +{ + randomize_va_space = 0; + return 1; +} +__setup("norandmaps", disable_randmaps); + +unsigned long zero_pfn __read_mostly; +unsigned long highest_memmap_pfn __read_mostly; + +EXPORT_SYMBOL(zero_pfn); + +/* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +static int __init init_zero_pfn(void) +{ + zero_pfn = page_to_pfn(ZERO_PAGE(0)); + return 0; +} +core_initcall(init_zero_pfn); + + +#if defined(SPLIT_RSS_COUNTING) + +void sync_mm_rss(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + if (current->rss_stat.count[i]) { + add_mm_counter(mm, i, current->rss_stat.count[i]); + current->rss_stat.count[i] = 0; + } + } + current->rss_stat.events = 0; +} + +static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) +{ + struct task_struct *task = current; + + if (likely(task->mm == mm)) + task->rss_stat.count[member] += val; + else + add_mm_counter(mm, member, val); +} +#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) +#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) + +/* sync counter once per 64 page faults */ +#define TASK_RSS_EVENTS_THRESH (64) +static void check_sync_rss_stat(struct task_struct *task) +{ + if (unlikely(task != current)) + return; + if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) + sync_mm_rss(task->mm); +} +#else /* SPLIT_RSS_COUNTING */ + +#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) +#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) + +static void check_sync_rss_stat(struct task_struct 
*task) +{ +} + +#endif /* SPLIT_RSS_COUNTING */ + +#ifdef HAVE_GENERIC_MMU_GATHER + +static int tlb_next_batch(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + + batch = tlb->active; + if (batch->next) { + tlb->active = batch->next; + return 1; + } + + if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) + return 0; + + batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); + if (!batch) + return 0; + + tlb->batch_count++; + batch->next = NULL; + batch->nr = 0; + batch->max = MAX_GATHER_BATCH; + + tlb->active->next = batch; + tlb->active = batch; + + return 1; +} + +/* tlb_gather_mmu + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. There is no @fullmm argument: fullmm is set when + * @start is 0 and @end is ~0, i.e. the full address space (exit/execve). + */ +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) +{ + tlb->mm = mm; + + /* Is it from 0 to ~0? */ + tlb->fullmm = !(start | (end+1)); + tlb->need_flush_all = 0; + tlb->local.next = NULL; + tlb->local.nr = 0; + tlb->local.max = ARRAY_SIZE(tlb->__pages); + tlb->active = &tlb->local; + tlb->batch_count = 0; + +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + tlb->batch = NULL; +#endif + + __tlb_reset_range(tlb); +} + +static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) +{ + if (!tlb->end) + return; + + tlb_flush(tlb); + mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + tlb_table_flush(tlb); +#endif + __tlb_reset_range(tlb); +} + +static void tlb_flush_mmu_free(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + + for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { + free_pages_and_swap_cache(batch->pages, batch->nr); + batch->nr = 0; + } + tlb->active = &tlb->local; +} + +void tlb_flush_mmu(struct mmu_gather *tlb) +{ + tlb_flush_mmu_tlbonly(tlb); + tlb_flush_mmu_free(tlb); +} + +/* tlb_finish_mmu + * Called at the end of 
the shootdown operation to free up any resources + * that were required. + */ +void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + struct mmu_gather_batch *batch, *next; + + tlb_flush_mmu(tlb); + + /* keep the page table cache within bounds */ + check_pgt_cache(); + + for (batch = tlb->local.next; batch; batch = next) { + next = batch->next; + free_pages((unsigned long)batch, 0); + } + tlb->local.next = NULL; +} + +/* __tlb_remove_page + * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while + * handling the additional races in SMP caused by other CPUs caching valid + * mappings in their TLBs. Returns the number of free page slots left. + * When out of page slots we must call tlb_flush_mmu(). + */ +int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + struct mmu_gather_batch *batch; + + VM_BUG_ON(!tlb->end); + + batch = tlb->active; + batch->pages[batch->nr++] = page; + if (batch->nr == batch->max) { + if (!tlb_next_batch(tlb)) + return 0; + batch = tlb->active; + } + VM_BUG_ON_PAGE(batch->nr > batch->max, page); + + return batch->max - batch->nr; +} + +#endif /* HAVE_GENERIC_MMU_GATHER */ + +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + +/* + * See the comment near struct mmu_table_batch. + */ + +static void tlb_remove_table_smp_sync(void *arg) +{ + /* Simply deliver the interrupt */ +} + +static void tlb_remove_table_one(void *table) +{ + /* + * This isn't an RCU grace period and hence the page-tables cannot be + * assumed to be actually RCU-freed. + * + * It is however sufficient for software page-table walkers that rely on + * IRQ disabling. See the comment near struct mmu_table_batch. 
+ */ + smp_call_function(tlb_remove_table_smp_sync, NULL, 1); + __tlb_remove_table(table); +} + +static void tlb_remove_table_rcu(struct rcu_head *head) +{ + struct mmu_table_batch *batch; + int i; + + batch = container_of(head, struct mmu_table_batch, rcu); + + for (i = 0; i < batch->nr; i++) + __tlb_remove_table(batch->tables[i]); + + free_page((unsigned long)batch); +} + +void tlb_table_flush(struct mmu_gather *tlb) +{ + struct mmu_table_batch **batch = &tlb->batch; + + if (*batch) { + call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); + *batch = NULL; + } +} + +void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct mmu_table_batch **batch = &tlb->batch; + + /* + * When there are fewer than two users of this mm there cannot be a + * concurrent page-table walk. + */ + if (atomic_read(&tlb->mm->mm_users) < 2) { + __tlb_remove_table(table); + return; + } + + if (*batch == NULL) { + *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); + if (*batch == NULL) { + tlb_remove_table_one(table); + return; + } + (*batch)->nr = 0; + } + (*batch)->tables[(*batch)->nr++] = table; + if ((*batch)->nr == MAX_TABLE_BATCH) + tlb_table_flush(tlb); +} + +#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. 
+ */ +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) +{ + pgtable_t token = pmd_pgtable(*pmd); + pmd_clear(pmd); + pte_free_tlb(tlb, token, addr); + atomic_long_dec(&tlb->mm->nr_ptes); +} + /* + * Free the pte table under every populated pmd in [addr, end), then free + * the pmd table itself unless its PUD-aligned span starts below floor or + * runs past ceiling (a ceiling of 0 stands for the top of the address + * space). + */ +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pmd_t *pmd; + unsigned long next; + unsigned long start; + + start = addr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + free_pte_range(tlb, pmd, addr); + } while (pmd++, addr = next, addr != end); + + start &= PUD_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PUD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd, start); + mm_dec_nr_pmds(tlb->mm); +} + /* + * Same scheme one level up: run free_pmd_range() over every populated pud + * in [addr, end), then free the pud table itself unless its PGDIR-aligned + * span starts below floor or runs past ceiling. + */ +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pud_t *pud; + unsigned long next; + unsigned long start; + + start = addr; + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + free_pmd_range(tlb, pud, addr, next, floor, ceiling); + } while (pud++, addr = next, addr != end); + + start &= PGDIR_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PGDIR_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pud = pud_offset(pgd, start); + pgd_clear(pgd); + pud_free_tlb(tlb, pud, start); +} + +/* + * This function frees user-level page tables of a process. + */ +void free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pgd_t *pgd; + unsigned long next; + + /* + * The next few lines have given us lots of grief...
+ * + * Why are we testing PMD* at this top level? Because often + * there will be no work to do at all, and we'd prefer not to + * go all the way down to the bottom just to discover that. + * + * Why all these "- 1"s? Because 0 represents both the bottom + * of the address space and the top of it (using -1 for the + * top wouldn't help much: the masks would do the wrong thing). + * The rule is that addr 0 and floor 0 refer to the bottom of + * the address space, but end 0 and ceiling 0 refer to the top. + * Comparisons need to use "end - 1" and "ceiling - 1" (though + * that end 0 case should be mythical). + * + * Wherever addr is brought up or ceiling brought down, we must + * be careful to reject "the opposite 0" before it confuses the + * subsequent tests. But what about where end is brought down + * by PMD_SIZE below? no, end can't go down to 0 there. + * + * Whereas we round start (addr) and ceiling down, by different + * masks at different levels, in order to test whether a table + * now has no other vmas using it, so can be freed, we don't + * bother to round floor or end up - the tests don't need that.
+ */ + + addr &= PMD_MASK; + if (addr < floor) { + addr += PMD_SIZE; + if (!addr) + return; + } + if (ceiling) { + ceiling &= PMD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + end -= PMD_SIZE; + if (addr > end - 1) + return; + + pgd = pgd_offset(tlb->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + free_pud_range(tlb, pgd, addr, next, floor, ceiling); + } while (pgd++, addr = next, addr != end); +} + /* + * Walk the vma list starting at @vma: unlink each vma from its anon_vma + * and file mappings, then free the page tables it covers. The ceiling + * handed down for each span is the start of the following vma, or + * @ceiling once the list is exhausted. + */ +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long floor, unsigned long ceiling) +{ + while (vma) { + struct vm_area_struct *next = vma->vm_next; + unsigned long addr = vma->vm_start; + + /* + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables + */ + unlink_anon_vmas(vma); + unlink_file_vma(vma); + + if (is_vm_hugetlb_page(vma)) { + hugetlb_free_pgd_range(tlb, addr, vma->vm_end, + floor, next? next->vm_start: ceiling); + } else { + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE + && !is_vm_hugetlb_page(next)) { + vma = next; + next = vma->vm_next; + unlink_anon_vmas(vma); + unlink_file_vma(vma); + } + free_pgd_range(tlb, addr, vma->vm_end, + floor, next? next->vm_start: ceiling); + } + vma = next; + } +} + /* + * Allocate a pte page for @address and install it in @pmd if the pmd is + * still empty once pmd_lock() is held; otherwise the new page is freed + * again. Returns 0, or -ENOMEM if pte_alloc_one() fails. + */ +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address) +{ + spinlock_t *ptl; + pgtable_t new = pte_alloc_one(mm, address); + int wait_split_huge_page; + if (!new) + return -ENOMEM; + + /* + * Ensure all pte setup (eg. pte page lock and page clearing) are + * visible before the pte is made visible to other CPUs by being + * put into page tables. + * + * The other side of the story is the pointer chasing in the page + * table walking code (when walking the page table without locking; + * ie. most of the time).
Fortunately, these data accesses consist + * of a chain of data-dependent loads, meaning most CPUs (alpha + * being the notable exception) will already guarantee loads are + * seen in-order. See the alpha page table accessors for the + * smp_read_barrier_depends() barriers in page table walking code. + */ + smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ + + ptl = pmd_lock(mm, pmd); + wait_split_huge_page = 0; + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ + atomic_long_inc(&mm->nr_ptes); + pmd_populate(mm, pmd, new); + new = NULL; + } else if (unlikely(pmd_trans_splitting(*pmd))) + wait_split_huge_page = 1; + spin_unlock(ptl); + if (new) + pte_free(mm, new); + if (wait_split_huge_page) + wait_split_huge_page(vma->anon_vma, pmd); + return 0; +} + /* + * Kernel counterpart of __pte_alloc(): populate @pmd in init_mm while + * holding init_mm.page_table_lock. Returns 0, or -ENOMEM if + * pte_alloc_one_kernel() fails. + */ +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) +{ + pte_t *new = pte_alloc_one_kernel(&init_mm, address); + if (!new) + return -ENOMEM; + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ + pmd_populate_kernel(&init_mm, pmd, new); + new = NULL; + } else + VM_BUG_ON(pmd_trans_splitting(*pmd)); + spin_unlock(&init_mm.page_table_lock); + if (new) + pte_free_kernel(&init_mm, new); + return 0; +} + /* Zero all NR_MM_COUNTERS entries of @rss. */ +static inline void init_rss_vec(int *rss) +{ + memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); +} + /* Add every non-zero entry of @rss to the matching counter of @mm. */ +static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) +{ + int i; + + if (current->mm == mm) + sync_mm_rss(mm); + for (i = 0; i < NR_MM_COUNTERS; i++) + if (rss[i]) + add_mm_counter(mm, i, rss[i]); +} + +/* + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. + * + * The calling function must still handle the error.
+ */ +static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, struct page *page) +{ + pgd_t *pgd = pgd_offset(vma->vm_mm, addr); + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); /* looked up only so its value can be printed below */ + struct address_space *mapping; + pgoff_t index; + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. + */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + return; + } + if (nr_unshown) { + printk(KERN_ALERT + "BUG: Bad page map: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; /* first report of a burst opens the 60 second window */ + + mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; + index = linear_page_index(vma, addr); + + printk(KERN_ALERT + "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", + current->comm, + (long long)pte_val(pte), (long long)pmd_val(*pmd)); + if (page) + dump_page(page, "bad pte"); + printk(KERN_ALERT + "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + /* + * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y + */ + pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n", + vma->vm_file, + vma->vm_ops ? vma->vm_ops->fault : NULL, + vma->vm_file ? vma->vm_file->f_op->mmap : NULL, + mapping ? mapping->a_ops->readpage : NULL); + dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); +} + +/* + * vm_normal_page -- This function gets the "struct page" associated with a pte. + * + * "Special" mappings do not wish to be associated with a "struct page" (either + * it doesn't exist, or it exists but they don't want to touch it). In this + * case, NULL is returned here. "Normal" mappings do have a struct page. + * + * There are 2 broad cases.
Firstly, an architecture may define a pte_special() + * pte bit, in which case this function is trivial. Secondly, an architecture + * may not have a spare pte bit, which requires a more complicated scheme, + * described below. + * + * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a + * special mapping (even if there are underlying and valid "struct pages"). + * COWed pages of a VM_PFNMAP are always normal. + * + * The way we recognize COWed pages within VM_PFNMAP mappings is through the + * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit + * set, and the vm_pgoff will point to the first PFN mapped: thus every special + * mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * And for normal mappings this is false. + * + * This restricts such mappings to be a linear translation from virtual address + * to pfn. To get around this restriction, we allow arbitrary mappings so long + * as the vma is not a COW mapping; in that case, we know that all ptes are + * special (because none can have been COWed). + * + * + * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. + * + * VM_MIXEDMAP mappings can likewise contain memory with or without "struct + * page" backing, however the difference is that _all_ pages with a struct + * page (that is, those where pfn_valid is true) are refcounted and considered + * normal pages by the VM. The disadvantage is that pages are refcounted + * (which can be slower and simply not an option for some PFNMAP users). The + * advantage is that we don't have to follow the strict linearity rule of + * PFNMAP mappings in order to support COWable mappings. 
+ * + */ +#ifdef __HAVE_ARCH_PTE_SPECIAL +# define HAVE_PTE_SPECIAL 1 +#else +# define HAVE_PTE_SPECIAL 0 +#endif +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + /* + * With a pte_special() bit, an ordinary pte only needs the pfn range + * check at check_pfn; a special pte yields a page solely through + * ->find_special_page(). + */ + if (HAVE_PTE_SPECIAL) { + if (likely(!pte_special(pte))) + goto check_pfn; + if (vma->vm_ops && vma->vm_ops->find_special_page) + return vma->vm_ops->find_special_page(vma, addr); + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + return NULL; + if (!is_zero_pfn(pfn)) + print_bad_pte(vma, addr, pte, NULL); + return NULL; + } + + /* !HAVE_PTE_SPECIAL case follows: */ + + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn)) + return NULL; +check_pfn: + if (unlikely(pfn > highest_memmap_pfn)) { + print_bad_pte(vma, addr, pte, NULL); + return NULL; + } + + /* + * NOTE! We still have PageReserved() pages in the page tables. + * eg. VDSO mappings can cause them to exist. + */ +out: + return pfn_to_page(pfn); +} + +/* + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. + * + * copy_one_pte() returns 0 on success, or the value of the swap entry + * when swap_duplicate() fails, so that the caller can add a swap count + * continuation and retry. + */ + +static inline unsigned long +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, + unsigned long addr, int *rss) +{ + unsigned long vm_flags = vma->vm_flags; + pte_t pte = *src_pte; + struct page *page; + + /* pte contains position in swap or file, so copy.
*/ + if (unlikely(!pte_present(pte))) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return entry.val; + + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + rss[MM_SWAPENTS]++; + } else if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; + + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); + set_pte_at(src_mm, addr, src_pte, pte); + } + } + goto out_set_pte; + } + + /* + * If it's a COW mapping, write protect it both + * in the parent and the child + */ + if (is_cow_mapping(vm_flags)) { + ptep_set_wrprotect(src_mm, addr, src_pte); + pte = pte_wrprotect(pte); + } + + /* + * If it's a shared mapping, mark it clean in + * the child + */ + if (vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + page = vm_normal_page(vma, addr, pte); + if (page) { + get_page(page); + page_dup_rmap(page); + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; + } + +out_set_pte: + set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; +} + /* + * Copy the ptes covering [addr, end) under one pmd from src_mm to dst_mm, + * dropping both pte locks periodically. Returns 0, or -ENOMEM if the + * destination pte table or a swap count continuation cannot be allocated. + */ +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pte_t *orig_src_pte, *orig_dst_pte; + pte_t *src_pte, *dst_pte; + spinlock_t *src_ptl, *dst_ptl; + int progress = 0; + int rss[NR_MM_COUNTERS]; + swp_entry_t entry = (swp_entry_t){0}; + +again: + init_rss_vec(rss); + +
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); + if (!dst_pte) + return -ENOMEM; + src_pte = pte_offset_map(src_pmd, addr); + src_ptl = pte_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + orig_src_pte = src_pte; + orig_dst_pte = dst_pte; + arch_enter_lazy_mmu_mode(); + + do { + /* + * We are holding two locks at this point - either of them + * could generate latencies in another task on another CPU. + */ + if (progress >= 32) { + progress = 0; + if (need_resched() || + spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) + break; + } + if (pte_none(*src_pte)) { + progress++; + continue; + } + entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, + vma, addr, rss); + if (entry.val) + break; + progress += 8; + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + + arch_leave_lazy_mmu_mode(); + spin_unlock(src_ptl); + pte_unmap(orig_src_pte); + add_mm_rss_vec(dst_mm, rss); + pte_unmap_unlock(orig_dst_pte, dst_ptl); + cond_resched(); + /* + * A non-zero entry means copy_one_pte() stopped on a swap entry it + * could not duplicate: add a count continuation now that both locks + * are dropped, then resume at the same address. + */ + if (entry.val) { + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) + return -ENOMEM; + progress = 0; + } + if (addr != end) + goto again; + return 0; +} + /* + * Copy the pmd entries covering [addr, end): huge pmds go through + * copy_huge_pmd(), everything else through copy_pte_range(). + * Returns 0 or -ENOMEM. + */ +static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pmd_t *src_pmd, *dst_pmd; + unsigned long next; + + dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); + if (!dst_pmd) + return -ENOMEM; + src_pmd = pmd_offset(src_pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*src_pmd)) { + int err; + VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); + err = copy_huge_pmd(dst_mm, src_mm, + dst_pmd, src_pmd, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } + if (pmd_none_or_clear_bad(src_pmd)) + continue; + if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, + vma, addr, next)) + return -ENOMEM; + } while (dst_pmd++,
src_pmd++, addr = next, addr != end); + return 0; +} + /* + * Copy the pud entries covering [addr, end) by running copy_pmd_range() + * on every populated source pud. Returns 0 or -ENOMEM. + */ +static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pud_t *src_pud, *dst_pud; + unsigned long next; + + dst_pud = pud_alloc(dst_mm, dst_pgd, addr); + if (!dst_pud) + return -ENOMEM; + src_pud = pud_offset(src_pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(src_pud)) + continue; + if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, + vma, addr, next)) + return -ENOMEM; + } while (dst_pud++, src_pud++, addr = next, addr != end); + return 0; +} + /* + * Copy the page table entries of @vma from @src_mm to @dst_mm. + * Returns 0 on success, -ENOMEM if a lower level fails, the error from + * track_pfn_copy(), or whatever copy_hugetlb_page_range() returns for a + * hugetlb vma. + */ +int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + struct vm_area_struct *vma) +{ + pgd_t *src_pgd, *dst_pgd; + unsigned long next; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + bool is_cow; + int ret; + + /* + * Don't copy ptes where a page fault will fill them correctly. + * Fork becomes much lighter when there are big shared or private + * readonly mappings. The tradeoff is that copy_page_range is more + * efficient than faulting. + */ + if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && + !vma->anon_vma) + return 0; + + if (is_vm_hugetlb_page(vma)) + return copy_hugetlb_page_range(dst_mm, src_mm, vma); + + if (unlikely(vma->vm_flags & VM_PFNMAP)) { + /* + * We do not free on error cases below as remove_vma + * gets called on error from higher level routine + */ + ret = track_pfn_copy(vma); + if (ret) + return ret; + } + + /* + * We need to invalidate the secondary MMU mappings only when + * there could be a permission downgrade on the ptes of the + * parent mm. And a permission downgrade will only happen if + * is_cow_mapping() returns true.
+ */ + is_cow = is_cow_mapping(vma->vm_flags); + mmun_start = addr; + mmun_end = end; + if (is_cow) + mmu_notifier_invalidate_range_start(src_mm, mmun_start, + mmun_end); + + ret = 0; + dst_pgd = pgd_offset(dst_mm, addr); + src_pgd = pgd_offset(src_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(src_pgd)) + continue; + if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next))) { + ret = -ENOMEM; + break; + } + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + + if (is_cow) + mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); + return ret; +} + /* + * Clear the ptes in [addr, end) under @pmd, batching the freed pages in + * @tlb. With a non-NULL @details, present pages belonging to a mapping + * other than details->check_mapping are skipped and non-present entries + * are left alone. Returns the address reached. + */ +static unsigned long zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + struct mm_struct *mm = tlb->mm; + int force_flush = 0; + int rss[NR_MM_COUNTERS]; + spinlock_t *ptl; + pte_t *start_pte; + pte_t *pte; + swp_entry_t entry; + +again: + init_rss_vec(rss); + start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = start_pte; + arch_enter_lazy_mmu_mode(); + do { + pte_t ptent = *pte; + if (pte_none(ptent)) { + continue; + } + + if (pte_present(ptent)) { + struct page *page; + + page = vm_normal_page(vma, addr, ptent); + if (unlikely(details) && page) { + /* + * unmap_shared_mapping_pages() wants to + * invalidate cache without truncating: + * unmap shared but keep private pages.
+ */ + if (details->check_mapping && + details->check_mapping != page->mapping) + continue; + } + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); + if (unlikely(!page)) + continue; + if (PageAnon(page)) + rss[MM_ANONPAGES]--; + else { + if (pte_dirty(ptent)) { + force_flush = 1; + set_page_dirty(page); + } + if (pte_young(ptent) && + likely(!(vma->vm_flags & VM_SEQ_READ))) + mark_page_accessed(page); + rss[MM_FILEPAGES]--; + } + page_remove_rmap(page); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); + if (unlikely(!__tlb_remove_page(tlb, page))) { + force_flush = 1; + addr += PAGE_SIZE; /* this pte is already cleared: resume after it */ + break; + } + continue; + } + /* If details->check_mapping, we leave swap entries. */ + if (unlikely(details)) + continue; + + entry = pte_to_swp_entry(ptent); + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + else if (is_migration_entry(entry)) { + struct page *page; + + page = migration_entry_to_page(entry); + + if (PageAnon(page)) + rss[MM_ANONPAGES]--; + else + rss[MM_FILEPAGES]--; + } + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + } while (pte++, addr += PAGE_SIZE, addr != end); + + add_mm_rss_vec(mm, rss); + arch_leave_lazy_mmu_mode(); + + /* Do the actual TLB flush before dropping ptl */ + if (force_flush) + tlb_flush_mmu_tlbonly(tlb); + pte_unmap_unlock(start_pte, ptl); + + /* + * If we forced a TLB flush (either due to running out of + * batch buffers or because we needed to flush dirty TLB + * entries before releasing the ptl), free the batched + * memory too. Restart if we didn't do everything.
+ */ + if (force_flush) { + force_flush = 0; + tlb_flush_mmu_free(tlb); + + if (addr != end) + goto again; + } + + return addr; +} + /* + * Zap the pmd entries covering [addr, end): a huge pmd is zapped whole + * when the range spans it exactly and split otherwise; everything else + * goes through zap_pte_range(). Returns the address reached. + */ +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) { +#ifdef CONFIG_DEBUG_VM + if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { + pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", + __func__, addr, end, + vma->vm_start, + vma->vm_end); + BUG(); + } +#endif + split_huge_page_pmd(vma, addr, pmd); + } else if (zap_huge_pmd(tlb, vma, pmd, addr)) + goto next; + /* fall through */ + } + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is + * none or trans huge it can change under us. This is + * because MADV_DONTNEED holds the mmap_sem in read + * mode.
+ */ + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + goto next; + next = zap_pte_range(tlb, vma, pmd, addr, next, details); +next: + cond_resched(); + } while (pmd++, addr = next, addr != end); + + return addr; +} + /* + * Run zap_pmd_range() over every populated pud entry in [addr, end). + * Returns the address reached. + */ +static inline unsigned long zap_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + next = zap_pmd_range(tlb, vma, pud, addr, next, details); + } while (pud++, addr = next, addr != end); + + return addr; +} + +static void unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pgd_t *pgd; + unsigned long next; + /* A details without check_mapping filters nothing: treat it as NULL. */ + if (details && !details->check_mapping) + details = NULL; + + BUG_ON(addr >= end); + tlb_start_vma(tlb, vma); + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + next = zap_pud_range(tlb, vma, pgd, addr, next, details); + } while (pgd++, addr = next, addr != end); + tlb_end_vma(tlb, vma); +} + + /* + * Unmap the part of [start_addr, end_addr) that overlaps @vma. Hugetlb + * vmas go through __unmap_hugepage_range_final(), everything else + * through unmap_page_range(). + */ +static void unmap_single_vma(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, + struct zap_details *details) +{ + unsigned long start = max(vma->vm_start, start_addr); + unsigned long end; + + if (start >= vma->vm_end) + return; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return; + + if (vma->vm_file) + uprobe_munmap(vma, start, end); + + if (unlikely(vma->vm_flags & VM_PFNMAP)) + untrack_pfn(vma, 0, 0); + + if (start != end) { + if (unlikely(is_vm_hugetlb_page(vma))) { + /* + * It is undesirable to test vma->vm_file as it + * should be non-null for valid hugetlb area. + * However, vm_file will be NULL in the error + * cleanup path of mmap_region.
When + * hugetlbfs ->mmap method fails, + * mmap_region() nullifies vma->vm_file + * before calling this function to clean up. + * Since no pte has actually been setup, it is + * safe to do nothing in this case. + */ + if (vma->vm_file) { + i_mmap_lock_write(vma->vm_file->f_mapping); + __unmap_hugepage_range_final(tlb, vma, start, end, NULL); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } + } else + unmap_page_range(tlb, vma, start, end, details); + } +} + +/** + * unmap_vmas - unmap a range of memory covered by a list of vma's + * @tlb: address of the caller's struct mmu_gather + * @vma: the starting vma + * @start_addr: virtual address at which to start unmapping + * @end_addr: virtual address at which to end unmapping + * + * Unmap all pages in the vma list. + * + * Only addresses between `start' and `end' will be unmapped. + * + * The VMA list must be sorted in ascending virtual address order. + * + * unmap_vmas() assumes that the caller will flush the whole unmapped address + * range after unmap_vmas() returns. So the only responsibility here is to + * ensure that any thus-far unmapped pages are flushed before unmap_vmas() + * drops the lock and schedules.
+ */ +void unmap_vmas(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr) +{ + struct mm_struct *mm = vma->vm_mm; + /* @vma is dereferenced above, so it must be non-NULL on entry. */ + mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); + for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) + unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); + mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); +} + +/** + * zap_page_range - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @start: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of shared cache invalidation + * + * Walks @vma and its successors on the vm_next list for as long as they + * start below @start + @size; unmap_single_vma() clamps the range to each + * vma. + * + * Caller must protect the VMA list. + */ +void zap_page_range(struct vm_area_struct *vma, unsigned long start, + unsigned long size, struct zap_details *details) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + unsigned long end = start + size; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start, end); + update_hiwater_rss(mm); + mmu_notifier_invalidate_range_start(mm, start, end); + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) + unmap_single_vma(&tlb, vma, start, end, details); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); +} + +/** + * zap_page_range_single - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of shared cache invalidation + * + * The range must fit into one VMA.
+ */ +static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + unsigned long end = address + size; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, address, end); + update_hiwater_rss(mm); + mmu_notifier_invalidate_range_start(mm, address, end); + unmap_single_vma(&tlb, vma, address, end, details); + mmu_notifier_invalidate_range_end(mm, address, end); + tlb_finish_mmu(&tlb, address, end); +} + +/** + * zap_vma_ptes - remove ptes mapping the vma + * @vma: vm_area_struct holding ptes to be zapped + * @address: starting address of pages to zap + * @size: number of bytes to zap + * + * This function only unmaps ptes assigned to VM_PFNMAP vmas. + * + * The entire address range must be fully contained within the vma. + * + * Returns 0 if successful, or -1 (not an errno value) if the range is not + * fully inside @vma or @vma is not a VM_PFNMAP mapping. + */ +int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, + unsigned long size) +{ + if (address < vma->vm_start || address + size > vma->vm_end || + !(vma->vm_flags & VM_PFNMAP)) + return -1; + zap_page_range_single(vma, address, size, NULL); + return 0; +} +EXPORT_SYMBOL_GPL(zap_vma_ptes); + /* + * Walk to the pte for @addr in @mm, allocating the pud and pmd levels as + * needed, and return the result of pte_alloc_map_lock() with the lock + * stored in *@ptl. Returns NULL if the pud or pmd allocation fails. + */ +pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) +{ + pgd_t * pgd = pgd_offset(mm, addr); + pud_t * pud = pud_alloc(mm, pgd, addr); + if (pud) { + pmd_t * pmd = pmd_alloc(mm, pud, addr); + if (pmd) { + VM_BUG_ON(pmd_trans_huge(*pmd)); + return pte_alloc_map_lock(mm, pmd, addr, ptl); + } + } + return NULL; +} + +/* + * This is the old fallback for page remapping. + * + * For historical reasons, it only allows reserved pages. Only + * old drivers should use this, and they needed to mark their + * pages reserved for the old functions anyway.
+ */ +static int insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + pte_t *ptep; + int err; + + /* Anonymous pages are refused outright. */ + if (PageAnon(page)) + return -EINVAL; + + flush_dcache_page(page); + ptep = get_locked_pte(mm, addr, &ptl); + if (!ptep) + return -ENOMEM; + + if (pte_none(*ptep)) { + /* Slot is empty: take a reference, account it and map the page. */ + get_page(page); + inc_mm_counter_fast(mm, MM_FILEPAGES); + page_add_file_rmap(page); + set_pte_at(mm, addr, ptep, mk_pte(page, prot)); + err = 0; + } else { + /* Something is already mapped at addr. */ + err = -EBUSY; + } + + /* Single unlock path for both outcomes. */ + pte_unmap_unlock(ptep, ptl); + return err; +} + +/** + * vm_insert_page - insert single page into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @page: source kernel page + * + * This allows drivers to insert individual pages they've allocated + * into a user vma. + * + * The page has to be a nice clean _individual_ kernel allocation. + * If you allocate a compound page, you need to have marked it as + * such (__GFP_COMP), or manually just split the page up yourself + * (see split_page()). + * + * NOTE! Traditionally this was done with "remap_pfn_range()" which + * took an arbitrary page protection parameter. This doesn't allow + * that. Your vma protection will have to be set up correctly, which + * means that if you want a shared writable mapping, you'd better + * ask for a shared writable mapping! + * + * The page does not need to be reserved. + * + * Usually this function is called from f_op->mmap() handler + * under mm->mmap_sem write-lock, so it can change vma->vm_flags. + * Caller must set VM_MIXEDMAP on vma if it wants to call this + * function from other places, for example from page-fault handler.
+ */ +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) +{ + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (!page_count(page)) + return -EINVAL; + if (!(vma->vm_flags & VM_MIXEDMAP)) { /* about to change vm_flags: BUG if mmap_sem can still be taken for read */ + BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vma->vm_flags |= VM_MIXEDMAP; + } + return insert_page(vma, addr, page, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_page); + /* + * Install a special pte for @pfn at @addr if that slot is still empty. + * Returns 0, -ENOMEM if the pte cannot be obtained, or -EBUSY if a pte is + * already present. + */ +static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + int retval; + pte_t *pte, entry; + spinlock_t *ptl; + + retval = -ENOMEM; + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = -EBUSY; + if (!pte_none(*pte)) + goto out_unlock; + + /* Ok, finally just insert the thing.. */ + entry = pte_mkspecial(pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, entry); + update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ + + retval = 0; +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +/** + * vm_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_insert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return NULL. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * Returns 0 on success, -EFAULT if @addr lies outside @vma, -EINVAL if + * track_pfn_insert() fails, or the error returned by insert_pfn(). + */ +int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + int ret; + pgprot_t pgprot = vma->vm_page_prot; + /* + * Technically, architectures with pte_special can avoid all these + * restrictions (same for remap_pfn_range).
However we would like + * consistency in testing and feature parity among all, so we should + * try to keep these invariants in place for everybody. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (track_pfn_insert(vma, &pgprot, pfn)) + return -EINVAL; + + ret = insert_pfn(vma, addr, pfn, pgprot); + + return ret; +} +EXPORT_SYMBOL(vm_insert_pfn); + /* + * Insert @pfn at @addr in a VM_MIXEDMAP vma, going through insert_page() + * or insert_pfn() as explained in the function body. Returns -EFAULT if + * @addr lies outside @vma, otherwise the result of the chosen helper. + */ +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + + /* + * If we don't have pte special, then we have to use the pfn_valid() + * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* + * refcount the page if pfn_valid is true (hence insert_page rather + * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP + * without pte special, it would there be refcounted as a normal page. + */ + if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { + struct page *page; + + page = pfn_to_page(pfn); + return insert_page(vma, addr, page, vma->vm_page_prot); + } + return insert_pfn(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_mixed); + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed.
any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pte_t *pte; + spinlock_t *ptl; + + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + arch_enter_lazy_mmu_mode(); + do { + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); /* pte now points one past the last entry written */ + return 0; +} + /* + * Map [addr, end) under one pud. @pfn is first rebased by the starting + * address so that pfn + (addr >> PAGE_SHIFT) gives the pfn for any addr + * in the range. Returns 0 or -ENOMEM. + */ +static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pmd_t *pmd; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + VM_BUG_ON(pmd_trans_huge(*pmd)); + do { + next = pmd_addr_end(addr, end); + if (remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + /* + * Map [addr, end) under one pgd entry, using the same pfn rebasing as + * remap_pmd_range(). Returns 0 or -ENOMEM. + */ +static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pud_t *pud; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + if (remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called.
+ */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + PAGE_ALIGN(size); + struct mm_struct *mm = vma->vm_mm; + int err; + + /* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. + * + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. + */ + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; + vma->vm_pgoff = pfn; + } + + err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); + if (err) + return -EINVAL; + + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + + BUG_ON(addr >= end); + pfn -= addr >> PAGE_SHIFT; + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + do { + next = pgd_addr_end(addr, end); + err = remap_pud_range(mm, pgd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + if (err) + untrack_pfn(vma, pfn, PAGE_ALIGN(size)); + + return err; +} +EXPORT_SYMBOL(remap_pfn_range); + +/** + * vm_iomap_memory - remap memory to userspace + * @vma: user vma to map to + * @start: start of area + * @len: size of area + * + * This is a simplified io_remap_pfn_range() for common driver use. 
The + * driver just needs to give us the physical memory range to be mapped, + * we'll figure out the rest from the vma information. + * + * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get + * whatever write-combining details or similar. + */ +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long vm_len, pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start + len < start) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. + */ + len += start & ~PAGE_MASK; + pfn = start >> PAGE_SHIFT; + pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vma->vm_pgoff > pages) + return -EINVAL; + pfn += vma->vm_pgoff; + pages -= vma->vm_pgoff; + + /* Can we fit all of the mapping? */ + vm_len = vma->vm_end - vma->vm_start; + if (vm_len >> PAGE_SHIFT > pages) + return -EINVAL; + + /* Ok, let it rip */ + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pte_t *pte; + int err; + pgtable_t token; + spinlock_t *uninitialized_var(ptl); + + pte = (mm == &init_mm) ? 
+ pte_alloc_kernel(pmd, addr) : + pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + + BUG_ON(pmd_huge(*pmd)); + + arch_enter_lazy_mmu_mode(); + + token = pmd_pgtable(*pmd); + + do { + err = fn(pte++, token, addr, data); + if (err) + break; + } while (addr += PAGE_SIZE, addr != end); + + arch_leave_lazy_mmu_mode(); + + if (mm != &init_mm) + pte_unmap_unlock(pte-1, ptl); + return err; +} + +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pmd_t *pmd; + unsigned long next; + int err; + + BUG_ON(pud_huge(*pud)); + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (err) + break; + } while (pmd++, addr = next, addr != end); + return err; +} + +static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data) +{ + pud_t *pud; + unsigned long next; + int err; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (err) + break; + } while (pud++, addr = next, addr != end); + return err; +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. 
+ */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + err = apply_to_pud_range(mm, pgd, addr, next, fn, data); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} +EXPORT_SYMBOL_GPL(apply_to_page_range); + +/* + * handle_pte_fault chooses page fault handler according to an entry which was + * read non-atomically. Before making any commitment, on those architectures + * or configurations (e.g. i386 with PAE) which might give a mix of unmatched + * parts, do_swap_page must check under lock before unmapping the pte and + * proceeding (but do_wp_page is only called after already making such a check; + * and do_anonymous_page can safely check later on). + */ +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, + pte_t *page_table, pte_t orig_pte) +{ + int same = 1; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) + if (sizeof(pte_t) > sizeof(unsigned long)) { + spinlock_t *ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + same = pte_same(*page_table, orig_pte); + spin_unlock(ptl); + } +#endif + pte_unmap(page_table); + return same; +} + +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) +{ + debug_dma_assert_idle(src); + + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst); + void __user *uaddr = (void __user *)(va & PAGE_MASK); + + /* + * This really shouldn't fail, because the page is there + * in the page tables. 
But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) + clear_page(kaddr); + kunmap_atomic(kaddr); + flush_dcache_page(dst); + } else + copy_user_highpage(dst, src, va, vma); +} + +/* + * Notify the address space that the page is about to become writable so that + * it can prohibit this or wait for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can sleep if it needs to. + */ +static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, + unsigned long address) +{ + struct vm_fault vmf; + int ret; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = page; + vmf.cow_page = NULL; + + ret = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; + if (unlikely(!(ret & VM_FAULT_LOCKED))) { + lock_page(page); + if (!page->mapping) { + unlock_page(page); + return 0; /* retry */ + } + ret |= VM_FAULT_LOCKED; + } else + VM_BUG_ON_PAGE(!PageLocked(page), page); + return ret; +} + +/* + * Handle write page faults for pages that can be reused in the current vma + * + * This can happen either due to the mapping being with the VM_SHARED flag, + * or due to us being the last reference standing to the page. In either + * case, all we need to do here is to mark the page as writable and update + * any related book-keeping. + */ +static inline int wp_page_reuse(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, + struct page *page, int page_mkwrite, + int dirty_shared) + __releases(ptl) +{ + pte_t entry; + /* + * Clear the pages cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. 
+ */ + if (page) + page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); + + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (ptep_set_access_flags(vma, address, page_table, entry, 1)) + update_mmu_cache(vma, address, page_table); + pte_unmap_unlock(page_table, ptl); + + if (dirty_shared) { + struct address_space *mapping; + int dirtied; + + if (!page_mkwrite) + lock_page(page); + + dirtied = set_page_dirty(page); + VM_BUG_ON_PAGE(PageAnon(page), page); + mapping = page->mapping; + unlock_page(page); + page_cache_release(page); + + if ((dirtied || page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + if (!page_mkwrite) + file_update_time(vma->vm_file); + } + + return VM_FAULT_WRITE; +} + +/* + * Handle the case of a page which we actually need to copy to a new page. + * + * Called with mmap_sem locked and the old page referenced, but + * without the ptl held. + * + * High level logic flow: + * + * - Allocate a page, copy the content of the old page to the new one. + * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. + * - Take the PTL. If the pte changed, bail out and release the allocated page + * - If the pte is still the way we remember it, update the page table and all + * relevant references. This includes dropping the reference the page-table + * held to the old page, as well as updating the rmap. + * - In any case, unlock the PTL and drop the reference we took to the old page. 
+ */ +static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + pte_t orig_pte, struct page *old_page) +{ + struct page *new_page = NULL; + spinlock_t *ptl = NULL; + pte_t entry; + int page_copied = 0; + const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ + const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ + struct mem_cgroup *memcg; + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + + /* A zero-page source needs no copy: a freshly zeroed page will do */ + if (is_zero_pfn(pte_pfn(orig_pte))) { + new_page = alloc_zeroed_user_highpage_movable(vma, address); + if (!new_page) + goto oom; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + goto oom; + cow_user_page(new_page, old_page, address, vma); + } + __SetPageUptodate(new_page); + + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) + goto oom_free_new; + + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + + /* + * Re-check the pte - we dropped the lock + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) { + if (old_page) { + if (!PageAnon(old_page)) { + dec_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); + } + } else { + inc_mm_counter_fast(mm, MM_ANONPAGES); + } + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + /* + * Clear the pte entry and flush it first, before updating the + * pte with the new entry. This will avoid a race condition + * seen in the presence of one thread doing SMC and another + * thread doing COW.
+ */ + ptep_clear_flush_notify(vma, address, page_table); + page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); + /* + * We call the notify macro here because, when using secondary + * mmu page tables (such as kvm shadow page tables), we want the + * new page to be mapped directly into the secondary page table. + */ + set_pte_at_notify(mm, address, page_table, entry); + update_mmu_cache(vma, address, page_table); + if (old_page) { + /* + * Only after switching the pte to the new page may + * we remove the mapcount here. Otherwise another + * process may come and find the rmap count decremented + * before the pte is switched to the new page, and + * "reuse" the old page writing into it while our pte + * here still points into it and can be read by other + * threads. + * + * The critical issue is to order this + * page_remove_rmap with the ptep_clear_flush above. + * Those stores are ordered by (if nothing else,) + * the barrier present in the atomic_add_negative + * in page_remove_rmap. + * + * Then the TLB flush in ptep_clear_flush ensures that + * no process can access the old page before the + * decremented mapcount is visible. And the old page + * cannot be reused until after the decremented + * mapcount is visible. So transitively, TLBs to + * old page will be flushed before it can be reused. + */ + page_remove_rmap(old_page); + } + + /* Free the old page.. */ + new_page = old_page; + page_copied = 1; + } else { + mem_cgroup_cancel_charge(new_page, memcg); + } + + /* + * new_page is the old page if the copy was installed (dropping the + * reference the page table held), else the unused new page. + */ + if (new_page) + page_cache_release(new_page); + + pte_unmap_unlock(page_table, ptl); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + if (old_page) { + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page.
+ */ + if (page_copied && (vma->vm_flags & VM_LOCKED)) { + lock_page(old_page); /* LRU manipulation */ + munlock_vma_page(old_page); + unlock_page(old_page); + } + page_cache_release(old_page); + } + /* VM_FAULT_WRITE only when the new page was installed; 0 when the pte had changed */ + return page_copied ? VM_FAULT_WRITE : 0; +oom_free_new: + page_cache_release(new_page); +oom: + if (old_page) + page_cache_release(old_page); + return VM_FAULT_OOM; +} + +/* + * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED + * mapping + * + * If the vma provides ->pfn_mkwrite it is called with the pte lock dropped; + * the pte is then re-checked under the lock and 0 is returned if it changed. + * Otherwise the pte is simply made writable through wp_page_reuse(). + */ +static int wp_pfn_shared(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, + pmd_t *pmd) +{ + if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { + struct vm_fault vmf = { + .page = NULL, + .pgoff = linear_page_index(vma, address), + .virtual_address = (void __user *)(address & PAGE_MASK), + .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, + }; + int ret; + + pte_unmap_unlock(page_table, ptl); + ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); + if (ret & VM_FAULT_ERROR) + return ret; + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + /* + * We might have raced with another page fault while we + * released the pte_offset_map_lock. + */ + if (!pte_same(*page_table, orig_pte)) { + pte_unmap_unlock(page_table, ptl); + return 0; + } + } + return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, + NULL, 0, 0); +} + +/* + * Handle write page faults for a struct-page backed page in a shared + * writable mapping: take a reference on the page, give ->page_mkwrite (if + * the vma has one) a chance to veto or prepare the write with the pte lock + * dropped, then make the pte writable through wp_page_reuse(). + */ +static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, + pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, + struct page *old_page) + __releases(ptl) +{ + int page_mkwrite = 0; + + page_cache_get(old_page); + + /* + * Only catch write-faults on shared writable pages, + * read-only shared pages can get COWed by + * get_user_pages(.write=1, .force=1).
+ */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + int tmp; + + pte_unmap_unlock(page_table, ptl); + tmp = do_page_mkwrite(vma, old_page, address); + if (unlikely(!tmp || (tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(old_page); + return tmp; + } + /* + * Since we dropped the lock we need to revalidate + * the PTE as someone else may have changed it. If + * they did, we just return, as we can count on the + * MMU to tell us if they didn't also make it writable. + */ + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + pte_unmap_unlock(page_table, ptl); + page_cache_release(old_page); + return 0; + } + page_mkwrite = 1; + } + + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, page_mkwrite, 1); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. + * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), with pte both mapped and locked. + * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + spinlock_t *ptl, pte_t orig_pte) + __releases(ptl) +{ + struct page *old_page; + + old_page = vm_normal_page(vma, address, orig_pte); + if (!old_page) { + /* + * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a + * VM_PFNMAP VMA. + * + * We should not cow pages in a shared writeable mapping. + * Just mark the pages writable and/or call ops->pfn_mkwrite. + */ + if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED)) + return wp_pfn_shared(mm, vma, address, page_table, ptl, + orig_pte, pmd); + + pte_unmap_unlock(page_table, ptl); + return wp_page_copy(mm, vma, address, page_table, pmd, + orig_pte, old_page); + } + + /* + * Take out anonymous pages first, anonymous shared vmas are + * not dirty accountable. + */ + if (PageAnon(old_page) && !PageKsm(old_page)) { + if (!trylock_page(old_page)) { + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + lock_page(old_page); + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + pte_unmap_unlock(page_table, ptl); + page_cache_release(old_page); + return 0; + } + page_cache_release(old_page); + } + if (reuse_swap_page(old_page)) { + /* + * The page is all ours. Move it to our anon_vma so + * the rmap code will not search our parent or siblings. + * Protected against the rmap code by the page lock. + */ + page_move_anon_rmap(old_page, vma, address); + unlock_page(old_page); + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, 0, 0); + } + unlock_page(old_page); + } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED))) { + return wp_page_shared(mm, vma, address, page_table, pmd, + ptl, orig_pte, old_page); + } + + /* + * Ok, we need to copy. Oh, well.. 
+ */ + page_cache_get(old_page); + + pte_unmap_unlock(page_table, ptl); + return wp_page_copy(mm, vma, address, page_table, pmd, + orig_pte, old_page); +} + +static void unmap_mapping_range_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr, + struct zap_details *details) +{ + zap_page_range_single(vma, start_addr, end_addr - start_addr, details); +} + +static inline void unmap_mapping_range_tree(struct rb_root *root, + struct zap_details *details) +{ + struct vm_area_struct *vma; + pgoff_t vba, vea, zba, zea; + + vma_interval_tree_foreach(vma, root, + details->first_index, details->last_index) { + + vba = vma->vm_pgoff; + vea = vba + vma_pages(vma) - 1; + /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ + zba = details->first_index; + if (zba < vba) + zba = vba; + zea = details->last_index; + if (zea > vea) + zea = vea; + + unmap_mapping_range_vma(vma, + ((zba - vba) << PAGE_SHIFT) + vma->vm_start, + ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, + details); + } +} + +/** + * unmap_mapping_range - unmap the portion of all mmaps in the specified + * address_space corresponding to the specified page range in the underlying + * file. + * + * @mapping: the address space containing mmaps to be unmapped. + * @holebegin: byte in first page to unmap, relative to the start of + * the underlying file. This will be rounded down to a PAGE_SIZE + * boundary. Note that this is different from truncate_pagecache(), which + * must keep the partial page. In contrast, we must get rid of + * partial pages. + * @holelen: size of prospective hole in bytes. This will be rounded + * up to a PAGE_SIZE boundary. A holelen of zero truncates to the + * end of the file. + * @even_cows: 1 when truncating a file, unmap even private COWed pages; + * but 0 when invalidating pagecache, don't throw away private data. 
+ */ +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) +{ + struct zap_details details; + pgoff_t hba = holebegin >> PAGE_SHIFT; + pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* Check for overflow. */ + if (sizeof(holelen) > sizeof(hlen)) { + long long holeend = + (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (holeend & ~(long long)ULONG_MAX) + hlen = ULONG_MAX - hba + 1; + } + + details.check_mapping = even_cows? NULL: mapping; + details.first_index = hba; + details.last_index = hba + hlen - 1; + if (details.last_index < details.first_index) + details.last_index = ULONG_MAX; + + + /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ + i_mmap_lock_write(mapping); + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + i_mmap_unlock_write(mapping); +} +EXPORT_SYMBOL(unmap_mapping_range); + +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with pte unmapped and unlocked. + * + * We return with the mmap_sem locked or unlocked in the same cases + * as does filemap_fault(). 
+ */ +static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + unsigned int flags, pte_t orig_pte) +{ + spinlock_t *ptl; + struct page *page, *swapcache; + struct mem_cgroup *memcg; + swp_entry_t entry; + pte_t pte; + int locked; + int exclusive = 0; + int ret = 0; + + /* The pte no longer matches what the fault handler read: nothing to do here */ + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) + goto out; + + entry = pte_to_swp_entry(orig_pte); + /* Migration, hwpoison and unrecognised entries never reach the swap-in path below */ + if (unlikely(non_swap_entry(entry))) { + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + } else if (is_hwpoison_entry(entry)) { + ret = VM_FAULT_HWPOISON; + } else { + print_bad_pte(vma, address, orig_pte, NULL); + ret = VM_FAULT_SIGBUS; + } + goto out; + } + delayacct_set_flag(DELAYACCT_PF_SWAPIN); + page = lookup_swap_cache(entry); + if (!page) { + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, address); + if (!page) { + /* + * Back out if somebody else faulted in this pte + * while we released the pte lock. + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) + ret = VM_FAULT_OOM; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + goto unlock; + } + + /* Had to read the page from swap area: Major fault */ + ret = VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(mm, PGMAJFAULT); + } else if (PageHWPoison(page)) { + /* + * hwpoisoned dirty swapcache pages are kept for killing + * owner processes (which may be unknown at hwpoison time) + */ + ret = VM_FAULT_HWPOISON; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + swapcache = page; + goto out_release; + } + + swapcache = page; + locked = lock_page_or_retry(page, mm, flags); + + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (!locked) { + ret |= VM_FAULT_RETRY; + goto out_release; + } + + /* + * Make sure try_to_free_swap or reuse_swap_page or swapoff did not + * release the swapcache from under us.
The page pin, and pte_same + * test below, are not enough to exclude that. Even if it is still + * swapcache, we need to check that the page's swap has not changed. + */ + if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) + goto out_page; + + /* From here on 'page' may be a new copy while 'swapcache' stays the swap cache page */ + page = ksm_might_need_to_copy(page, vma, address); + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + goto out_page; + } + + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { + ret = VM_FAULT_OOM; + goto out_page; + } + + /* + * Back out if somebody else already faulted in this pte. + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*page_table, orig_pte))) + goto out_nomap; + + if (unlikely(!PageUptodate(page))) { + ret = VM_FAULT_SIGBUS; + goto out_nomap; + } + + /* + * The page isn't present yet, go ahead with the fault. + * + * Be careful about the sequence of operations here. + * To get its accounting right, reuse_swap_page() must be called + * while the page is counted on swap but not yet in mapcount i.e. + * before page_add_anon_rmap() and swap_free(); try_to_free_swap() + * must be called after the swap_free(), or it will never succeed.
+ */ + + inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_SWAPENTS); + pte = mk_pte(page, vma->vm_page_prot); + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + pte = maybe_mkwrite(pte_mkdirty(pte), vma); + flags &= ~FAULT_FLAG_WRITE; + ret |= VM_FAULT_WRITE; + exclusive = 1; + } + flush_icache_page(vma, page); + if (pte_swp_soft_dirty(orig_pte)) + pte = pte_mksoft_dirty(pte); + set_pte_at(mm, address, page_table, pte); + if (page == swapcache) { + do_page_add_anon_rmap(page, vma, address, exclusive); + mem_cgroup_commit_charge(page, memcg, true); + } else { /* ksm created a completely new copy */ + page_add_new_anon_rmap(page, vma, address); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); + } + + swap_free(entry); + if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + try_to_free_swap(page); + unlock_page(page); + if (page != swapcache) { + /* + * Hold the lock to prevent the swap entry from being + * reused until we take the PT lock for the pte_same() + * check (to avoid false positives from pte_same). For + * further safety release the lock after the swap_free + * so that the swap count won't change under a + * parallel locked swapcache.
+ */ + unlock_page(swapcache); + page_cache_release(swapcache); + } + + /* A write fault that could not reuse the page exclusively still has to go through do_wp_page() */ + if (flags & FAULT_FLAG_WRITE) { + ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); + /* On failure report only the error bits */ + if (ret & VM_FAULT_ERROR) + ret &= VM_FAULT_ERROR; + goto out; + } + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, page_table); +unlock: + pte_unmap_unlock(page_table, ptl); +out: + return ret; + /* + * Error unwinding, in reverse order of acquisition: out_nomap cancels + * the memcg charge and drops the pte lock, out_page unlocks the page, + * out_release drops the page reference(s). + */ +out_nomap: + mem_cgroup_cancel_charge(page, memcg); + pte_unmap_unlock(page_table, ptl); +out_page: + unlock_page(page); +out_release: + page_cache_release(page); + if (page != swapcache) { + unlock_page(swapcache); + page_cache_release(swapcache); + } + return ret; +} + +/* + * This is like a special single-page "expand_{down|up}wards()", + * except we must first make sure that 'address{-|+}PAGE_SIZE' + * doesn't hit another vma. + * + * Returns 0 when @address is not on the growing edge of a stack vma or the + * abutting vma is part of the same stack, -ENOMEM when a different kind of + * mapping abuts the stack, otherwise the result of expand_downwards() or + * expand_upwards(). + */ +static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) +{ + address &= PAGE_MASK; + if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { + struct vm_area_struct *prev = vma->vm_prev; + + /* + * Is there a mapping abutting this one below? + * + * That's only ok if it's the same stack mapping + * that has gotten split.. + */ + if (prev && prev->vm_end == address) + return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; + + return expand_downwards(vma, address - PAGE_SIZE); + } + if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { + struct vm_area_struct *next = vma->vm_next; + + /* As VM_GROWSDOWN but s/below/above/ */ + if (next && next->vm_start == address + PAGE_SIZE) + return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; + + return expand_upwards(vma, address + PAGE_SIZE); + } + return 0; +} + +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */ +static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + unsigned int flags) +{ + struct mem_cgroup *memcg; + struct page *page; + spinlock_t *ptl; + pte_t entry; + + pte_unmap(page_table); + + /* Check if we need to add a guard page to the stack */ + if (check_stack_guard_page(vma, address) < 0) + return VM_FAULT_SIGSEGV; + + /* Use the zero-page for reads */ + if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { + entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), + vma->vm_page_prot)); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + /* Somebody else populated the pte meanwhile: leave it alone */ + if (!pte_none(*page_table)) + goto unlock; + goto setpte; + } + + /* Allocate our own private page. */ + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_zeroed_user_highpage_movable(vma, address); + if (!page) + goto oom; + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write.
+ */ + __SetPageUptodate(page); + + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) + goto oom_free_page; + + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + /* Somebody else populated the pte meanwhile: drop our page and charge */ + if (!pte_none(*page_table)) + goto release; + + inc_mm_counter_fast(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); + /* Shared by the zero-page path above and the private-page path */ +setpte: + set_pte_at(mm, address, page_table, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, page_table); +unlock: + pte_unmap_unlock(page_table, ptl); + return 0; +release: + mem_cgroup_cancel_charge(page, memcg); + page_cache_release(page); + goto unlock; +oom_free_page: + page_cache_release(page); +oom: + return VM_FAULT_OOM; +} + +/* + * The mmap_sem must have been held on entry, and may have been + * released depending on flags and vma->vm_ops->fault() return value. + * See filemap_fault() and __lock_page_retry().
+ */ +static int __do_fault(struct vm_area_struct *vma, unsigned long address, + pgoff_t pgoff, unsigned int flags, + struct page *cow_page, struct page **page) +{ + struct vm_fault vmf; + int ret; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.page = NULL; + vmf.cow_page = cow_page; + + ret = vma->vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + if (!vmf.page) + goto out; + + if (unlikely(PageHWPoison(vmf.page))) { + if (ret & VM_FAULT_LOCKED) + unlock_page(vmf.page); + page_cache_release(vmf.page); + return VM_FAULT_HWPOISON; + } + + if (unlikely(!(ret & VM_FAULT_LOCKED))) + lock_page(vmf.page); + else + VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + + out: + *page = vmf.page; + return ret; +} + +/** + * do_set_pte - setup new PTE entry for given page and add reverse page mapping. + * + * @vma: virtual memory area + * @address: user virtual address + * @page: page to map + * @pte: pointer to target page table entry + * @write: true, if new entry is writable + * @anon: true, if it's anonymous page + * + * Caller must hold page table lock relevant for @pte. + * + * Target users are page handler itself and implementations of + * vm_ops->map_pages. 
+ */ +void do_set_pte(struct vm_area_struct *vma, unsigned long address, + struct page *page, pte_t *pte, bool write, bool anon) +{ + pte_t entry; + + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (write) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (anon) { + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address); + } else { + inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); + page_add_file_rmap(page); + } + set_pte_at(vma->vm_mm, address, pte, entry); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, pte); +} + +static unsigned long fault_around_bytes __read_mostly = + rounddown_pow_of_two(65536); + +#ifdef CONFIG_DEBUG_FS +static int fault_around_bytes_get(void *data, u64 *val) +{ + *val = fault_around_bytes; + return 0; +} + +/* + * fault_around_pages() and fault_around_mask() expects fault_around_bytes + * rounded down to nearest page order. It's what do_fault_around() expects to + * see. + */ +static int fault_around_bytes_set(void *data, u64 val) +{ + if (val / PAGE_SIZE > PTRS_PER_PTE) + return -EINVAL; + if (val > PAGE_SIZE) + fault_around_bytes = rounddown_pow_of_two(val); + else + fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, + fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); + +static int __init fault_around_debugfs(void) +{ + void *ret; + + ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, + &fault_around_bytes_fops); + if (!ret) + pr_warn("Failed to create fault_around_bytes in debugfs"); + return 0; +} +late_initcall(fault_around_debugfs); +#endif + +/* + * do_fault_around() tries to map few pages around the fault address. The hope + * is that the pages will be needed soon and this will lower the number of + * faults to handle. 
+ * + * It uses vm_ops->map_pages() to map the pages, which skips the page if it's + * not ready to be mapped: not up-to-date, locked, etc. + * + * This function is called with the page table lock taken. In the split ptlock + * case the page table lock only protects only those entries which belong to + * the page table corresponding to the fault address. + * + * This function doesn't cross the VMA boundaries, in order to call map_pages() + * only once. + * + * fault_around_pages() defines how many pages we'll try to map. + * do_fault_around() expects it to return a power of two less than or equal to + * PTRS_PER_PTE. + * + * The virtual address of the area that we map is naturally aligned to the + * fault_around_pages() value (and therefore to page order). This way it's + * easier to guarantee that we don't cross page table boundaries. + */ +static void do_fault_around(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pgoff_t pgoff, unsigned int flags) +{ + unsigned long start_addr, nr_pages, mask; + pgoff_t max_pgoff; + struct vm_fault vmf; + int off; + + nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; /* window size in pages, sampled once */ + mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; /* rounds an address down to the window size */ + + start_addr = max(address & mask, vma->vm_start); /* window start, clamped to the VMA start */ + off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); /* pages between window start and the fault */ + pte -= off; + pgoff -= off; /* pte and pgoff now describe the window start */ + + /* + * max_pgoff is either end of page table or end of vma + * or fault_around_pages() from pgoff, depending what is nearest. 
+ */ + max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + PTRS_PER_PTE - 1; + max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, + pgoff + nr_pages - 1); /* inclusive upper bound later passed to ->map_pages() */ + + /* Check if it makes any sense to call ->map_pages */ + while (!pte_none(*pte)) { + if (++pgoff > max_pgoff) + return; + start_addr += PAGE_SIZE; + if (start_addr >= vma->vm_end) + return; + pte++; + } + /* First empty pte found: describe the rest of the window and let ->map_pages() fill it. */ + vmf.virtual_address = (void __user *) start_addr; + vmf.pte = pte; + vmf.pgoff = pgoff; + vmf.max_pgoff = max_pgoff; + vmf.flags = flags; + vma->vm_ops->map_pages(vma, &vmf); +} + /* + * Read-fault path of do_fault(): optionally map pages around the address + * through ->map_pages(), then fall back to ->fault() and install the + * returned page with do_set_pte(write=false, anon=false). + */ +static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page; + spinlock_t *ptl; + pte_t *pte; + int ret = 0; + + /* + * Let's call ->map_pages() first and use ->fault() as fallback + * if page by the offset is not ready to be mapped (cold cache or + * something). + */ + if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + do_fault_around(vma, address, pte, pgoff, flags); + if (!pte_same(*pte, orig_pte)) + goto unlock_out; /* the faulting pte is no longer the one we saw: return 0 */ + pte_unmap_unlock(pte, ptl); + } + + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { /* pte changed while unlocked: discard the page */ + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; + } + do_set_pte(vma, address, fault_page, pte, false, false); + unlock_page(fault_page); +unlock_out: + pte_unmap_unlock(pte, ptl); + return ret; +} + /* + * Copy-on-write path of do_fault(): allocate a new page, obtain the source + * page through ->fault(), copy it and install the copy with + * do_set_pte(write=true, anon=true). + */ +static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page, *new_page; + struct mem_cgroup *memcg; + 
spinlock_t *ptl; + pte_t *pte; + int ret; + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + /* Allocate and memcg-charge the private copy before calling into ->fault(). */ + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + return VM_FAULT_OOM; + + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { + page_cache_release(new_page); + return VM_FAULT_OOM; + } + + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + goto uncharge_out; + /* new_page was offered to the handler as cow_page; copy here only if a source page came back. */ + if (fault_page) + copy_user_highpage(new_page, fault_page, address, vma); + __SetPageUptodate(new_page); + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { /* pte changed while unlocked: release everything and bail out */ + pte_unmap_unlock(pte, ptl); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to lock, so it holds + * i_mmap_lock for read to protect against truncate. + */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } + goto uncharge_out; + } + do_set_pte(vma, address, new_page, pte, true, true); /* map the copy: write=true, anon=true */ + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); + pte_unmap_unlock(pte, ptl); + if (fault_page) { + unlock_page(fault_page); + page_cache_release(fault_page); + } else { + /* + * The fault handler has no page to lock, so it holds + * i_mmap_lock for read to protect against truncate. 
+ */ + i_mmap_unlock_read(vma->vm_file->f_mapping); + } + return ret; +uncharge_out: /* failure path: cancel the memcg charge and free the unused copy */ + mem_cgroup_cancel_charge(new_page, memcg); + page_cache_release(new_page); + return ret; +} + /* + * Shared-write path of do_fault(): get the page from ->fault(), give + * ->page_mkwrite() (if any) a chance to object, install the page with + * do_set_pte(write=true, anon=false) and mark it dirty. + */ +static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page; + struct address_space *mapping; + spinlock_t *ptl; + pte_t *pte; + int dirtied = 0; + int ret, tmp; + + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + /* + * Check if the backing address space wants to know that the page is + * about to become writable + */ + if (vma->vm_ops->page_mkwrite) { + unlock_page(fault_page); /* do_page_mkwrite() is called with the page unlocked */ + tmp = do_page_mkwrite(vma, fault_page, address); + if (unlikely(!tmp || + (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(fault_page); + return tmp; /* zero, error or NOPAGE: the page is dropped and tmp is the result */ + } + } + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { /* pte changed while unlocked: discard the page */ + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; + } + do_set_pte(vma, address, fault_page, pte, true, false); + pte_unmap_unlock(pte, ptl); + + if (set_page_dirty(fault_page)) + dirtied = 1; + /* + * Take a local copy of the address_space - page.mapping may be zeroed + * by truncate after unlock_page(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * release semantics to prevent the compiler from undoing this copying. 
+ */ + mapping = fault_page->mapping; + unlock_page(fault_page); + if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping but still + * dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + if (!vma->vm_ops->page_mkwrite) + file_update_time(vma->vm_file); + + return ret; +} + +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults). + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + * + * Dispatches to the read, copy-on-write or shared-write handler after + * unmapping @page_table. + */ +static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + unsigned int flags, pte_t orig_pte) +{ + pgoff_t pgoff = (((address & PAGE_MASK) + - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; /* page offset of the faulting address, vm_pgoff included */ + + pte_unmap(page_table); + if (!(flags & FAULT_FLAG_WRITE)) /* read access */ + return do_read_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + if (!(vma->vm_flags & VM_SHARED)) /* write access, vma not VM_SHARED */ + return do_cow_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); +} + /* + * Take a reference on @page, count the NUMA hinting fault (flagging it + * TNF_FAULT_LOCAL when @page_nid is the current node) and return the + * result of mpol_misplaced(). + */ +static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, + unsigned long addr, int page_nid, + int *flags) +{ + get_page(page); + + count_vm_numa_event(NUMA_HINT_FAULTS); + if (page_nid == numa_node_id()) { + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + *flags |= TNF_FAULT_LOCAL; + } + + return mpol_misplaced(page, vma, addr); +} + /* + * Called by handle_pte_fault() for a present pte with pte_protnone() set: + * restore the pte, then account the fault and possibly migrate the page. + * Always returns 0. + */ +static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) +{ + struct page *page = NULL; + spinlock_t *ptl; + int page_nid = -1; + int last_cpupid; + int target_nid; + bool migrated = false; + bool was_writable = pte_write(pte); + int flags = 0; + + /* A PROT_NONE fault should not end up here */ + BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); + + /* + * The "pte" at this 
point cannot be used safely without + * validation through pte_unmap_same(). It's of NUMA type but + * the pfn may be screwed if the read is non atomic. + * + * We can safely just do a "set_pte_at()", because the old + * page table entry is not accessible, so there would be no + * concurrent hardware modifications to the PTE. + */ + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (unlikely(!pte_same(*ptep, pte))) { + pte_unmap_unlock(ptep, ptl); + goto out; /* pte changed: page_nid is still -1, so no fault is accounted */ + } + + /* Make it present again */ + pte = pte_modify(pte, vma->vm_page_prot); + pte = pte_mkyoung(pte); + if (was_writable) + pte = pte_mkwrite(pte); + set_pte_at(mm, addr, ptep, pte); + update_mmu_cache(vma, addr, ptep); + + page = vm_normal_page(vma, addr, pte); + if (!page) { /* no normal page behind this pte: restoring the pte was all there is to do */ + pte_unmap_unlock(ptep, ptl); + return 0; + } + + /* + * Avoid grouping on RO pages in general. RO pages shouldn't hurt as + * much anyway since they can be in shared cache state. This misses + * the case where a mapping is writable but the process never writes + * to it but pte_write gets cleared during protection updates and + * pte_dirty has unpredictable behaviour between PTE scan updates, + * background writeback, dirty balancing and application behaviour. + */ + if (!(vma->vm_flags & VM_WRITE)) + flags |= TNF_NO_GROUP; + + /* + * Flag if the page is shared between multiple address spaces. 
This + * is later used when determining whether to group tasks together + */ + if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) + flags |= TNF_SHARED; + + last_cpupid = page_cpupid_last(page); + page_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); /* also takes a page reference */ + pte_unmap_unlock(ptep, ptl); + if (target_nid == -1) { + put_page(page); /* no migration target: drop the reference taken above */ + goto out; + } + + /* Migrate to the requested node */ + migrated = migrate_misplaced_page(page, vma, target_nid); + if (migrated) { + page_nid = target_nid; + flags |= TNF_MIGRATED; + } else + flags |= TNF_MIGRATE_FAIL; + +out: + if (page_nid != -1) /* only set once the page has been looked up */ + task_numa_fault(last_cpupid, page_nid, 1, flags); + return 0; +} + +/* + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with pte unmapped and unlocked. + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ +static int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, unsigned int flags) +{ + pte_t entry; + spinlock_t *ptl; + + /* + * some architectures can have larger ptes than wordsize, + * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y, + * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses. + * The code below just needs a consistent view for the ifs and + * we later double check anyway with the ptl lock held. So here + * a barrier will do. 
+ */ + entry = *pte; + barrier(); + if (!pte_present(entry)) { + if (pte_none(entry)) { + if (vma->vm_ops) { + if (likely(vma->vm_ops->fault)) /* empty pte and the vma has a ->fault handler */ + return do_fault(mm, vma, address, pte, + pmd, flags, entry); + } + return do_anonymous_page(mm, vma, address, + pte, pmd, flags); /* empty pte, no ->fault handler */ + } + return do_swap_page(mm, vma, address, + pte, pmd, flags, entry); /* not present but not empty */ + } + + if (pte_protnone(entry)) + return do_numa_page(mm, vma, address, entry, pte, pmd); + /* Present pte: recheck it under the page table lock before touching it. */ + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (unlikely(!pte_same(*pte, entry))) + goto unlock; /* pte changed since the unlocked read */ + if (flags & FAULT_FLAG_WRITE) { + if (!pte_write(entry)) /* write to a non-writable pte; ptl is passed on still held */ + return do_wp_page(mm, vma, address, + pte, pmd, ptl, entry); + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { + update_mmu_cache(vma, address, pte); + } else { + /* + * This is needed only for protection faults but the arch code + * is not yet telling us if this is a protection fault or not. + * This still avoids useless tlb flushes for .text page faults + * with threads. + */ + if (flags & FAULT_FLAG_WRITE) + flush_tlb_fix_spurious_fault(vma, address); + } +unlock: + pte_unmap_unlock(pte, ptl); + return 0; +} + +/* + * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). 
+ */ +static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (unlikely(is_vm_hugetlb_page(vma))) /* hugetlb vmas are handled entirely by hugetlb_fault() */ + return hugetlb_fault(mm, vma, address, flags); + /* Walk down to the pmd, allocating the pud and pmd levels as needed. */ + pgd = pgd_offset(mm, address); + pud = pud_alloc(mm, pgd, address); + if (!pud) + return VM_FAULT_OOM; + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + return VM_FAULT_OOM; + if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + int ret = VM_FAULT_FALLBACK; + if (!vma->vm_ops) /* only vmas without vm_ops are tried as huge here */ + ret = do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + pmd_t orig_pmd = *pmd; + int ret; + + barrier(); + if (pmd_trans_huge(orig_pmd)) { + unsigned int dirty = flags & FAULT_FLAG_WRITE; + + /* + * If the pmd is splitting, return and retry the + * fault. Alternative: wait until the split + * is done, and goto retry. + */ + if (pmd_trans_splitting(orig_pmd)) + return 0; + + if (pmd_protnone(orig_pmd)) + return do_huge_pmd_numa_page(mm, vma, address, + orig_pmd, pmd); + + if (dirty && !pmd_write(orig_pmd)) { + ret = do_huge_pmd_wp_page(mm, vma, address, pmd, + orig_pmd); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + huge_pmd_set_accessed(mm, vma, address, pmd, + orig_pmd, dirty); + return 0; + } + } + } + + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if a huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(pmd_none(*pmd)) && + unlikely(__pte_alloc(mm, vma, pmd, address))) + return VM_FAULT_OOM; + /* if a huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. 
So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); + + return handle_pte_fault(mm, vma, address, pte, pmd, flags); +} + +/* + * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + * + * Returns the VM_FAULT_* result of __handle_mm_fault(). + */ +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + int ret; + + __set_current_state(TASK_RUNNING); + /* Bump the PGFAULT event counters. */ + count_vm_event(PGFAULT); + mem_cgroup_count_vm_event(mm, PGFAULT); + + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + + /* + * Enable the memcg OOM handling for faults triggered in user + * space. Kernel faults are handled more gracefully. + */ + if (flags & FAULT_FLAG_USER) + mem_cgroup_oom_enable(); + + ret = __handle_mm_fault(mm, vma, address, flags); + + if (flags & FAULT_FLAG_USER) { + mem_cgroup_oom_disable(); /* pairs with the enable above */ + /* + * The task may have entered a memcg OOM situation but + * if the allocation error was handled gracefully (no + * VM_FAULT_OOM), there is no need to kill anything. + * Just clean up the OOM state peacefully. + */ + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) + mem_cgroup_oom_synchronize(false); + } + + return ret; +} +EXPORT_SYMBOL_GPL(handle_mm_fault); + +#ifndef __PAGETABLE_PUD_FOLDED +/* + * Allocate page upper directory. + * We've already handled the fast-path in-line. 
+ */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pud_t *new = pud_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + smp_wmb(); /* See comment in __pte_alloc */ + /* Install the new table under page_table_lock, or free it if the pgd got populated meanwhile. */ + spin_lock(&mm->page_table_lock); + if (pgd_present(*pgd)) /* Another has populated it */ + pud_free(mm, new); + else + pgd_populate(mm, pgd, new); + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* __PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_PMD_FOLDED +/* + * Allocate page middle directory. + * We've already handled the fast-path in-line. + * + * Returns 0 on success (also when another thread populated the entry + * first) and -ENOMEM if the table could not be allocated. + */ +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + pmd_t *new = pmd_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&mm->page_table_lock); +#ifndef __ARCH_HAS_4LEVEL_HACK + if (!pud_present(*pud)) { + mm_inc_nr_pmds(mm); + pud_populate(mm, pud, new); + } else /* Another has populated it */ + pmd_free(mm, new); +#else + if (!pgd_present(*pud)) { + mm_inc_nr_pmds(mm); + pgd_populate(mm, pud, new); + } else /* Another has populated it */ + pmd_free(mm, new); +#endif /* __ARCH_HAS_4LEVEL_HACK */ + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* __PAGETABLE_PMD_FOLDED */ + /* + * Walk the page tables of @mm down to the pte for @address. On success + * returns 0 with the pte stored in *ptepp, mapped and locked through + * *ptlp; any missing, bad or non-present level yields -EINVAL. + */ +static int __follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out; + + pmd = pmd_offset(pud, address); + VM_BUG_ON(pmd_trans_huge(*pmd)); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out; + + /* We cannot handle huge page PFN maps. Luckily they don't exist. 
+ */ + if (pmd_huge(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + if (!ptep) + goto out; + if (!pte_present(*ptep)) + goto unlock; + *ptepp = ptep; + return 0; /* success: the pte stays mapped and locked for the caller */ +unlock: + pte_unmap_unlock(ptep, *ptlp); +out: + return -EINVAL; +} + /* Wrapper around __follow_pte() that routes the call through __cond_lock() on *ptlp. */ +static inline int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + int res; + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, + !(res = __follow_pte(mm, address, ptepp, ptlp))); + return res; +} + +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + int ret = -EINVAL; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return ret; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + if (ret) + return ret; + *pfn = pte_pfn(*ptep); + pte_unmap_unlock(ptep, ptl); /* release what follow_pte() left mapped and locked */ + return 0; +} +EXPORT_SYMBOL(follow_pfn); + +#ifdef CONFIG_HAVE_IOREMAP_PROT +int follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long *prot, resource_size_t *phys) +{ + int ret = -EINVAL; + pte_t *ptep, pte; + spinlock_t *ptl; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; + + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + goto out; + pte = *ptep; + + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; /* write requested on a non-writable pte: -EINVAL */ + /* Report the protection bits and the physical address of the mapped page. */ + *prot = pgprot_val(pte_pgprot(pte)); + *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + + ret = 0; +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return ret; +} + /* + * Copy @len bytes between @buf and the physical memory behind @addr in + * @vma by temporarily ioremapping it; @write selects the direction. + */ +int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + resource_size_t phys_addr; + unsigned long prot = 0; + void __iomem 
*maddr; + int offset = addr & (PAGE_SIZE-1); + + if (follow_phys(vma, addr, write, &prot, &phys_addr)) + return -EINVAL; + + maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); + /* The mapping can fail; never hand a NULL cookie to the memcpy_*io() helpers. */ + if (!maddr) + return -ENOMEM; + + if (write) + memcpy_toio(maddr + offset, buf, len); + else + memcpy_fromio(buf, maddr + offset, len); + iounmap(maddr); + + return len; +} +EXPORT_SYMBOL_GPL(generic_access_phys); +#endif + +/* + * Access another process' address space as given in mm. If non-NULL, use the + * given task for page fault accounting. + * + * Returns the number of bytes transferred, which may be less than @len. + */ +static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, int write) +{ + struct vm_area_struct *vma; + void *old_buf = buf; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, ret, offset; + void *maddr; + struct page *page = NULL; + + /* Pin one page at a time. */ + ret = get_user_pages(tsk, mm, addr, 1, + write, 1, &page, &vma); + if (ret <= 0) { +#ifndef CONFIG_HAVE_IOREMAP_PROT + break; +#else + /* + * Check if this is a VM_IO | VM_PFNMAP VMA, which + * we can access using slightly different code. 
+ */ + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) + break; + if (vma->vm_ops && vma->vm_ops->access) /* let the vma access method do the transfer */ + ret = vma->vm_ops->access(vma, addr, buf, + len, write); + if (ret <= 0) + break; + bytes = ret; +#endif + } else { + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) /* never run past the end of the current page */ + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); + } + len -= bytes; + buf += bytes; + addr += bytes; + } + up_read(&mm->mmap_sem); + + return buf - old_buf; /* number of bytes actually transferred */ +} + +/** + * access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer + * @len: number of bytes to transfer + * @write: whether the access is a write + * + * The caller must hold a reference on @mm. + */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, int write) +{ + return __access_remote_vm(NULL, mm, addr, buf, len, write); +} + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, int write) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; /* no mm to access: nothing transferred */ + + ret = __access_remote_vm(tsk, mm, addr, buf, len, write); + mmput(mm); /* pairs with get_task_mm() above */ + + return ret; +} + +/* + * Print the name of a VMA. 
+ */ +void print_vma_addr(char *prefix, unsigned long ip) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + /* + * Do not print if we are in atomic + * contexts (in exception stacks, etc.): + */ + if (preempt_count()) + return; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma && vma->vm_file) { /* only vmas with a backing file are printed */ + struct file *f = vma->vm_file; + char *buf = (char *)__get_free_page(GFP_KERNEL); /* scratch page for d_path() */ + if (buf) { + char *p; + + p = d_path(&f->f_path, buf, PAGE_SIZE); + if (IS_ERR(p)) + p = "?"; + printk("%s%s[%lx+%lx]", prefix, kbasename(p), + vma->vm_start, + vma->vm_end - vma->vm_start); /* basename[start+size] */ + free_page((unsigned long)buf); + } + } + up_read(&mm->mmap_sem); +} + +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) +void __might_fault(const char *file, int line) +{ + /* + * Some code (nfs/sunrpc) uses socket ops on kernel memory while + * holding the mmap_sem, this is safe because kernel memory doesn't + * get paged out, therefore we'll never actually fault, and the + * below annotations will generate false positives. 
+ */ + if (segment_eq(get_fs(), KERNEL_DS)) + return; + if (pagefault_disabled()) + return; + __might_sleep(file, line, 0); +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) + if (current->mm) + might_lock_read(&current->mm->mmap_sem); +#endif +} +EXPORT_SYMBOL(__might_fault); +#endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +/* + * Clear a huge page whose sub-pages are reached with mem_map_next() + * instead of plain pointer arithmetic. + */ +static void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; + i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} + +/* + * Clear all @pages_per_huge_page sub-pages of @page for user address @addr, + * with a cond_resched() before each one. Pages larger than + * MAX_ORDER_NR_PAGES are delegated to clear_gigantic_page(). + */ +void clear_huge_page(struct page *page, + unsigned long addr, unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } +} + +/* + * Copy a huge page sub-page by sub-page, stepping both @dst and @src with + * mem_map_next() instead of plain pointer arithmetic. + */ +static void copy_user_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, + struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page; ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +/* + * Copy all @pages_per_huge_page sub-pages from @src to @dst for user + * address @addr, with a cond_resched() before each one. Pages larger than + * MAX_ORDER_NR_PAGES are delegated to copy_user_gigantic_page(). + */ +void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + copy_user_gigantic_page(dst, src, addr, vma, + pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); + } +} +#endif /* 
CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS + +static struct kmem_cache *page_ptl_cachep; + /* Create the slab cache that ptlock_alloc() takes page->ptl spinlocks from. */ +void __init ptlock_cache_init(void) +{ + page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, + SLAB_PANIC, NULL); +} + /* Allocate a spinlock for @page and store it in page->ptl; returns false if the allocation fails. */ +bool ptlock_alloc(struct page *page) +{ + spinlock_t *ptl; + + ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); + if (!ptl) + return false; + page->ptl = ptl; + return true; +} + /* Give the spinlock stored in page->ptl back to the cache. */ +void ptlock_free(struct page *page) +{ + kmem_cache_free(page_ptl_cachep, page->ptl); +} +#endif diff --git a/kernel/mm/memory_hotplug.c b/kernel/mm/memory_hotplug.c new file mode 100644 index 000000000..9e88f749a --- /dev/null +++ b/kernel/mm/memory_hotplug.c @@ -0,0 +1,2015 @@ +/* + * linux/mm/memory_hotplug.c + * + * Copyright (C) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +/* + * online_page_callback contains pointer to current page onlining function. + * Initially it is generic_online_page(). If it is required it could be + * changed by calling set_online_page_callback() for callback registration + * and restore_online_page_callback() for generic callback restore. + */ + +static void generic_online_page(struct page *page); + +static online_page_callback_t online_page_callback = generic_online_page; +static DEFINE_MUTEX(online_page_callback_lock); + +/* The same as the cpu_hotplug lock, but for memory hotplug. */ +static struct { + struct task_struct *active_writer; + struct mutex lock; /* Synchronizes accesses to refcount, */ + /* + * Also blocks the new readers during + * an ongoing mem hotplug operation. 
+ */ + int refcount; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} mem_hotplug = { + .active_writer = NULL, + .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), + .refcount = 0, +#ifdef CONFIG_DEBUG_LOCK_ALLOC + .dep_map = {.name = "mem_hotplug.lock" }, +#endif +}; + +/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ +#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) +#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) +#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) + /* + * Reader side: take a reference that mem_hotplug_begin() waits on. + * Does nothing when called by the active writer itself. + */ +void get_online_mems(void) +{ + might_sleep(); + if (mem_hotplug.active_writer == current) + return; + memhp_lock_acquire_read(); + mutex_lock(&mem_hotplug.lock); + mem_hotplug.refcount++; + mutex_unlock(&mem_hotplug.lock); + +} + /* Drop a reader reference; the last one wakes a writer recorded in active_writer. */ +void put_online_mems(void) +{ + if (mem_hotplug.active_writer == current) + return; + mutex_lock(&mem_hotplug.lock); + + if (WARN_ON(!mem_hotplug.refcount)) + mem_hotplug.refcount++; /* try to fix things up */ + + if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) + wake_up_process(mem_hotplug.active_writer); + mutex_unlock(&mem_hotplug.lock); + memhp_lock_release(); + +} + /* + * Writer side: record the caller as active writer, then loop until the + * reader refcount is zero. Returns with mem_hotplug.lock held. + */ +void mem_hotplug_begin(void) +{ + mem_hotplug.active_writer = current; + + memhp_lock_acquire(); + for (;;) { + mutex_lock(&mem_hotplug.lock); + if (likely(!mem_hotplug.refcount)) + break; /* no readers left: keep the mutex and proceed */ + __set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&mem_hotplug.lock); + schedule(); + } +} + /* End the writer section: clear active_writer and release mem_hotplug.lock. */ +void mem_hotplug_done(void) +{ + mem_hotplug.active_writer = NULL; + mutex_unlock(&mem_hotplug.lock); + memhp_lock_release(); +} + +/* add this memory to iomem resource */ +static struct resource *register_memory_resource(u64 start, u64 size) +{ + struct resource *res; + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + BUG_ON(!res); + + res->name = "System RAM"; + res->start = start; + res->end = start + size - 1; /* end is inclusive */ + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + if 
(request_resource(&iomem_resource, res) < 0) { + pr_debug("System RAM resource %pR cannot be added\n", res); + kfree(res); + res = NULL; + } + return res; +} + +static void release_memory_resource(struct resource *res) +{ + if (!res) + return; + release_resource(res); + kfree(res); + return; +} + +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) +{ + page->lru.next = (struct list_head *) type; + SetPagePrivate(page); + set_page_private(page, info); + atomic_inc(&page->_count); +} + +void put_page_bootmem(struct page *page) +{ + unsigned long type; + + type = (unsigned long) page->lru.next; + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); + + if (atomic_dec_return(&page->_count) == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + INIT_LIST_HEAD(&page->lru); + free_reserved_page(page); + } +} + +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE +#ifndef CONFIG_SPARSEMEM_VMEMMAP +static void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long *usemap, mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + /* Get section's memmap address */ + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + /* + * Get page for the memmap's phys address + * XXX: need more consideration for sparse_vmemmap... 
+ */ + page = virt_to_page(memmap); + mapsize = sizeof(struct page) * PAGES_PER_SECTION; + mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; + + /* remember memmap's page */ + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, SECTION_INFO); + + usemap = __nr_to_section(section_nr)->pageblock_flags; + page = virt_to_page(usemap); + + mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); + +} +#else /* CONFIG_SPARSEMEM_VMEMMAP */ +static void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long *usemap, mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + + if (!pfn_valid(start_pfn)) + return; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); + + usemap = __nr_to_section(section_nr)->pageblock_flags; + page = virt_to_page(usemap); + + mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + +void register_page_bootmem_info_node(struct pglist_data *pgdat) +{ + unsigned long i, pfn, end_pfn, nr_pages; + int node = pgdat->node_id; + struct page *page; + struct zone *zone; + + nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; + page = virt_to_page(pgdat); + + for (i = 0; i < nr_pages; i++, page++) + get_page_bootmem(node, page, NODE_INFO); + + zone = &pgdat->node_zones[0]; + for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { + if (zone_is_initialized(zone)) { + nr_pages = zone->wait_table_hash_nr_entries + * sizeof(wait_queue_head_t); + nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; + page = virt_to_page(zone->wait_table); + + for (i = 0; i < nr_pages; i++, page++) + 
get_page_bootmem(node, page, NODE_INFO); + } + } + + pfn = pgdat->node_start_pfn; + end_pfn = pgdat_end_pfn(pgdat); + + /* register section info */ + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + /* + * Some platforms can assign the same pfn to multiple nodes - on + * node0 as well as nodeN. To avoid registering a pfn against + * multiple nodes we check that this pfn does not already + * reside in some other nodes. + */ + if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) + register_page_bootmem_info_section(pfn); + } +} +#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ + +static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long old_zone_end_pfn; + + zone_span_writelock(zone); + + old_zone_end_pfn = zone_end_pfn(zone); + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - + zone->zone_start_pfn; + + zone_span_writeunlock(zone); +} + +static void resize_zone(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + zone_span_writelock(zone); + + if (end_pfn - start_pfn) { + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + } else { + /* + * make it consist as free_area_init_core(), + * if spanned_pages = 0, then keep start_pfn = 0 + */ + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + } + + zone_span_writeunlock(zone); +} + +static void fix_zone_id(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + enum zone_type zid = zone_idx(zone); + int nid = zone->zone_pgdat->node_id; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) + set_page_links(pfn_to_page(pfn), zid, nid, pfn); +} + +/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or + * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ +static int __ref ensure_zone_is_initialized(struct zone *zone, + unsigned long 
start_pfn, unsigned long num_pages) +{ + if (!zone_is_initialized(zone)) + return init_currently_empty_zone(zone, start_pfn, num_pages, + MEMMAP_HOTPLUG); + return 0; +} + +static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, + unsigned long start_pfn, unsigned long end_pfn) +{ + int ret; + unsigned long flags; + unsigned long z1_start_pfn; + + ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); + if (ret) + return ret; + + pgdat_resize_lock(z1->zone_pgdat, &flags); + + /* can't move pfns which are higher than @z2 */ + if (end_pfn > zone_end_pfn(z2)) + goto out_fail; + /* the move out part must be at the left most of @z2 */ + if (start_pfn > z2->zone_start_pfn) + goto out_fail; + /* must included/overlap */ + if (end_pfn <= z2->zone_start_pfn) + goto out_fail; + + /* use start_pfn for z1's start_pfn if z1 is empty */ + if (!zone_is_empty(z1)) + z1_start_pfn = z1->zone_start_pfn; + else + z1_start_pfn = start_pfn; + + resize_zone(z1, z1_start_pfn, end_pfn); + resize_zone(z2, end_pfn, zone_end_pfn(z2)); + + pgdat_resize_unlock(z1->zone_pgdat, &flags); + + fix_zone_id(z1, start_pfn, end_pfn); + + return 0; +out_fail: + pgdat_resize_unlock(z1->zone_pgdat, &flags); + return -1; +} + +static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, + unsigned long start_pfn, unsigned long end_pfn) +{ + int ret; + unsigned long flags; + unsigned long z2_end_pfn; + + ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); + if (ret) + return ret; + + pgdat_resize_lock(z1->zone_pgdat, &flags); + + /* can't move pfns which are lower than @z1 */ + if (z1->zone_start_pfn > start_pfn) + goto out_fail; + /* the move out part mast at the right most of @z1 */ + if (zone_end_pfn(z1) > end_pfn) + goto out_fail; + /* must included/overlap */ + if (start_pfn >= zone_end_pfn(z1)) + goto out_fail; + + /* use end_pfn for z2's end_pfn if z2 is empty */ + if (!zone_is_empty(z2)) + z2_end_pfn = zone_end_pfn(z2); + else + 
z2_end_pfn = end_pfn; + + resize_zone(z1, z1->zone_start_pfn, start_pfn); + resize_zone(z2, start_pfn, z2_end_pfn); + + pgdat_resize_unlock(z1->zone_pgdat, &flags); + + fix_zone_id(z2, start_pfn, end_pfn); + + return 0; +out_fail: + pgdat_resize_unlock(z1->zone_pgdat, &flags); + return -1; +} + +static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); + + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - + pgdat->node_start_pfn; +} + +static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + int nid = pgdat->node_id; + int zone_type; + unsigned long flags; + int ret; + + zone_type = zone - pgdat->node_zones; + ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); + if (ret) + return ret; + + pgdat_resize_lock(zone->zone_pgdat, &flags); + grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); + grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, + phys_start_pfn + nr_pages); + pgdat_resize_unlock(zone->zone_pgdat, &flags); + memmap_init_zone(nr_pages, nid, zone_type, + phys_start_pfn, MEMMAP_HOTPLUG); + return 0; +} + +static int __meminit __add_section(int nid, struct zone *zone, + unsigned long phys_start_pfn) +{ + int ret; + + if (pfn_valid(phys_start_pfn)) + return -EEXIST; + + ret = sparse_add_one_section(zone, phys_start_pfn); + + if (ret < 0) + return ret; + + ret = __add_zone(zone, phys_start_pfn); + + if (ret < 0) + return ret; + + return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); +} + +/* + * Reasonably generic function for adding memory. 
It is + * expected that archs that support memory hotplug will + * call this function after deciding the zone to which to + * add the new pages. + */ +int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) +{ + unsigned long i; + int err = 0; + int start_sec, end_sec; + /* during initialize mem_map, align hot-added range to section */ + start_sec = pfn_to_section_nr(phys_start_pfn); + end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); + + for (i = start_sec; i <= end_sec; i++) { + err = __add_section(nid, zone, section_nr_to_pfn(i)); + + /* + * EEXIST is finally dealt with by ioresource collision + * check. see add_memory() => register_memory_resource() + * Warning will be printed if there is collision. + */ + if (err && (err != -EEXIST)) + break; + err = 0; + } + + return err; +} +EXPORT_SYMBOL_GPL(__add_pages); + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* find the smallest valid pfn in the range [start_pfn, end_pfn) */ +static int find_smallest_section_pfn(int nid, struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mem_section *ms; + + for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(start_pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (unlikely(pfn_to_nid(start_pfn) != nid)) + continue; + + if (zone && zone != page_zone(pfn_to_page(start_pfn))) + continue; + + return start_pfn; + } + + return 0; +} + +/* find the biggest valid pfn in the range [start_pfn, end_pfn). */ +static int find_biggest_section_pfn(int nid, struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mem_section *ms; + unsigned long pfn; + + /* pfn is the end pfn of a memory section. 
*/ + pfn = end_pfn - 1; + for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (unlikely(pfn_to_nid(pfn) != nid)) + continue; + + if (zone && zone != page_zone(pfn_to_page(pfn))) + continue; + + return pfn; + } + + return 0; +} + +static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ + unsigned long zone_end_pfn = z; + unsigned long pfn; + struct mem_section *ms; + int nid = zone_to_nid(zone); + + zone_span_writelock(zone); + if (zone_start_pfn == start_pfn) { + /* + * If the section is smallest section in the zone, it need + * shrink zone->zone_start_pfn and zone->zone_spanned_pages. + * In this case, we find second smallest valid mem_section + * for shrinking zone. + */ + pfn = find_smallest_section_pfn(nid, zone, end_pfn, + zone_end_pfn); + if (pfn) { + zone->zone_start_pfn = pfn; + zone->spanned_pages = zone_end_pfn - pfn; + } + } else if (zone_end_pfn == end_pfn) { + /* + * If the section is biggest section in the zone, it need + * shrink zone->spanned_pages. + * In this case, we find second biggest valid mem_section for + * shrinking zone. + */ + pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, + start_pfn); + if (pfn) + zone->spanned_pages = pfn - zone_start_pfn + 1; + } + + /* + * The section is not biggest or smallest mem_section in the zone, it + * only creates a hole in the zone. So in this case, we need not + * change the zone. But perhaps, the zone has only hole data. Thus + * it check the zone has only hole or not. 
+ */ + pfn = zone_start_pfn; + for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (page_zone(pfn_to_page(pfn)) != zone) + continue; + + /* If the section is current section, it continues the loop */ + if (start_pfn == pfn) + continue; + + /* If we find valid section, we have nothing to do */ + zone_span_writeunlock(zone); + return; + } + + /* The zone has no valid section */ + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + zone_span_writeunlock(zone); +} + +static void shrink_pgdat_span(struct pglist_data *pgdat, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pgdat_start_pfn = pgdat->node_start_pfn; + unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ + unsigned long pgdat_end_pfn = p; + unsigned long pfn; + struct mem_section *ms; + int nid = pgdat->node_id; + + if (pgdat_start_pfn == start_pfn) { + /* + * If the section is smallest section in the pgdat, it need + * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. + * In this case, we find second smallest valid mem_section + * for shrinking zone. + */ + pfn = find_smallest_section_pfn(nid, NULL, end_pfn, + pgdat_end_pfn); + if (pfn) { + pgdat->node_start_pfn = pfn; + pgdat->node_spanned_pages = pgdat_end_pfn - pfn; + } + } else if (pgdat_end_pfn == end_pfn) { + /* + * If the section is biggest section in the pgdat, it need + * shrink pgdat->node_spanned_pages. + * In this case, we find second biggest valid mem_section for + * shrinking zone. + */ + pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, + start_pfn); + if (pfn) + pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; + } + + /* + * If the section is not biggest or smallest mem_section in the pgdat, + * it only creates a hole in the pgdat. So in this case, we need not + * change the pgdat. + * But perhaps, the pgdat has only hole data. Thus it check the pgdat + * has only hole or not. 
+ */ + pfn = pgdat_start_pfn; + for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { + ms = __pfn_to_section(pfn); + + if (unlikely(!valid_section(ms))) + continue; + + if (pfn_to_nid(pfn) != nid) + continue; + + /* If the section is current section, it continues the loop */ + if (start_pfn == pfn) + continue; + + /* If we find valid section, we have nothing to do */ + return; + } + + /* The pgdat has no valid section */ + pgdat->node_start_pfn = 0; + pgdat->node_spanned_pages = 0; +} + +static void __remove_zone(struct zone *zone, unsigned long start_pfn) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + int zone_type; + unsigned long flags; + + zone_type = zone - pgdat->node_zones; + + pgdat_resize_lock(zone->zone_pgdat, &flags); + shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); + shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); + pgdat_resize_unlock(zone->zone_pgdat, &flags); +} + +static int __remove_section(struct zone *zone, struct mem_section *ms) +{ + unsigned long start_pfn; + int scn_nr; + int ret = -EINVAL; + + if (!valid_section(ms)) + return ret; + + ret = unregister_memory_section(ms); + if (ret) + return ret; + + scn_nr = __section_nr(ms); + start_pfn = section_nr_to_pfn(scn_nr); + __remove_zone(zone, start_pfn); + + sparse_remove_one_section(zone, ms); + return 0; +} + +/** + * __remove_pages() - remove sections of pages from a zone + * @zone: zone from which pages need to be removed + * @phys_start_pfn: starting pageframe (must be aligned to start of a section) + * @nr_pages: number of pages to remove (must be multiple of section size) + * + * Generic helper function to remove section mappings and sysfs entries + * for the section of the memory we are removing. Caller needs to make + * sure that pages are marked reserved and zones are adjust properly by + * calling offline_pages(). 
+ */ +int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) +{ + unsigned long i; + int sections_to_remove; + resource_size_t start, size; + int ret = 0; + + /* + * We can only remove entire sections + */ + BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); + BUG_ON(nr_pages % PAGES_PER_SECTION); + + start = phys_start_pfn << PAGE_SHIFT; + size = nr_pages * PAGE_SIZE; + ret = release_mem_region_adjustable(&iomem_resource, start, size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } + + sections_to_remove = nr_pages / PAGES_PER_SECTION; + for (i = 0; i < sections_to_remove; i++) { + unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + ret = __remove_section(zone, __pfn_to_section(pfn)); + if (ret) + break; + } + return ret; +} +EXPORT_SYMBOL_GPL(__remove_pages); +#endif /* CONFIG_MEMORY_HOTREMOVE */ + +int set_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + get_online_mems(); + mutex_lock(&online_page_callback_lock); + + if (online_page_callback == generic_online_page) { + online_page_callback = callback; + rc = 0; + } + + mutex_unlock(&online_page_callback_lock); + put_online_mems(); + + return rc; +} +EXPORT_SYMBOL_GPL(set_online_page_callback); + +int restore_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + get_online_mems(); + mutex_lock(&online_page_callback_lock); + + if (online_page_callback == callback) { + online_page_callback = generic_online_page; + rc = 0; + } + + mutex_unlock(&online_page_callback_lock); + put_online_mems(); + + return rc; +} +EXPORT_SYMBOL_GPL(restore_online_page_callback); + +void __online_page_set_limits(struct page *page) +{ +} +EXPORT_SYMBOL_GPL(__online_page_set_limits); + +void __online_page_increment_counters(struct page *page) +{ + adjust_managed_page_count(page, 1); +} +EXPORT_SYMBOL_GPL(__online_page_increment_counters); + 
+void __online_page_free(struct page *page) +{ + __free_reserved_page(page); +} +EXPORT_SYMBOL_GPL(__online_page_free); + +static void generic_online_page(struct page *page) +{ + __online_page_set_limits(page); + __online_page_increment_counters(page); + __online_page_free(page); +} + +static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, + void *arg) +{ + unsigned long i; + unsigned long onlined_pages = *(unsigned long *)arg; + struct page *page; + if (PageReserved(pfn_to_page(start_pfn))) + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(start_pfn + i); + (*online_page_callback)(page); + onlined_pages++; + } + *(unsigned long *)arg = onlined_pages; + return 0; +} + +#ifdef CONFIG_MOVABLE_NODE +/* + * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have + * normal memory. + */ +static bool can_online_high_movable(struct zone *zone) +{ + return true; +} +#else /* CONFIG_MOVABLE_NODE */ +/* ensure every online node has NORMAL memory */ +static bool can_online_high_movable(struct zone *zone) +{ + return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); +} +#endif /* CONFIG_MOVABLE_NODE */ + +/* check which state of node_states will be changed when online memory */ +static void node_states_check_changes_online(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) +{ + int nid = zone_to_nid(zone); + enum zone_type zone_last = ZONE_NORMAL; + + /* + * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] + * contains nodes which have zones of 0...ZONE_NORMAL, + * set zone_last to ZONE_NORMAL. + * + * If we don't have HIGHMEM nor movable node, + * node_states[N_NORMAL_MEMORY] contains nodes which have zones of + * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. 
+ */ + if (N_MEMORY == N_NORMAL_MEMORY) + zone_last = ZONE_MOVABLE; + + /* + * if the memory to be online is in a zone of 0...zone_last, and + * the zones of 0...zone_last don't have memory before online, we will + * need to set the node to node_states[N_NORMAL_MEMORY] after + * the memory is online. + */ + if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) + arg->status_change_nid_normal = nid; + else + arg->status_change_nid_normal = -1; + +#ifdef CONFIG_HIGHMEM + /* + * If we have movable node, node_states[N_HIGH_MEMORY] + * contains nodes which have zones of 0...ZONE_HIGHMEM, + * set zone_last to ZONE_HIGHMEM. + * + * If we don't have movable node, node_states[N_NORMAL_MEMORY] + * contains nodes which have zones of 0...ZONE_MOVABLE, + * set zone_last to ZONE_MOVABLE. + */ + zone_last = ZONE_HIGHMEM; + if (N_MEMORY == N_HIGH_MEMORY) + zone_last = ZONE_MOVABLE; + + if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) + arg->status_change_nid_high = nid; + else + arg->status_change_nid_high = -1; +#else + arg->status_change_nid_high = arg->status_change_nid_normal; +#endif + + /* + * if the node don't have memory befor online, we will need to + * set the node to node_states[N_MEMORY] after the memory + * is online. 
+ */ + if (!node_state(nid, N_MEMORY)) + arg->status_change_nid = nid; + else + arg->status_change_nid = -1; +} + +static void node_states_set_node(int node, struct memory_notify *arg) +{ + if (arg->status_change_nid_normal >= 0) + node_set_state(node, N_NORMAL_MEMORY); + + if (arg->status_change_nid_high >= 0) + node_set_state(node, N_HIGH_MEMORY); + + node_set_state(node, N_MEMORY); +} + + +/* Must be protected by mem_hotplug_begin() */ +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) +{ + unsigned long flags; + unsigned long onlined_pages = 0; + struct zone *zone; + int need_zonelists_rebuild = 0; + int nid; + int ret; + struct memory_notify arg; + + /* + * This doesn't need a lock to do pfn_to_page(). + * The section can't be removed here because of the + * memory_block->state_mutex. + */ + zone = page_zone(pfn_to_page(pfn)); + + if ((zone_idx(zone) > ZONE_NORMAL || + online_type == MMOP_ONLINE_MOVABLE) && + !can_online_high_movable(zone)) + return -EINVAL; + + if (online_type == MMOP_ONLINE_KERNEL && + zone_idx(zone) == ZONE_MOVABLE) { + if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) + return -EINVAL; + } + if (online_type == MMOP_ONLINE_MOVABLE && + zone_idx(zone) == ZONE_MOVABLE - 1) { + if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) + return -EINVAL; + } + + /* Previous code may changed the zone of the pfn range */ + zone = page_zone(pfn_to_page(pfn)); + + arg.start_pfn = pfn; + arg.nr_pages = nr_pages; + node_states_check_changes_online(nr_pages, zone, &arg); + + nid = pfn_to_nid(pfn); + + ret = memory_notify(MEM_GOING_ONLINE, &arg); + ret = notifier_to_errno(ret); + if (ret) { + memory_notify(MEM_CANCEL_ONLINE, &arg); + return ret; + } + /* + * If this zone is not populated, then it is not in zonelist. + * This means the page allocator ignores this zone. + * So, zonelist must be updated after online. 
+ */ + mutex_lock(&zonelists_mutex); + if (!populated_zone(zone)) { + need_zonelists_rebuild = 1; + build_all_zonelists(NULL, zone); + } + + ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, + online_pages_range); + if (ret) { + if (need_zonelists_rebuild) + zone_pcp_reset(zone); + mutex_unlock(&zonelists_mutex); + printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", + (unsigned long long) pfn << PAGE_SHIFT, + (((unsigned long long) pfn + nr_pages) + << PAGE_SHIFT) - 1); + memory_notify(MEM_CANCEL_ONLINE, &arg); + return ret; + } + + zone->present_pages += onlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); + zone->zone_pgdat->node_present_pages += onlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); + + if (onlined_pages) { + node_states_set_node(zone_to_nid(zone), &arg); + if (need_zonelists_rebuild) + build_all_zonelists(NULL, NULL); + else + zone_pcp_update(zone); + } + + mutex_unlock(&zonelists_mutex); + + init_per_zone_wmark_min(); + + if (onlined_pages) + kswapd_run(zone_to_nid(zone)); + + vm_total_pages = nr_free_pagecache_pages(); + + writeback_set_ratelimit(); + + if (onlined_pages) + memory_notify(MEM_ONLINE, &arg); + return 0; +} +#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ + +static void reset_node_present_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + z->present_pages = 0; + + pgdat->node_present_pages = 0; +} + +/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) +{ + struct pglist_data *pgdat; + unsigned long zones_size[MAX_NR_ZONES] = {0}; + unsigned long zholes_size[MAX_NR_ZONES] = {0}; + unsigned long start_pfn = PFN_DOWN(start); + + pgdat = NODE_DATA(nid); + if (!pgdat) { + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) + return NULL; + + arch_refresh_nodedata(nid, pgdat); + } else { + /* Reset the nr_zones and classzone_idx to 0 before reuse 
*/ + pgdat->nr_zones = 0; + pgdat->classzone_idx = 0; + } + + /* we can use NODE_DATA(nid) from here */ + + /* init node's zones as empty zones, we don't have any present pages.*/ + free_area_init_node(nid, zones_size, start_pfn, zholes_size); + + /* + * The node we allocated has no zone fallback lists. For avoiding + * to access not-initialized zonelist, build here. + */ + mutex_lock(&zonelists_mutex); + build_all_zonelists(pgdat, NULL); + mutex_unlock(&zonelists_mutex); + + /* + * zone->managed_pages is set to an approximate value in + * free_area_init_core(), which will cause + * /sys/device/system/node/nodeX/meminfo has wrong data. + * So reset it to 0 before any memory is onlined. + */ + reset_node_managed_pages(pgdat); + + /* + * When memory is hot-added, all the memory is in offline state. So + * clear all zones' present_pages because they will be updated in + * online_pages() and offline_pages(). + */ + reset_node_present_pages(pgdat); + + return pgdat; +} + +static void rollback_node_hotadd(int nid, pg_data_t *pgdat) +{ + arch_refresh_nodedata(nid, NULL); + arch_free_nodedata(pgdat); + return; +} + + +/** + * try_online_node - online a node if offlined + * + * called by cpu_up() to online a node without onlined memory. 
+ */ +int try_online_node(int nid) +{ + pg_data_t *pgdat; + int ret; + + if (node_online(nid)) + return 0; + + mem_hotplug_begin(); + pgdat = hotadd_new_pgdat(nid, 0); + if (!pgdat) { + pr_err("Cannot online node %d due to NULL pgdat\n", nid); + ret = -ENOMEM; + goto out; + } + node_set_online(nid); + ret = register_one_node(nid); + BUG_ON(ret); + + if (pgdat->node_zonelists->_zonerefs->zone == NULL) { + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } + +out: + mem_hotplug_done(); + return ret; +} + +static int check_hotplug_memory_range(u64 start, u64 size) +{ + u64 start_pfn = PFN_DOWN(start); + u64 nr_pages = size >> PAGE_SHIFT; + + /* Memory range must be aligned with section */ + if ((start_pfn & ~PAGE_SECTION_MASK) || + (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { + pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", + (unsigned long long)start, + (unsigned long long)size); + return -EINVAL; + } + + return 0; +} + +/* + * If movable zone has already been setup, newly added memory should be check. + * If its address is higher than movable zone, it should be added as movable. + * Without this check, movable zone may overlap with other zone. 
+ */ +static int should_add_memory_movable(int nid, u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; + + if (zone_is_empty(movable_zone)) + return 0; + + if (movable_zone->zone_start_pfn <= start_pfn) + return 1; + + return 0; +} + +int zone_for_memory(int nid, u64 start, u64 size, int zone_default) +{ + if (should_add_memory_movable(nid, start, size)) + return ZONE_MOVABLE; + + return zone_default; +} + +/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +int __ref add_memory(int nid, u64 start, u64 size) +{ + pg_data_t *pgdat = NULL; + bool new_pgdat; + bool new_node; + struct resource *res; + int ret; + + ret = check_hotplug_memory_range(start, size); + if (ret) + return ret; + + res = register_memory_resource(start, size); + ret = -EEXIST; + if (!res) + return ret; + + { /* Stupid hack to suppress address-never-null warning */ + void *p = NODE_DATA(nid); + new_pgdat = !p; + } + + mem_hotplug_begin(); + + new_node = !node_online(nid); + if (new_node) { + pgdat = hotadd_new_pgdat(nid, start); + ret = -ENOMEM; + if (!pgdat) + goto error; + } + + /* call arch's memory hotadd */ + ret = arch_add_memory(nid, start, size); + + if (ret < 0) + goto error; + + /* we online node here. we can't roll back from here. */ + node_set_online(nid); + + if (new_node) { + ret = register_one_node(nid); + /* + * If sysfs file of new node can't create, cpu on the node + * can't be hot-added. There is no rollback way now. + * So, check by BUG_ON() to catch it reluctantly.. 
+ */ + BUG_ON(ret); + } + + /* create new memmap entry */ + firmware_map_add_hotplug(start, start + size, "System RAM"); + + goto out; + +error: + /* rollback pgdat allocation and others */ + if (new_pgdat) + rollback_node_hotadd(nid, pgdat); + release_memory_resource(res); + +out: + mem_hotplug_done(); + return ret; +} +EXPORT_SYMBOL_GPL(add_memory); + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* + * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy + * set and the size of the free page is given by page_order(). Using this, + * the function determines if the pageblock contains only free pages. + * Due to buddy contraints, a free page at least the size of a pageblock will + * be located at the start of the pageblock + */ +static inline int pageblock_free(struct page *page) +{ + return PageBuddy(page) && page_order(page) >= pageblock_order; +} + +/* Return the start of the next active pageblock after a given page */ +static struct page *next_active_pageblock(struct page *page) +{ + /* Ensure the starting page is pageblock-aligned */ + BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); + + /* If the entire pageblock is free, move to the end of free page */ + if (pageblock_free(page)) { + int order; + /* be careful. we don't have locks, page_order can be changed.*/ + order = page_order(page); + if ((order < MAX_ORDER) && (order >= pageblock_order)) + return page + (1 << order); + } + + return page + pageblock_nr_pages; +} + +/* Checks if this range of memory is likely to be hot-removable. 
*/ +int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) +{ + struct page *page = pfn_to_page(start_pfn); + struct page *end_page = page + nr_pages; + + /* Check the starting page of each pageblock within the range */ + for (; page < end_page; page = next_active_pageblock(page)) { + if (!is_pageblock_removable_nolock(page)) + return 0; + cond_resched(); + } + + /* All pageblocks in the memory block are likely to be hot-removable */ + return 1; +} + +/* + * Confirm all pages in a range [start, end) is belongs to the same zone. + */ +int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + struct zone *zone = NULL; + struct page *page; + int i; + for (pfn = start_pfn; + pfn < end_pfn; + pfn += MAX_ORDER_NR_PAGES) { + i = 0; + /* This is just a CONFIG_HOLES_IN_ZONE check.*/ + while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) + i++; + if (i == MAX_ORDER_NR_PAGES) + continue; + page = pfn_to_page(pfn + i); + if (zone && page_zone(page) != zone) + return 0; + zone = page_zone(page); + } + return 1; +} + +/* + * Scan pfn range [start,end) to find movable/migratable pages (LRU pages + * and hugepages). We scan pfn because it's much easier than scanning over + * linked list. This function returns the pfn of the first found movable + * page if it's found, otherwise 0. 
+ */ +static unsigned long scan_movable_pages(unsigned long start, unsigned long end) +{ + unsigned long pfn; + struct page *page; + for (pfn = start; pfn < end; pfn++) { + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (PageLRU(page)) + return pfn; + if (PageHuge(page)) { + if (page_huge_active(page)) + return pfn; + else + pfn = round_up(pfn + 1, + 1 << compound_order(page)) - 1; + } + } + } + return 0; +} + +#define NR_OFFLINE_AT_ONCE_PAGES (256) +static int +do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + struct page *page; + int move_pages = NR_OFFLINE_AT_ONCE_PAGES; + int not_managed = 0; + int ret = 0; + LIST_HEAD(source); + + for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + + if (PageHuge(page)) { + struct page *head = compound_head(page); + pfn = page_to_pfn(head) + (1< PFN_SECTION_SHIFT) { + ret = -EBUSY; + break; + } + if (isolate_huge_page(page, &source)) + move_pages -= 1 << compound_order(head); + continue; + } + + if (!get_page_unless_zero(page)) + continue; + /* + * We can skip free pages. And we can only deal with pages on + * LRU. + */ + ret = isolate_lru_page(page); + if (!ret) { /* Success */ + put_page(page); + list_add_tail(&page->lru, &source); + move_pages--; + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + + } else { +#ifdef CONFIG_DEBUG_VM + printk(KERN_ALERT "removing pfn %lx from LRU failed\n", + pfn); + dump_page(page, "failed to remove from LRU"); +#endif + put_page(page); + /* Because we don't have big zone->lock. we should + check this again here. */ + if (page_count(page)) { + not_managed++; + ret = -EBUSY; + break; + } + } + } + if (!list_empty(&source)) { + if (not_managed) { + putback_movable_pages(&source); + goto out; + } + + /* + * alloc_migrate_target should be improooooved!! + * migrate_pages returns # of failed pages. 
+ */ + ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, + MIGRATE_SYNC, MR_MEMORY_HOTPLUG); + if (ret) + putback_movable_pages(&source); + } +out: + return ret; +} + +/* + * remove from free_area[] and mark all as Reserved. + */ +static int +offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, + void *data) +{ + __offline_isolated_pages(start, start + nr_pages); + return 0; +} + +static void +offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, + offline_isolated_pages_cb); +} + +/* + * Check all pages in range, recoreded as memory resource, are isolated. + */ +static int +check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, + void *data) +{ + int ret; + long offlined = *(long *)data; + ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); + offlined = nr_pages; + if (!ret) + *(long *)data += offlined; + return ret; +} + +static long +check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) +{ + long offlined = 0; + int ret; + + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, + check_pages_isolated_cb); + if (ret < 0) + offlined = (long)ret; + return offlined; +} + +#ifdef CONFIG_MOVABLE_NODE +/* + * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have + * normal memory. 
+ */ +static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) +{ + return true; +} +#else /* CONFIG_MOVABLE_NODE */ +/* ensure the node has NORMAL memory if it is still online */ +static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long present_pages = 0; + enum zone_type zt; + + for (zt = 0; zt <= ZONE_NORMAL; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + + if (present_pages > nr_pages) + return true; + + present_pages = 0; + for (; zt <= ZONE_MOVABLE; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + + /* + * we can't offline the last normal memory until all + * higher memory is offlined. + */ + return present_pages == 0; +} +#endif /* CONFIG_MOVABLE_NODE */ + +static int __init cmdline_parse_movable_node(char *p) +{ +#ifdef CONFIG_MOVABLE_NODE + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. 
+ */ + memblock_set_bottom_up(true); + movable_node_enabled = true; +#else + pr_warn("movable_node option not supported\n"); +#endif + return 0; +} +early_param("movable_node", cmdline_parse_movable_node); + +/* check which state of node_states will be changed when offline memory */ +static void node_states_check_changes_offline(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long present_pages = 0; + enum zone_type zt, zone_last = ZONE_NORMAL; + + /* + * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] + * contains nodes which have zones of 0...ZONE_NORMAL, + * set zone_last to ZONE_NORMAL. + * + * If we don't have HIGHMEM nor movable node, + * node_states[N_NORMAL_MEMORY] contains nodes which have zones of + * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + */ + if (N_MEMORY == N_NORMAL_MEMORY) + zone_last = ZONE_MOVABLE; + + /* + * check whether node_states[N_NORMAL_MEMORY] will be changed. + * If the memory to be offline is in a zone of 0...zone_last, + * and it is the last present memory, 0...zone_last will + * become empty after offline , thus we can determind we will + * need to clear the node from node_states[N_NORMAL_MEMORY]. + */ + for (zt = 0; zt <= zone_last; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + arg->status_change_nid_normal = zone_to_nid(zone); + else + arg->status_change_nid_normal = -1; + +#ifdef CONFIG_HIGHMEM + /* + * If we have movable node, node_states[N_HIGH_MEMORY] + * contains nodes which have zones of 0...ZONE_HIGHMEM, + * set zone_last to ZONE_HIGHMEM. + * + * If we don't have movable node, node_states[N_NORMAL_MEMORY] + * contains nodes which have zones of 0...ZONE_MOVABLE, + * set zone_last to ZONE_MOVABLE. 
+ */ + zone_last = ZONE_HIGHMEM; + if (N_MEMORY == N_HIGH_MEMORY) + zone_last = ZONE_MOVABLE; + + for (; zt <= zone_last; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + arg->status_change_nid_high = zone_to_nid(zone); + else + arg->status_change_nid_high = -1; +#else + arg->status_change_nid_high = arg->status_change_nid_normal; +#endif + + /* + * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE + */ + zone_last = ZONE_MOVABLE; + + /* + * check whether node_states[N_HIGH_MEMORY] will be changed + * If we try to offline the last present @nr_pages from the node, + * we can determind we will need to clear the node from + * node_states[N_HIGH_MEMORY]. + */ + for (; zt <= zone_last; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (nr_pages >= present_pages) + arg->status_change_nid = zone_to_nid(zone); + else + arg->status_change_nid = -1; +} + +static void node_states_clear_node(int node, struct memory_notify *arg) +{ + if (arg->status_change_nid_normal >= 0) + node_clear_state(node, N_NORMAL_MEMORY); + + if ((N_MEMORY != N_NORMAL_MEMORY) && + (arg->status_change_nid_high >= 0)) + node_clear_state(node, N_HIGH_MEMORY); + + if ((N_MEMORY != N_HIGH_MEMORY) && + (arg->status_change_nid >= 0)) + node_clear_state(node, N_MEMORY); +} + +static int __ref __offline_pages(unsigned long start_pfn, + unsigned long end_pfn, unsigned long timeout) +{ + unsigned long pfn, nr_pages, expire; + long offlined_pages; + int ret, drain, retry_max, node; + unsigned long flags; + struct zone *zone; + struct memory_notify arg; + + /* at least, alignment against pageblock is necessary */ + if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) + return -EINVAL; + if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) + return -EINVAL; + /* This makes hotplug much easier...and readable. + we assume this for now. 
.*/ + if (!test_pages_in_a_zone(start_pfn, end_pfn)) + return -EINVAL; + + zone = page_zone(pfn_to_page(start_pfn)); + node = zone_to_nid(zone); + nr_pages = end_pfn - start_pfn; + + if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) + return -EINVAL; + + /* set above range as isolated */ + ret = start_isolate_page_range(start_pfn, end_pfn, + MIGRATE_MOVABLE, true); + if (ret) + return ret; + + arg.start_pfn = start_pfn; + arg.nr_pages = nr_pages; + node_states_check_changes_offline(nr_pages, zone, &arg); + + ret = memory_notify(MEM_GOING_OFFLINE, &arg); + ret = notifier_to_errno(ret); + if (ret) + goto failed_removal; + + pfn = start_pfn; + expire = jiffies + timeout; + drain = 0; + retry_max = 5; +repeat: + /* start memory hot removal */ + ret = -EAGAIN; + if (time_after(jiffies, expire)) + goto failed_removal; + ret = -EINTR; + if (signal_pending(current)) + goto failed_removal; + ret = 0; + if (drain) { + lru_add_drain_all(); + cond_resched(); + drain_all_pages(zone); + } + + pfn = scan_movable_pages(start_pfn, end_pfn); + if (pfn) { /* We have movable pages */ + ret = do_migrate_range(pfn, end_pfn); + if (!ret) { + drain = 1; + goto repeat; + } else { + if (ret < 0) + if (--retry_max == 0) + goto failed_removal; + yield(); + drain = 1; + goto repeat; + } + } + /* drain all zone's lru pagevec, this is asynchronous... */ + lru_add_drain_all(); + yield(); + /* drain pcp pages, this is synchronous. */ + drain_all_pages(zone); + /* + * dissolve free hugepages in the memory block before doing offlining + * actually in order to make hugetlbfs's object counting consistent. + */ + dissolve_free_huge_pages(start_pfn, end_pfn); + /* check again */ + offlined_pages = check_pages_isolated(start_pfn, end_pfn); + if (offlined_pages < 0) { + ret = -EBUSY; + goto failed_removal; + } + printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); + /* Ok, all of our target is isolated. + We cannot do rollback at this point. 
*/ + offline_isolated_pages(start_pfn, end_pfn); + /* reset pagetype flags and makes migrate type to be MOVABLE */ + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + /* removal success */ + adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); + zone->present_pages -= offlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); + zone->zone_pgdat->node_present_pages -= offlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); + + init_per_zone_wmark_min(); + + if (!populated_zone(zone)) { + zone_pcp_reset(zone); + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } else + zone_pcp_update(zone); + + node_states_clear_node(node, &arg); + if (arg.status_change_nid >= 0) + kswapd_stop(node); + + vm_total_pages = nr_free_pagecache_pages(); + writeback_set_ratelimit(); + + memory_notify(MEM_OFFLINE, &arg); + return 0; + +failed_removal: + printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); + memory_notify(MEM_CANCEL_OFFLINE, &arg); + /* pushback to free area */ + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + return ret; +} + +/* Must be protected by mem_hotplug_begin() */ +int offline_pages(unsigned long start_pfn, unsigned long nr_pages) +{ + return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + +/** + * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) + * @start_pfn: start pfn of the memory range + * @end_pfn: end pfn of the memory range + * @arg: argument passed to func + * @func: callback for each memory section walked + * + * This function walks through all present mem sections in range + * [start_pfn, end_pfn) and call func on each mem section. + * + * Returns the return value of func. 
+ */ +int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, + void *arg, int (*func)(struct memory_block *, void *)) +{ + struct memory_block *mem = NULL; + struct mem_section *section; + unsigned long pfn, section_nr; + int ret; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + section_nr = pfn_to_section_nr(pfn); + if (!present_section_nr(section_nr)) + continue; + + section = __nr_to_section(section_nr); + /* same memblock? */ + if (mem) + if ((section_nr >= mem->start_section_nr) && + (section_nr <= mem->end_section_nr)) + continue; + + mem = find_memory_block_hinted(section, mem); + if (!mem) + continue; + + ret = func(mem, arg); + if (ret) { + kobject_put(&mem->dev.kobj); + return ret; + } + } + + if (mem) + kobject_put(&mem->dev.kobj); + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) +{ + int ret = !is_memblock_offlined(mem); + + if (unlikely(ret)) { + phys_addr_t beginpa, endpa; + + beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); + endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; + pr_warn("removing memory fails, because memory " + "[%pa-%pa] is onlined\n", + &beginpa, &endpa); + } + + return ret; +} + +static int check_cpu_on_node(pg_data_t *pgdat) +{ + int cpu; + + for_each_present_cpu(cpu) { + if (cpu_to_node(cpu) == pgdat->node_id) + /* + * the cpu on this node isn't removed, and we can't + * offline this node. + */ + return -EBUSY; + } + + return 0; +} + +static void unmap_cpu_on_node(pg_data_t *pgdat) +{ +#ifdef CONFIG_ACPI_NUMA + int cpu; + + for_each_possible_cpu(cpu) + if (cpu_to_node(cpu) == pgdat->node_id) + numa_clear_node(cpu); +#endif +} + +static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) +{ + int ret; + + ret = check_cpu_on_node(pgdat); + if (ret) + return ret; + + /* + * the node will be offlined when we come here, so we can clear + * the cpu_to_node() now. 
+ */ + + unmap_cpu_on_node(pgdat); + return 0; +} + +/** + * try_offline_node + * + * Offline a node if all memory sections and cpus of the node are removed. + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call. + */ +void try_offline_node(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = pgdat->node_start_pfn; + unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; + unsigned long pfn; + int i; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + + if (!present_section_nr(section_nr)) + continue; + + if (pfn_to_nid(pfn) != nid) + continue; + + /* + * some memory sections of this node are not removed, and we + * can't offline node now. + */ + return; + } + + if (check_and_unmap_cpu_on_node(pgdat)) + return; + + /* + * all memory/cpu of this node are removed, we can offline this + * node now. + */ + node_set_offline(nid); + unregister_one_node(nid); + + /* free waittable in each zone */ + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + + /* + * wait_table may be allocated from boot memory, + * here only free if it's allocated by vmalloc. + */ + if (is_vmalloc_addr(zone->wait_table)) { + vfree(zone->wait_table); + zone->wait_table = NULL; + } + } +} +EXPORT_SYMBOL(try_offline_node); + +/** + * remove_memory + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call, as required by + * try_offline_node(). + */ +void __ref remove_memory(int nid, u64 start, u64 size) +{ + int ret; + + BUG_ON(check_hotplug_memory_range(start, size)); + + mem_hotplug_begin(); + + /* + * All memory blocks must be offlined before removing memory. Check + * whether all memory blocks in question are offline and trigger a BUG() + * if this is not the case. 
+ */ + ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, + check_memblock_offlined_cb); + if (ret) + BUG(); + + /* remove memmap entry */ + firmware_map_remove(start, start + size, "System RAM"); + + arch_remove_memory(start, size); + + try_offline_node(nid); + + mem_hotplug_done(); +} +EXPORT_SYMBOL_GPL(remove_memory); +#endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/kernel/mm/mempolicy.c b/kernel/mm/mempolicy.c new file mode 100644 index 000000000..99d4c1d0b --- /dev/null +++ b/kernel/mm/mempolicy.c @@ -0,0 +1,2831 @@ +/* + * Simple NUMA memory policy for the Linux kernel. + * + * Copyright 2003,2004 Andi Kleen, SuSE Labs. + * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. + * Subject to the GNU Public License, version 2. + * + * NUMA policy allows the user to give hints in which node(s) memory should + * be allocated. + * + * Support four policies per VMA and per process: + * + * The VMA policy has priority over the process policy for a page fault. + * + * interleave Allocate memory interleaved over a set of nodes, + * with normal fallback if it fails. + * For VMA based allocations this interleaves based on the + * offset into the backing object or offset into the mapping + * for anonymous memory. For process policy an process counter + * is used. + * + * bind Only allocate memory on a specific set of nodes, + * no fallback. + * FIXME: memory is allocated starting with the first node + * to the last. It would be better if bind would truly restrict + * the allocation to memory nodes instead + * + * preferred Try a specific node first before normal fallback. + * As a special case NUMA_NO_NODE here means do the allocation + * on the local CPU. This is normally identical to default, + * but useful to set in a VMA when you have a non default + * process policy. + * + * default Allocate on the local node first, or when on a VMA + * use the process policy. 
This is what Linux always did + * in a NUMA aware kernel and still does by, ahem, default. + * + * The process policy is applied for most non interrupt memory allocations + * in that process' context. Interrupts ignore the policies and always + * try to allocate on the local CPU. The VMA policy is only applied for memory + * allocations for a VMA in the VM. + * + * Currently there are a few corner cases in swapping where the policy + * is not applied, but the majority should be handled. When process policy + * is used it is not remembered over swap outs/swap ins. + * + * Only the highest zone in the zone hierarchy gets policied. Allocations + * requesting a lower zone just use default policy. This implies that + * on systems with highmem kernel lowmem allocation don't get policied. + * Same with GFP_DMA allocations. + * + * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between + * all users and remembered even when nobody has memory mapped. + */ + +/* Notebook: + fix mmap readahead to honour policy and enable policy for any page cache + object + statistics for bigpages + global policy for page cache? currently it uses process policy. Requires + first item above. + handle mremap for shared memory (currently ignored for the policy) + grows down? + make bind policy root only? It can trigger oom much faster and the + kernel is not always grateful with that. 
+*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" + +/* Internal flags */ +#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ + +static struct kmem_cache *policy_cache; +static struct kmem_cache *sn_cache; + +/* Highest zone. An specific allocation for a zone below that is not + policied. */ +enum zone_type policy_zone = 0; + +/* + * run-time system-wide default policy => local allocation + */ +static struct mempolicy default_policy = { + .refcnt = ATOMIC_INIT(1), /* never free it */ + .mode = MPOL_PREFERRED, + .flags = MPOL_F_LOCAL, +}; + +static struct mempolicy preferred_node_policy[MAX_NUMNODES]; + +struct mempolicy *get_task_policy(struct task_struct *p) +{ + struct mempolicy *pol = p->mempolicy; + int node; + + if (pol) + return pol; + + node = numa_node_id(); + if (node != NUMA_NO_NODE) { + pol = &preferred_node_policy[node]; + /* preferred_node_policy is not initialised early in boot */ + if (pol->mode) + return pol; + } + + return &default_policy; +} + +static const struct mempolicy_operations { + int (*create)(struct mempolicy *pol, const nodemask_t *nodes); + /* + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. 
+ * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step); +} mpol_ops[MPOL_MAX]; + +static inline int mpol_store_user_nodemask(const struct mempolicy *pol) +{ + return pol->flags & MPOL_MODE_FLAGS; +} + +static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, + const nodemask_t *rel) +{ + nodemask_t tmp; + nodes_fold(tmp, *orig, nodes_weight(*rel)); + nodes_onto(*ret, tmp, *rel); +} + +static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + pol->v.nodes = *nodes; + return 0; +} + +static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (!nodes) + pol->flags |= MPOL_F_LOCAL; /* local allocation */ + else if (nodes_empty(*nodes)) + return -EINVAL; /* no allowed nodes */ + else + pol->v.preferred_node = first_node(*nodes); + return 0; +} + +static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + pol->v.nodes = *nodes; + return 0; +} + +/* + * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if + * any, for the new policy. mpol_new() has already validated the nodes + * parameter with respect to the policy mode and flags. But, we need to + * handle an empty nodemask with MPOL_PREFERRED here. + * + * Must be called holding task's alloc_lock to protect task's mems_allowed + * and mempolicy. May also be called holding the mmap_semaphore for write. + */ +static int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc) +{ + int ret; + + /* if mode is MPOL_DEFAULT, pol is NULL. This is right. 
*/ + if (pol == NULL) + return 0; + /* Check N_MEMORY */ + nodes_and(nsc->mask1, + cpuset_current_mems_allowed, node_states[N_MEMORY]); + + VM_BUG_ON(!nodes); + if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) + nodes = NULL; /* explicit local allocation */ + else { + if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); + else + nodes_and(nsc->mask2, *nodes, nsc->mask1); + + if (mpol_store_user_nodemask(pol)) + pol->w.user_nodemask = *nodes; + else + pol->w.cpuset_mems_allowed = + cpuset_current_mems_allowed; + } + + if (nodes) + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + else + ret = mpol_ops[pol->mode].create(pol, NULL); + return ret; +} + +/* + * This function just creates a new policy, does some check and simple + * initialization. You must invoke mpol_set_nodemask() to set nodes. + */ +static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, + nodemask_t *nodes) +{ + struct mempolicy *policy; + + pr_debug("setting mode %d flags %d nodes[0] %lx\n", + mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); + + if (mode == MPOL_DEFAULT) { + if (nodes && !nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + return NULL; + } + VM_BUG_ON(!nodes); + + /* + * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or + * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). + * All other modes require a valid pointer to a non-empty nodemask. 
+ */ + if (mode == MPOL_PREFERRED) { + if (nodes_empty(*nodes)) { + if (((flags & MPOL_F_STATIC_NODES) || + (flags & MPOL_F_RELATIVE_NODES))) + return ERR_PTR(-EINVAL); + } + } else if (mode == MPOL_LOCAL) { + if (!nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + mode = MPOL_PREFERRED; + } else if (nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!policy) + return ERR_PTR(-ENOMEM); + atomic_set(&policy->refcnt, 1); + policy->mode = mode; + policy->flags = flags; + + return policy; +} + +/* Slow path of a mpol destructor. */ +void __mpol_put(struct mempolicy *p) +{ + if (!atomic_dec_and_test(&p->refcnt)) + return; + kmem_cache_free(policy_cache, p); +} + +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) +{ +} + +/* + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) +{ + nodemask_t tmp; + + if (pol->flags & MPOL_F_STATIC_NODES) + nodes_and(tmp, pol->w.user_nodemask, *nodes); + else if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); + else { + /* + * if step == 1, we use ->w.cpuset_mems_allowed to cache the + * result + */ + if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { + nodes_remap(tmp, pol->v.nodes, + pol->w.cpuset_mems_allowed, *nodes); + pol->w.cpuset_mems_allowed = step ? 
tmp : *nodes; + } else if (step == MPOL_REBIND_STEP2) { + tmp = pol->w.cpuset_mems_allowed; + pol->w.cpuset_mems_allowed = *nodes; + } else + BUG(); + } + + if (nodes_empty(tmp)) + tmp = *nodes; + + if (step == MPOL_REBIND_STEP1) + nodes_or(pol->v.nodes, pol->v.nodes, tmp); + else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) + pol->v.nodes = tmp; + else + BUG(); + + if (!node_isset(current->il_next, tmp)) { + current->il_next = next_node(current->il_next, tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = first_node(tmp); + if (current->il_next >= MAX_NUMNODES) + current->il_next = numa_node_id(); + } +} + +static void mpol_rebind_preferred(struct mempolicy *pol, + const nodemask_t *nodes, + enum mpol_rebind_step step) +{ + nodemask_t tmp; + + if (pol->flags & MPOL_F_STATIC_NODES) { + int node = first_node(pol->w.user_nodemask); + + if (node_isset(node, *nodes)) { + pol->v.preferred_node = node; + pol->flags &= ~MPOL_F_LOCAL; + } else + pol->flags |= MPOL_F_LOCAL; + } else if (pol->flags & MPOL_F_RELATIVE_NODES) { + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); + pol->v.preferred_node = first_node(tmp); + } else if (!(pol->flags & MPOL_F_LOCAL)) { + pol->v.preferred_node = node_remap(pol->v.preferred_node, + pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = *nodes; + } +} + +/* + * mpol_rebind_policy - Migrate a policy to a different set of nodes + * + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. 
+ * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, + enum mpol_rebind_step step) +{ + if (!pol) + return; + if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && + nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) + return; + + if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) + return; + + if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) + BUG(); + + if (step == MPOL_REBIND_STEP1) + pol->flags |= MPOL_F_REBINDING; + else if (step == MPOL_REBIND_STEP2) + pol->flags &= ~MPOL_F_REBINDING; + else if (step >= MPOL_REBIND_NSTEP) + BUG(); + + mpol_ops[pol->mode].rebind(pol, newmask, step); +} + +/* + * Wrapper for mpol_rebind_policy() that just requires task + * pointer, and updates task mempolicy. + * + * Called with task's alloc_lock held. + */ + +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, + enum mpol_rebind_step step) +{ + mpol_rebind_policy(tsk->mempolicy, new, step); +} + +/* + * Rebind each vma in mm to new nodemask. + * + * Call holding a reference to mm. Takes mm->mmap_sem during call. 
+ */ + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); + up_write(&mm->mmap_sem); +} + +static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { + [MPOL_DEFAULT] = { + .rebind = mpol_rebind_default, + }, + [MPOL_INTERLEAVE] = { + .create = mpol_new_interleave, + .rebind = mpol_rebind_nodemask, + }, + [MPOL_PREFERRED] = { + .create = mpol_new_preferred, + .rebind = mpol_rebind_preferred, + }, + [MPOL_BIND] = { + .create = mpol_new_bind, + .rebind = mpol_rebind_nodemask, + }, +}; + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags); + +struct queue_pages { + struct list_head *pagelist; + unsigned long flags; + nodemask_t *nmask; + struct vm_area_struct *prev; +}; + +/* + * Scan through pages checking if pages follow certain conditions, + * and move them to the pagelist if they do. + */ +static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct page *page; + struct queue_pages *qp = walk->private; + unsigned long flags = qp->flags; + int nid; + pte_t *pte; + spinlock_t *ptl; + + split_huge_page_pmd(vma, addr, pmd); + if (pmd_trans_unstable(pmd)) + return 0; + + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { + if (!pte_present(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + if (!page) + continue; + /* + * vm_normal_page() filters out zero pages, but there might + * still be PageReserved pages to skip, perhaps in a VDSO. 
+ */ + if (PageReserved(page)) + continue; + nid = page_to_nid(page); + if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) + continue; + + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(page, qp->pagelist, flags); + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + +static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ +#ifdef CONFIG_HUGETLB_PAGE + struct queue_pages *qp = walk->private; + unsigned long flags = qp->flags; + int nid; + struct page *page; + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto unlock; + page = pte_page(entry); + nid = page_to_nid(page); + if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) + goto unlock; + /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ + if (flags & (MPOL_MF_MOVE_ALL) || + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) + isolate_huge_page(page, qp->pagelist); +unlock: + spin_unlock(ptl); +#else + BUG(); +#endif + return 0; +} + +#ifdef CONFIG_NUMA_BALANCING +/* + * This is used to mark a range of virtual addresses to be inaccessible. + * These are later cleared by a NUMA hinting fault. Depending on these + * faults, pages may be migrated for better NUMA placement. + * + * This is assuming that NUMA faults are handled using PROT_NONE. If + * an architecture makes a different choice, it will need further + * changes to the core. 
+ */ +unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + int nr_updated; + + nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); + if (nr_updated) + count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + + return nr_updated; +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ + +static int queue_pages_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct queue_pages *qp = walk->private; + unsigned long endvma = vma->vm_end; + unsigned long flags = qp->flags; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + + if (!(flags & MPOL_MF_DISCONTIG_OK)) { + if (!vma->vm_next && vma->vm_end < end) + return -EFAULT; + if (qp->prev && qp->prev->vm_end < vma->vm_start) + return -EFAULT; + } + + qp->prev = vma; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + if (flags & MPOL_MF_LAZY) { + /* Similar to task_numa_work, skip inaccessible VMAs */ + if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + change_prot_numa(vma, start, endvma); + return 1; + } + + if ((flags & MPOL_MF_STRICT) || + ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && + vma_migratable(vma))) + /* queue pages from current vma */ + return 0; + return 1; +} + +/* + * Walk through page tables and collect pages to be migrated. + * + * If pages found in a given range are on a set of nodes (determined by + * @nodes and @flags,) it's isolated and queued to the pagelist which is + * passed via @private.) 
+ */ +static int +queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, + nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) +{ + struct queue_pages qp = { + .pagelist = pagelist, + .flags = flags, + .nmask = nodes, + .prev = NULL, + }; + struct mm_walk queue_pages_walk = { + .hugetlb_entry = queue_pages_hugetlb, + .pmd_entry = queue_pages_pte_range, + .test_walk = queue_pages_test_walk, + .mm = mm, + .private = &qp, + }; + + return walk_page_range(start, end, &queue_pages_walk); +} + +/* + * Apply policy to a single VMA + * This must be called with the mmap_sem held for writing. + */ +static int vma_replace_policy(struct vm_area_struct *vma, + struct mempolicy *pol) +{ + int err; + struct mempolicy *old; + struct mempolicy *new; + + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? vma->vm_ops->set_policy : NULL); + + new = mpol_dup(pol); + if (IS_ERR(new)) + return PTR_ERR(new); + + if (vma->vm_ops && vma->vm_ops->set_policy) { + err = vma->vm_ops->set_policy(vma, new); + if (err) + goto err_out; + } + + old = vma->vm_policy; + vma->vm_policy = new; /* protected by mmap_sem */ + mpol_put(old); + + return 0; + err_out: + mpol_put(new); + return err; +} + +/* Step 2: apply policy to a range and do splits. 
*/ +static int mbind_range(struct mm_struct *mm, unsigned long start, + unsigned long end, struct mempolicy *new_pol) +{ + struct vm_area_struct *next; + struct vm_area_struct *prev; + struct vm_area_struct *vma; + int err = 0; + pgoff_t pgoff; + unsigned long vmstart; + unsigned long vmend; + + vma = find_vma(mm, start); + if (!vma || vma->vm_start > start) + return -EFAULT; + + prev = vma->vm_prev; + if (start > vma->vm_start) + prev = vma; + + for (; vma && vma->vm_start < end; prev = vma, vma = next) { + next = vma->vm_next; + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + + if (mpol_equal(vma_policy(vma), new_pol)) + continue; + + pgoff = vma->vm_pgoff + + ((vmstart - vma->vm_start) >> PAGE_SHIFT); + prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, + new_pol); + if (prev) { + vma = prev; + next = vma->vm_next; + if (mpol_equal(vma_policy(vma), new_pol)) + continue; + /* vma_merge() joined vma && vma->next, case 8 */ + goto replace; + } + if (vma->vm_start != vmstart) { + err = split_vma(vma->vm_mm, vma, vmstart, 1); + if (err) + goto out; + } + if (vma->vm_end != vmend) { + err = split_vma(vma->vm_mm, vma, vmend, 0); + if (err) + goto out; + } + replace: + err = vma_replace_policy(vma, new_pol); + if (err) + goto out; + } + + out: + return err; +} + +/* Set the process memory policy */ +static long do_set_mempolicy(unsigned short mode, unsigned short flags, + nodemask_t *nodes) +{ + struct mempolicy *new, *old; + NODEMASK_SCRATCH(scratch); + int ret; + + if (!scratch) + return -ENOMEM; + + new = mpol_new(mode, flags, nodes); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto out; + } + + task_lock(current); + ret = mpol_set_nodemask(new, nodes, scratch); + if (ret) { + task_unlock(current); + mpol_put(new); + goto out; + } + old = current->mempolicy; + current->mempolicy = new; + if (new && new->mode == MPOL_INTERLEAVE && + nodes_weight(new->v.nodes)) + current->il_next = 
first_node(new->v.nodes); + task_unlock(current); + mpol_put(old); + ret = 0; +out: + NODEMASK_SCRATCH_FREE(scratch); + return ret; +} + +/* + * Return nodemask for policy for get_mempolicy() query + * + * Called with task's alloc_lock held + */ +static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) +{ + nodes_clear(*nodes); + if (p == &default_policy) + return; + + switch (p->mode) { + case MPOL_BIND: + /* Fall through */ + case MPOL_INTERLEAVE: + *nodes = p->v.nodes; + break; + case MPOL_PREFERRED: + if (!(p->flags & MPOL_F_LOCAL)) + node_set(p->v.preferred_node, *nodes); + /* else return empty node mask for local allocation */ + break; + default: + BUG(); + } +} + +static int lookup_node(struct mm_struct *mm, unsigned long addr) +{ + struct page *p; + int err; + + err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); + if (err >= 0) { + err = page_to_nid(p); + put_page(p); + } + return err; +} + +/* Retrieve NUMA policy */ +static long do_get_mempolicy(int *policy, nodemask_t *nmask, + unsigned long addr, unsigned long flags) +{ + int err; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct mempolicy *pol = current->mempolicy; + + if (flags & + ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) + return -EINVAL; + + if (flags & MPOL_F_MEMS_ALLOWED) { + if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) + return -EINVAL; + *policy = 0; /* just so it's initialized */ + task_lock(current); + *nmask = cpuset_current_mems_allowed; + task_unlock(current); + return 0; + } + + if (flags & MPOL_F_ADDR) { + /* + * Do NOT fall back to task policy if the + * vma/shared policy at addr is NULL. We + * want to return MPOL_DEFAULT in this case. 
+ */ + down_read(&mm->mmap_sem); + vma = find_vma_intersection(mm, addr, addr+1); + if (!vma) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + if (vma->vm_ops && vma->vm_ops->get_policy) + pol = vma->vm_ops->get_policy(vma, addr); + else + pol = vma->vm_policy; + } else if (addr) + return -EINVAL; + + if (!pol) + pol = &default_policy; /* indicates default behavior */ + + if (flags & MPOL_F_NODE) { + if (flags & MPOL_F_ADDR) { + err = lookup_node(mm, addr); + if (err < 0) + goto out; + *policy = err; + } else if (pol == current->mempolicy && + pol->mode == MPOL_INTERLEAVE) { + *policy = current->il_next; + } else { + err = -EINVAL; + goto out; + } + } else { + *policy = pol == &default_policy ? MPOL_DEFAULT : + pol->mode; + /* + * Internal mempolicy flags must be masked off before exposing + * the policy to userspace. + */ + *policy |= (pol->flags & MPOL_MODE_FLAGS); + } + + if (vma) { + up_read(¤t->mm->mmap_sem); + vma = NULL; + } + + err = 0; + if (nmask) { + if (mpol_store_user_nodemask(pol)) { + *nmask = pol->w.user_nodemask; + } else { + task_lock(current); + get_policy_nodemask(pol, nmask); + task_unlock(current); + } + } + + out: + mpol_cond_put(pol); + if (vma) + up_read(¤t->mm->mmap_sem); + return err; +} + +#ifdef CONFIG_MIGRATION +/* + * page migration + */ +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ + /* + * Avoid migrating a page that is shared with others. 
+ */ + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { + if (!isolate_lru_page(page)) { + list_add_tail(&page->lru, pagelist); + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + } + } +} + +static struct page *new_node_page(struct page *page, unsigned long node, int **x) +{ + if (PageHuge(page)) + return alloc_huge_page_node(page_hstate(compound_head(page)), + node); + else + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE | + __GFP_THISNODE, 0); +} + +/* + * Migrate pages from one node to a target node. + * Returns error or the number of pages not migrated. + */ +static int migrate_to_node(struct mm_struct *mm, int source, int dest, + int flags) +{ + nodemask_t nmask; + LIST_HEAD(pagelist); + int err = 0; + + nodes_clear(nmask); + node_set(source, nmask); + + /* + * This does not "check" the range but isolates all pages that + * need migration. Between passing in the full user address + * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + */ + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + + if (!list_empty(&pagelist)) { + err = migrate_pages(&pagelist, new_node_page, NULL, dest, + MIGRATE_SYNC, MR_SYSCALL); + if (err) + putback_movable_pages(&pagelist); + } + + return err; +} + +/* + * Move pages between the two nodesets so as to preserve the physical + * layout as much as possible. + * + * Returns the number of page that could not be moved. + */ +int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, int flags) +{ + int busy = 0; + int err; + nodemask_t tmp; + + err = migrate_prep(); + if (err) + return err; + + down_read(&mm->mmap_sem); + + /* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that pair for migration. 
+ * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scanning from_tmp, we at least have the + * most recent pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ + + tmp = *from; + while (!nodes_empty(tmp)) { + int s,d; + int source = NUMA_NO_NODE; + int dest = 0; + + for_each_node_mask(s, tmp) { + + /* + * do_migrate_pages() tries to maintain the relative + * node relationship of the pages established between + * threads and memory areas. + * + * However if the number of source nodes is not equal to + * the number of destination nodes we can not preserve + * this node relative relationship. In that case, skip + * copying memory from a node that is in the destination + * mask. + * + * Example: [2,3,4] -> [3,4,5] moves everything. + * [0-7] - > [3,4,5] moves only 0,1,2,6,7. 
+ */ + + if ((nodes_weight(*from) != nodes_weight(*to)) && + (node_isset(s, *to))) + continue; + + d = node_remap(s, *from, *to); + if (s == d) + continue; + + source = s; /* Node moved. Memorize */ + dest = d; + + /* dest not in remaining from nodes? */ + if (!node_isset(dest, tmp)) + break; + } + if (source == NUMA_NO_NODE) + break; + + node_clear(source, tmp); + err = migrate_to_node(mm, source, dest, flags); + if (err > 0) + busy += err; + if (err < 0) + break; + } + up_read(&mm->mmap_sem); + if (err < 0) + return err; + return busy; + +} + +/* + * Allocate a new page for page migration based on vma policy. + * Start by assuming the page is mapped by the same vma as contains @start. + * Search forward from there, if not. N.B., this assumes that the + * list of pages handed to migrate_pages()--which is how we get here-- + * is in virtual address order. + */ +static struct page *new_page(struct page *page, unsigned long start, int **x) +{ + struct vm_area_struct *vma; + unsigned long uninitialized_var(address); + + vma = find_vma(current->mm, start); + while (vma) { + address = page_address_in_vma(page, vma); + if (address != -EFAULT) + break; + vma = vma->vm_next; + } + + if (PageHuge(page)) { + BUG_ON(!vma); + return alloc_huge_page_noerr(vma, address, 1); + } + /* + * if !vma, alloc_page_vma() will use task or system default policy + */ + return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); +} +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to, int flags) +{ + return -ENOSYS; +} + +static struct page *new_page(struct page *page, unsigned long start, int **x) +{ + return NULL; +} +#endif + +static long do_mbind(unsigned long start, unsigned long len, + unsigned short mode, unsigned short mode_flags, + nodemask_t *nmask, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct mempolicy 
*new; + unsigned long end; + int err; + LIST_HEAD(pagelist); + + if (flags & ~(unsigned long)MPOL_MF_VALID) + return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + if (start & ~PAGE_MASK) + return -EINVAL; + + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + new = mpol_new(mode, mode_flags, nmask); + if (IS_ERR(new)) + return PTR_ERR(new); + + if (flags & MPOL_MF_LAZY) + new->flags |= MPOL_F_MOF; + + /* + * If we are using the default policy then operation + * on discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + + pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", + start, start + len, mode, mode_flags, + nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); + + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + + err = migrate_prep(); + if (err) + goto mpol_out; + } + { + NODEMASK_SCRATCH(scratch); + if (scratch) { + down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask, scratch); + task_unlock(current); + if (err) + up_write(&mm->mmap_sem); + } else + err = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + } + if (err) + goto mpol_out; + + err = queue_pages_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + if (!err) + err = mbind_range(mm, start, end, new); + + if (!err) { + int nr_failed = 0; + + if (!list_empty(&pagelist)) { + WARN_ON_ONCE(flags & MPOL_MF_LAZY); + nr_failed = migrate_pages(&pagelist, new_page, NULL, + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + if (nr_failed) + putback_movable_pages(&pagelist); + } + + if (nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; + } else + putback_movable_pages(&pagelist); + + up_write(&mm->mmap_sem); + mpol_out: + mpol_put(new); + return err; +} + +/* + * User space interface with variable sized bitmaps for nodelists. 
+ */ + +/* Copy a node mask from user space. */ +static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long k; + unsigned long nlongs; + unsigned long endmask; + + --maxnode; + nodes_clear(*nodes); + if (maxnode == 0 || !nmask) + return 0; + if (maxnode > PAGE_SIZE*BITS_PER_BYTE) + return -EINVAL; + + nlongs = BITS_TO_LONGS(maxnode); + if ((maxnode % BITS_PER_LONG) == 0) + endmask = ~0UL; + else + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + + /* When the user specified more nodes than supported just check + if the non supported part is all zero. */ + if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { + if (nlongs > PAGE_SIZE/sizeof(long)) + return -EINVAL; + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { + unsigned long t; + if (get_user(t, nmask + k)) + return -EFAULT; + if (k == nlongs - 1) { + if (t & endmask) + return -EINVAL; + } else if (t) + return -EINVAL; + } + nlongs = BITS_TO_LONGS(MAX_NUMNODES); + endmask = ~0UL; + } + + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) + return -EFAULT; + nodes_addr(*nodes)[nlongs-1] &= endmask; + return 0; +} + +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, + nodemask_t *nodes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + + if (copy > nbytes) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char __user *)mask + nbytes, copy - nbytes)) + return -EFAULT; + copy = nbytes; + } + return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; +} + +SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, + unsigned long, mode, const unsigned long __user *, nmask, + unsigned long, maxnode, unsigned, flags) +{ + nodemask_t nodes; + int err; + unsigned short mode_flags; + + mode_flags = mode & MPOL_MODE_FLAGS; + mode &= ~MPOL_MODE_FLAGS; + if (mode >= MPOL_MAX) + return -EINVAL; + if ((mode_flags & MPOL_F_STATIC_NODES) && + (mode_flags & MPOL_F_RELATIVE_NODES)) + return -EINVAL; + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_mbind(start, len, mode, mode_flags, &nodes, flags); +} + +/* Set the process memory policy */ +SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, + unsigned long, maxnode) +{ + int err; + nodemask_t nodes; + unsigned short flags; + + flags = mode & MPOL_MODE_FLAGS; + mode &= ~MPOL_MODE_FLAGS; + if ((unsigned int)mode >= MPOL_MAX) + return -EINVAL; + if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) + return -EINVAL; + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_set_mempolicy(mode, flags, &nodes); +} + +SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, + const unsigned long __user *, old_nodes, + const unsigned long __user *, new_nodes) +{ + const struct cred *cred = current_cred(), *tcred; + struct mm_struct *mm = NULL; + struct task_struct *task; + nodemask_t task_nodes; + int err; + nodemask_t *old; + nodemask_t *new; + NODEMASK_SCRATCH(scratch); + + if (!scratch) + return -ENOMEM; + + old = &scratch->mask1; + new = &scratch->mask2; + + err = get_nodes(old, old_nodes, maxnode); + if (err) + goto out; + + err = get_nodes(new, new_nodes, maxnode); + if (err) + goto out; + + /* Find the mm_struct */ + rcu_read_lock(); + task = pid ? 
find_task_by_vpid(pid) : current; + if (!task) { + rcu_read_unlock(); + err = -ESRCH; + goto out; + } + get_task_struct(task); + + err = -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. + */ + tcred = __task_cred(task); + if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && + !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + err = -EPERM; + goto out_put; + } + rcu_read_unlock(); + + task_nodes = cpuset_mems_allowed(task); + /* Is the user allowed to access the target nodes? */ + if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out_put; + } + + if (!nodes_subset(*new, node_states[N_MEMORY])) { + err = -EINVAL; + goto out_put; + } + + err = security_task_movememory(task); + if (err) + goto out_put; + + mm = get_task_mm(task); + put_task_struct(task); + + if (!mm) { + err = -EINVAL; + goto out; + } + + err = do_migrate_pages(mm, old, new, + capable(CAP_SYS_NICE) ? 
MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); + + mmput(mm); +out: + NODEMASK_SCRATCH_FREE(scratch); + + return err; + +out_put: + put_task_struct(task); + goto out; + +} + + +/* Retrieve NUMA policy */ +SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + unsigned long __user *, nmask, unsigned long, maxnode, + unsigned long, addr, unsigned long, flags) +{ + int err; + int uninitialized_var(pval); + nodemask_t nodes; + + if (nmask != NULL && maxnode < MAX_NUMNODES) + return -EINVAL; + + err = do_get_mempolicy(&pval, &nodes, addr, flags); + + if (err) + return err; + + if (policy && put_user(pval, policy)) + return -EFAULT; + + if (nmask) + err = copy_nodes_to_user(nmask, maxnode, &nodes); + + return err; +} + +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, + compat_ulong_t, addr, compat_ulong_t, flags) +{ + long err; + unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + DECLARE_BITMAP(bm, MAX_NUMNODES); + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + + if (nmask) + nm = compat_alloc_user_space(alloc_size); + + err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); + + if (!err && nmask) { + unsigned long copy_size; + copy_size = min_t(unsigned long, sizeof(bm), alloc_size); + err = copy_from_user(bm, nm, copy_size); + /* ensure entire bitmap is zeroed */ + err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); + err |= compat_put_bitmap(nmask, bm, nr_bits); + } + + return err; +} + +COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode) +{ + long err = 0; + unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + DECLARE_BITMAP(bm, MAX_NUMNODES); + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + + if (nmask) { + err = compat_get_bitmap(bm, nmask, nr_bits); + nm = 
compat_alloc_user_space(alloc_size); + err |= copy_to_user(nm, bm, alloc_size); + } + + if (err) + return -EFAULT; + + return sys_set_mempolicy(mode, nm, nr_bits+1); +} + +COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, + compat_ulong_t, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, compat_ulong_t, flags) +{ + long err = 0; + unsigned long __user *nm = NULL; + unsigned long nr_bits, alloc_size; + nodemask_t bm; + + nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; + + if (nmask) { + err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); + nm = compat_alloc_user_space(alloc_size); + err |= copy_to_user(nm, nodes_addr(bm), alloc_size); + } + + if (err) + return -EFAULT; + + return sys_mbind(start, len, mode, nm, nr_bits+1, flags); +} + +#endif + +struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct mempolicy *pol = NULL; + + if (vma) { + if (vma->vm_ops && vma->vm_ops->get_policy) { + pol = vma->vm_ops->get_policy(vma, addr); + } else if (vma->vm_policy) { + pol = vma->vm_policy; + + /* + * shmem_alloc_page() passes MPOL_F_SHARED policy with + * a pseudo vma whose vma->vm_ops=NULL. Take a reference + * count on these policies which will be dropped by + * mpol_cond_put() later + */ + if (mpol_needs_cond_ref(pol)) + mpol_get(pol); + } + } + + return pol; +} + +/* + * get_vma_policy(@vma, @addr) + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup + * + * Returns effective policy for a VMA at specified address. + * Falls back to current->mempolicy or system default policy, as necessary. + * Shared policies [those marked as MPOL_F_SHARED] require an extra reference + * count--added by the get_policy() vm_op, as appropriate--to protect against + * freeing by another task. It is the caller's responsibility to free the + * extra reference for shared policies. 
+ */ +static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct mempolicy *pol = __get_vma_policy(vma, addr); + + if (!pol) + pol = get_task_policy(current); + + return pol; +} + +bool vma_policy_mof(struct vm_area_struct *vma) +{ + struct mempolicy *pol; + + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; + + pol = vma->vm_ops->get_policy(vma, vma->vm_start); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); + + return ret; + } + + pol = vma->vm_policy; + if (!pol) + pol = get_task_policy(current); + + return pol->flags & MPOL_F_MOF; +} + +static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) +{ + enum zone_type dynamic_policy_zone = policy_zone; + + BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); + + /* + * if policy->v.nodes has movable memory only, + * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. + * + * policy->v.nodes is intersect with node_states[N_MEMORY]. + * so if the following test faile, it implies + * policy->v.nodes has movable memory only. 
+ */ + if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) + dynamic_policy_zone = ZONE_MOVABLE; + + return zone >= dynamic_policy_zone; +} + +/* + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation + */ +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) +{ + /* Lower zones don't get a nodemask applied for MPOL_BIND */ + if (unlikely(policy->mode == MPOL_BIND) && + apply_policy_zone(policy, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) + return &policy->v.nodes; + + return NULL; +} + +/* Return a zonelist indicated by gfp for node representing a mempolicy */ +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, + int nd) +{ + switch (policy->mode) { + case MPOL_PREFERRED: + if (!(policy->flags & MPOL_F_LOCAL)) + nd = policy->v.preferred_node; + break; + case MPOL_BIND: + /* + * Normally, MPOL_BIND allocations are node-local within the + * allowed nodemask. However, if __GFP_THISNODE is set and the + * current node isn't part of the mask, we use the zonelist for + * the first node in the mask instead. + */ + if (unlikely(gfp & __GFP_THISNODE) && + unlikely(!node_isset(nd, policy->v.nodes))) + nd = first_node(policy->v.nodes); + break; + default: + BUG(); + } + return node_zonelist(nd, gfp); +} + +/* Do dynamic interleaving for a process */ +static unsigned interleave_nodes(struct mempolicy *policy) +{ + unsigned nid, next; + struct task_struct *me = current; + + nid = me->il_next; + next = next_node(nid, policy->v.nodes); + if (next >= MAX_NUMNODES) + next = first_node(policy->v.nodes); + if (next < MAX_NUMNODES) + me->il_next = next; + return nid; +} + +/* + * Depending on the memory policy provide a node from which to allocate the + * next slab entry. 
+ */ +unsigned int mempolicy_slab_node(void) +{ + struct mempolicy *policy; + int node = numa_mem_id(); + + if (in_interrupt()) + return node; + + policy = current->mempolicy; + if (!policy || policy->flags & MPOL_F_LOCAL) + return node; + + switch (policy->mode) { + case MPOL_PREFERRED: + /* + * handled MPOL_F_LOCAL above + */ + return policy->v.preferred_node; + + case MPOL_INTERLEAVE: + return interleave_nodes(policy); + + case MPOL_BIND: { + /* + * Follow bind policy behavior and start allocation at the + * first node. + */ + struct zonelist *zonelist; + struct zone *zone; + enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); + zonelist = &NODE_DATA(node)->node_zonelists[0]; + (void)first_zones_zonelist(zonelist, highest_zoneidx, + &policy->v.nodes, + &zone); + return zone ? zone->node : node; + } + + default: + BUG(); + } +} + +/* Do static interleaving for a VMA with known offset. */ +static unsigned offset_il_node(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long off) +{ + unsigned nnodes = nodes_weight(pol->v.nodes); + unsigned target; + int c; + int nid = NUMA_NO_NODE; + + if (!nnodes) + return numa_node_id(); + target = (unsigned int)off % nnodes; + c = 0; + do { + nid = next_node(nid, pol->v.nodes); + c++; + } while (c <= target); + return nid; +} + +/* Determine a node number for interleave */ +static inline unsigned interleave_nid(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long addr, int shift) +{ + if (vma) { + unsigned long off; + + /* + * for small pages, there is no difference between + * shift and PAGE_SHIFT, so the bit-shift is safe. + * for huge pages, since vm_pgoff is in units of small + * pages, we need to shift off the always 0 bits to get + * a useful offset. 
+ */ + BUG_ON(shift < PAGE_SHIFT); + off = vma->vm_pgoff >> (shift - PAGE_SHIFT); + off += (addr - vma->vm_start) >> shift; + return offset_il_node(pol, vma, off); + } else + return interleave_nodes(pol); +} + +/* + * Return the bit number of a random bit set in the nodemask. + * (returns NUMA_NO_NODE if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ + int w, bit = NUMA_NO_NODE; + + w = nodes_weight(*maskp); + if (w) + bit = bitmap_ord_to_pos(maskp->bits, + get_random_int() % w, MAX_NUMNODES); + return bit; +} + +#ifdef CONFIG_HUGETLBFS +/* + * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup and interleave policy + * @gfp_flags: for requested zone + * @mpol: pointer to mempolicy pointer for reference counted mempolicy + * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask + * + * Returns a zonelist suitable for a huge page allocation and a pointer + * to the struct mempolicy for conditional unref after allocation. + * If the effective policy is 'BIND, returns a pointer to the mempolicy's + * @nodemask for filtering the zonelist. + * + * Must be protected by read_mems_allowed_begin() + */ +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, + gfp_t gfp_flags, struct mempolicy **mpol, + nodemask_t **nodemask) +{ + struct zonelist *zl; + + *mpol = get_vma_policy(vma, addr); + *nodemask = NULL; /* assume !MPOL_BIND */ + + if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { + zl = node_zonelist(interleave_nid(*mpol, vma, addr, + huge_page_shift(hstate_vma(vma))), gfp_flags); + } else { + zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); + if ((*mpol)->mode == MPOL_BIND) + *nodemask = &(*mpol)->v.nodes; + } + return zl; +} + +/* + * init_nodemask_of_mempolicy + * + * If the current task's mempolicy is "default" [NULL], return 'false' + * to indicate default policy. 
Otherwise, extract the policy nodemask + * for 'bind' or 'interleave' policy into the argument nodemask, or + * initialize the argument nodemask to contain the single node for + * 'preferred' or 'local' policy and return 'true' to indicate presence + * of non-default mempolicy. + * + * We don't bother with reference counting the mempolicy [mpol_get/put] + * because the current task is examining it's own mempolicy and a task's + * mempolicy is only ever changed by the task itself. + * + * N.B., it is the caller's responsibility to free a returned nodemask. + */ +bool init_nodemask_of_mempolicy(nodemask_t *mask) +{ + struct mempolicy *mempolicy; + int nid; + + if (!(mask && current->mempolicy)) + return false; + + task_lock(current); + mempolicy = current->mempolicy; + switch (mempolicy->mode) { + case MPOL_PREFERRED: + if (mempolicy->flags & MPOL_F_LOCAL) + nid = numa_node_id(); + else + nid = mempolicy->v.preferred_node; + init_nodemask_of_node(mask, nid); + break; + + case MPOL_BIND: + /* Fall through */ + case MPOL_INTERLEAVE: + *mask = mempolicy->v.nodes; + break; + + default: + BUG(); + } + task_unlock(current); + + return true; +} +#endif + +/* + * mempolicy_nodemask_intersects + * + * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default + * policy. Otherwise, check for intersection between mask and the policy + * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local' + * policy, always return true since it may allocate elsewhere on fallback. + * + * Takes task_lock(tsk) to prevent freeing of its mempolicy. 
+ */ +bool mempolicy_nodemask_intersects(struct task_struct *tsk, + const nodemask_t *mask) +{ + struct mempolicy *mempolicy; + bool ret = true; + + if (!mask) + return ret; + task_lock(tsk); + mempolicy = tsk->mempolicy; + if (!mempolicy) + goto out; + + switch (mempolicy->mode) { + case MPOL_PREFERRED: + /* + * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to + * allocate from, they may fallback to other nodes when oom. + * Thus, it's possible for tsk to have allocated memory from + * nodes in mask. + */ + break; + case MPOL_BIND: + case MPOL_INTERLEAVE: + ret = nodes_intersects(mempolicy->v.nodes, *mask); + break; + default: + BUG(); + } +out: + task_unlock(tsk); + return ret; +} + +/* Allocate a page in interleaved policy. + Own path because it needs to do special accounting. */ +static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, + unsigned nid) +{ + struct zonelist *zl; + struct page *page; + + zl = node_zonelist(nid, gfp); + page = __alloc_pages(gfp, order, zl); + if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) + inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); + return page; +} + +/** + * alloc_pages_vma - Allocate a page for a VMA. + * + * @gfp: + * %GFP_USER user allocation. + * %GFP_KERNEL kernel allocations, + * %GFP_HIGHMEM highmem/user allocations, + * %GFP_FS allocation should not call back into a file system. + * %GFP_ATOMIC don't sleep. + * + * @order:Order of the GFP allocation. + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual Address of the allocation. Must be inside the VMA. + * @node: Which node to prefer for allocation (modulo policy). + * @hugepage: for hugepages try only the preferred node if possible + * + * This function allocates a page from the kernel page pool and applies + * a NUMA policy associated with the VMA or the current process. + * When VMA is not NULL caller must hold down_read on the mmap_sem of the + * mm_struct of the VMA to prevent it from going away. 
Should be used for + * all allocations for pages that will be mapped into user space. Returns + * NULL when no page can be allocated. + */ +struct page * +alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr, int node, bool hugepage) +{ + struct mempolicy *pol; + struct page *page; + unsigned int cpuset_mems_cookie; + struct zonelist *zl; + nodemask_t *nmask; + +retry_cpuset: + pol = get_vma_policy(vma, addr); + cpuset_mems_cookie = read_mems_allowed_begin(); + + if (pol->mode == MPOL_INTERLEAVE) { + unsigned nid; + + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + mpol_cond_put(pol); + page = alloc_page_interleave(gfp, order, nid); + goto out; + } + + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { + int hpage_node = node; + + /* + * For hugepage allocation and non-interleave policy which + * allows the current node (or other explicitly preferred + * node) we only try to allocate from the current/preferred + * node and don't fall back to other nodes, as the cost of + * remote accesses would likely offset THP benefits. + * + * If the policy is interleave, or does not allow the current + * node in its nodemask, we allocate the standard way. + */ + if (pol->mode == MPOL_PREFERRED && + !(pol->flags & MPOL_F_LOCAL)) + hpage_node = pol->v.preferred_node; + + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(hpage_node, *nmask)) { + mpol_cond_put(pol); + page = alloc_pages_exact_node(hpage_node, + gfp | __GFP_THISNODE, order); + goto out; + } + } + + nmask = policy_nodemask(gfp, pol); + zl = policy_zonelist(gfp, pol, node); + mpol_cond_put(pol); + page = __alloc_pages_nodemask(gfp, order, zl, nmask); +out: + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return page; +} + +/** + * alloc_pages_current - Allocate pages. 
+ * + * @gfp: + * %GFP_USER user allocation, + * %GFP_KERNEL kernel allocation, + * %GFP_HIGHMEM highmem allocation, + * %GFP_FS don't call back into a file system. + * %GFP_ATOMIC don't sleep. + * @order: Power of two of allocation size in pages. 0 is a single page. + * + * Allocate a page from the kernel page pool. When not in + * interrupt context and apply the current process NUMA policy. + * Returns NULL when no page can be allocated. + * + * Don't call cpuset_update_task_memory_state() unless + * 1) it's ok to take cpuset_sem (can WAIT), and + * 2) allocating for current task (not interrupt). + */ +struct page *alloc_pages_current(gfp_t gfp, unsigned order) +{ + struct mempolicy *pol = &default_policy; + struct page *page; + unsigned int cpuset_mems_cookie; + + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); + +retry_cpuset: + cpuset_mems_cookie = read_mems_allowed_begin(); + + /* + * No reference counting needed for current->mempolicy + * nor system default_policy + */ + if (pol->mode == MPOL_INTERLEAVE) + page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else + page = __alloc_pages_nodemask(gfp, order, + policy_zonelist(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); + + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + + return page; +} +EXPORT_SYMBOL(alloc_pages_current); + +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + struct mempolicy *pol = mpol_dup(vma_policy(src)); + + if (IS_ERR(pol)) + return PTR_ERR(pol); + dst->vm_policy = pol; + return 0; +} + +/* + * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it + * rebinds the mempolicy its copying by calling mpol_rebind_policy() + * with the mems_allowed returned by cpuset_mems_allowed(). This + * keeps mempolicies cpuset relative after its cpuset moves. See + * further kernel/cpuset.c update_nodemask(). 
+ * + * current's mempolicy may be rebinded by the other task(the task that changes + * cpuset's mems), so we needn't do rebind work for current task. + */ + +/* Slow path of a mempolicy duplicate */ +struct mempolicy *__mpol_dup(struct mempolicy *old) +{ + struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + + if (!new) + return ERR_PTR(-ENOMEM); + + /* task's mempolicy is protected by alloc_lock */ + if (old == current->mempolicy) { + task_lock(current); + *new = *old; + task_unlock(current); + } else + *new = *old; + + if (current_cpuset_is_being_rebound()) { + nodemask_t mems = cpuset_mems_allowed(current); + if (new->flags & MPOL_F_REBINDING) + mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); + else + mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); + } + atomic_set(&new->refcnt, 1); + return new; +} + +/* Slow path of a mempolicy comparison */ +bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + if (!a || !b) + return false; + if (a->mode != b->mode) + return false; + if (a->flags != b->flags) + return false; + if (mpol_store_user_nodemask(a)) + if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) + return false; + + switch (a->mode) { + case MPOL_BIND: + /* Fall through */ + case MPOL_INTERLEAVE: + return !!nodes_equal(a->v.nodes, b->v.nodes); + case MPOL_PREFERRED: + return a->v.preferred_node == b->v.preferred_node; + default: + BUG(); + return false; + } +} + +/* + * Shared memory backing store policy support. + * + * Remember policies even when nobody has shared memory mapped. + * The policies are kept in Red-Black tree linked from the inode. + * They are protected by the sp->lock spinlock, which should be held + * for any accesses to the tree. 
+ */ + +/* lookup first element intersecting start-end */ +/* Caller holds sp->lock */ +static struct sp_node * +sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) +{ + struct rb_node *n = sp->root.rb_node; + + while (n) { + struct sp_node *p = rb_entry(n, struct sp_node, nd); + + if (start >= p->end) + n = n->rb_right; + else if (end <= p->start) + n = n->rb_left; + else + break; + } + if (!n) + return NULL; + for (;;) { + struct sp_node *w = NULL; + struct rb_node *prev = rb_prev(n); + if (!prev) + break; + w = rb_entry(prev, struct sp_node, nd); + if (w->end <= start) + break; + n = prev; + } + return rb_entry(n, struct sp_node, nd); +} + +/* Insert a new shared policy into the list. */ +/* Caller holds sp->lock */ +static void sp_insert(struct shared_policy *sp, struct sp_node *new) +{ + struct rb_node **p = &sp->root.rb_node; + struct rb_node *parent = NULL; + struct sp_node *nd; + + while (*p) { + parent = *p; + nd = rb_entry(parent, struct sp_node, nd); + if (new->start < nd->start) + p = &(*p)->rb_left; + else if (new->end > nd->end) + p = &(*p)->rb_right; + else + BUG(); + } + rb_link_node(&new->nd, parent, p); + rb_insert_color(&new->nd, &sp->root); + pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, + new->policy ? 
new->policy->mode : 0); +} + +/* Find shared policy intersecting idx */ +struct mempolicy * +mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +{ + struct mempolicy *pol = NULL; + struct sp_node *sn; + + if (!sp->root.rb_node) + return NULL; + spin_lock(&sp->lock); + sn = sp_lookup(sp, idx, idx+1); + if (sn) { + mpol_get(sn->policy); + pol = sn->policy; + } + spin_unlock(&sp->lock); + return pol; +} + +static void sp_free(struct sp_node *n) +{ + mpol_put(n->policy); + kmem_cache_free(sn_cache, n); +} + +/** + * mpol_misplaced - check whether current page node is valid in policy + * + * @page: page to be checked + * @vma: vm area where page mapped + * @addr: virtual address where page mapped + * + * Lookup current policy node id for vma,addr and "compare to" page's + * node id. + * + * Returns: + * -1 - not misplaced, page is in the right node + * node - node id where the page should be + * + * Policy determination "mimics" alloc_page_vma(). + * Called from fault path where we know the vma and faulting address. + */ +int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol; + struct zone *zone; + int curnid = page_to_nid(page); + unsigned long pgoff; + int thiscpu = raw_smp_processor_id(); + int thisnid = cpu_to_node(thiscpu); + int polnid = -1; + int ret = -1; + + BUG_ON(!vma); + + pol = get_vma_policy(vma, addr); + if (!(pol->flags & MPOL_F_MOF)) + goto out; + + switch (pol->mode) { + case MPOL_INTERLEAVE: + BUG_ON(addr >= vma->vm_end); + BUG_ON(addr < vma->vm_start); + + pgoff = vma->vm_pgoff; + pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + polnid = offset_il_node(pol, vma, pgoff); + break; + + case MPOL_PREFERRED: + if (pol->flags & MPOL_F_LOCAL) + polnid = numa_node_id(); + else + polnid = pol->v.preferred_node; + break; + + case MPOL_BIND: + /* + * allows binding to multiple nodes. + * use current page if in policy nodemask, + * else select nearest allowed node, if any. 
+ * If no allowed nodes, use current [!misplaced]. + */ + if (node_isset(curnid, pol->v.nodes)) + goto out; + (void)first_zones_zonelist( + node_zonelist(numa_node_id(), GFP_HIGHUSER), + gfp_zone(GFP_HIGHUSER), + &pol->v.nodes, &zone); + polnid = zone->node; + break; + + default: + BUG(); + } + + /* Migrate the page towards the node whose CPU is referencing it */ + if (pol->flags & MPOL_F_MORON) { + polnid = thisnid; + + if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) + goto out; + } + + if (curnid != polnid) + ret = polnid; +out: + mpol_cond_put(pol); + + return ret; +} + +static void sp_delete(struct shared_policy *sp, struct sp_node *n) +{ + pr_debug("deleting %lx-l%lx\n", n->start, n->end); + rb_erase(&n->nd, &sp->root); + sp_free(n); +} + +static void sp_node_init(struct sp_node *node, unsigned long start, + unsigned long end, struct mempolicy *pol) +{ + node->start = start; + node->end = end; + node->policy = pol; +} + +static struct sp_node *sp_alloc(unsigned long start, unsigned long end, + struct mempolicy *pol) +{ + struct sp_node *n; + struct mempolicy *newpol; + + n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + if (!n) + return NULL; + + newpol = mpol_dup(pol); + if (IS_ERR(newpol)) { + kmem_cache_free(sn_cache, n); + return NULL; + } + newpol->flags |= MPOL_F_SHARED; + sp_node_init(n, start, end, newpol); + + return n; +} + +/* Replace a policy range. */ +static int shared_policy_replace(struct shared_policy *sp, unsigned long start, + unsigned long end, struct sp_node *new) +{ + struct sp_node *n; + struct sp_node *n_new = NULL; + struct mempolicy *mpol_new = NULL; + int ret = 0; + +restart: + spin_lock(&sp->lock); + n = sp_lookup(sp, start, end); + /* Take care of old policies in the same range. */ + while (n && n->start < end) { + struct rb_node *next = rb_next(&n->nd); + if (n->start >= start) { + if (n->end <= end) + sp_delete(sp, n); + else + n->start = end; + } else { + /* Old policy spanning whole new range. 
*/ + if (n->end > end) { + if (!n_new) + goto alloc_new; + + *mpol_new = *n->policy; + atomic_set(&mpol_new->refcnt, 1); + sp_node_init(n_new, end, n->end, mpol_new); + n->end = start; + sp_insert(sp, n_new); + n_new = NULL; + mpol_new = NULL; + break; + } else + n->end = start; + } + if (!next) + break; + n = rb_entry(next, struct sp_node, nd); + } + if (new) + sp_insert(sp, new); + spin_unlock(&sp->lock); + ret = 0; + +err_out: + if (mpol_new) + mpol_put(mpol_new); + if (n_new) + kmem_cache_free(sn_cache, n_new); + + return ret; + +alloc_new: + spin_unlock(&sp->lock); + ret = -ENOMEM; + n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); + if (!n_new) + goto err_out; + mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!mpol_new) + goto err_out; + goto restart; +} + +/** + * mpol_shared_policy_init - initialize shared policy for inode + * @sp: pointer to inode shared policy + * @mpol: struct mempolicy to install + * + * Install non-NULL @mpol in inode's shared policy rb-tree. + * On entry, the current task has a reference on a non-NULL @mpol. + * This must be released on exit. + * This is called at get_inode() calls and we can use GFP_KERNEL. 
+ */ +void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) +{ + int ret; + + sp->root = RB_ROOT; /* empty tree == default mempolicy */ + spin_lock_init(&sp->lock); + + if (mpol) { + struct vm_area_struct pvma; + struct mempolicy *new; + NODEMASK_SCRATCH(scratch); + + if (!scratch) + goto put_mpol; + /* contextualize the tmpfs mount point mempolicy */ + new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); + if (IS_ERR(new)) + goto free_scratch; /* no valid nodemask intersection */ + + task_lock(current); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); + task_unlock(current); + if (ret) + goto put_new; + + /* Create pseudo-vma that contains just the policy */ + memset(&pvma, 0, sizeof(struct vm_area_struct)); + pvma.vm_end = TASK_SIZE; /* policy covers entire file */ + mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ + +put_new: + mpol_put(new); /* drop initial ref */ +free_scratch: + NODEMASK_SCRATCH_FREE(scratch); +put_mpol: + mpol_put(mpol); /* drop our incoming ref on sb mpol */ + } +} + +int mpol_set_shared_policy(struct shared_policy *info, + struct vm_area_struct *vma, struct mempolicy *npol) +{ + int err; + struct sp_node *new = NULL; + unsigned long sz = vma_pages(vma); + + pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", + vma->vm_pgoff, + sz, npol ? npol->mode : -1, + npol ? npol->flags : -1, + npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); + + if (npol) { + new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); + if (!new) + return -ENOMEM; + } + err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + if (err && new) + sp_free(new); + return err; +} + +/* Free a backing policy store on inode delete. 
*/
+void mpol_free_shared_policy(struct shared_policy *p)
+{
+	struct sp_node *n;
+	struct rb_node *next;
+
+	/* Empty tree: nothing was ever installed, no need to take the lock. */
+	if (!p->root.rb_node)
+		return;
+	spin_lock(&p->lock);
+	next = rb_first(&p->root);
+	while (next) {
+		n = rb_entry(next, struct sp_node, nd);
+		/* Fetch the successor first: sp_delete() frees @n. */
+		next = rb_next(&n->nd);
+		sp_delete(p, n);
+	}
+	spin_unlock(&p->lock);
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+/* 0: no numa_balancing= option seen, 1: "enable", -1: "disable" */
+static int __initdata numabalancing_override;
+
+/*
+ * Set the initial automatic NUMA balancing state.
+ *
+ * An explicit numa_balancing= override always wins.  Without one, and only
+ * when more than one node is online, the build-time default
+ * (CONFIG_NUMA_BALANCING_DEFAULT_ENABLED) is applied and logged.
+ */
+static void __init check_numabalancing_enable(void)
+{
+	bool numabalancing_default = false;
+
+	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
+		numabalancing_default = true;
+
+	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
+	if (numabalancing_override)
+		set_numabalancing_state(numabalancing_override == 1);
+
+	if (num_online_nodes() > 1 && !numabalancing_override) {
+		/* Terminate the record so later messages start on a new line. */
+		pr_info("%s automatic NUMA balancing. "
+			"Configure with numa_balancing= or the "
+			"kernel.numa_balancing sysctl\n",
+			numabalancing_default ? "Enabling" : "Disabling");
+		set_numabalancing_state(numabalancing_default);
+	}
+}
+
+/*
+ * Handler for the "numa_balancing=" boot parameter.
+ *
+ * Accepts exactly "enable" or "disable" and records the choice in
+ * numabalancing_override.  Returns 1 when the value was recognised and 0
+ * (after a warning) for a missing or unknown value.
+ */
+static int __init setup_numabalancing(char *str)
+{
+	int ret = 0;
+	if (!str)
+		goto out;
+
+	if (!strcmp(str, "enable")) {
+		numabalancing_override = 1;
+		ret = 1;
+	} else if (!strcmp(str, "disable")) {
+		numabalancing_override = -1;
+		ret = 1;
+	}
+out:
+	if (!ret)
+		pr_warn("Unable to parse numa_balancing=\n");
+
+	return ret;
+}
+__setup("numa_balancing=", setup_numabalancing);
+#else
+static inline void __init check_numabalancing_enable(void)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+/* assumes fs == KERNEL_DS */
+void __init numa_policy_init(void)
+{
+	nodemask_t interleave_nodes;
+	unsigned long largest = 0;
+	int nid, prefer = 0;
+
+	policy_cache = kmem_cache_create("numa_policy",
+					 sizeof(struct mempolicy),
+					 0, SLAB_PANIC, NULL);
+
+	sn_cache = kmem_cache_create("shared_policy_node",
+				     sizeof(struct sp_node),
+				     0, SLAB_PANIC, NULL);
+
+	for_each_node(nid) {
+		preferred_node_policy[nid] = (struct mempolicy) {
+			.refcnt = 
ATOMIC_INIT(1), + .mode = MPOL_PREFERRED, + .flags = MPOL_F_MOF | MPOL_F_MORON, + .v = { .preferred_node = nid, }, + }; + } + + /* + * Set interleaving policy for system init. Interleaving is only + * enabled across suitably sized nodes (default is >= 16MB), or + * fall back to the largest node if they're all smaller. + */ + nodes_clear(interleave_nodes); + for_each_node_state(nid, N_MEMORY) { + unsigned long total_pages = node_present_pages(nid); + + /* Preserve the largest node */ + if (largest < total_pages) { + largest = total_pages; + prefer = nid; + } + + /* Interleave this node? */ + if ((total_pages << PAGE_SHIFT) >= (16 << 20)) + node_set(nid, interleave_nodes); + } + + /* All too small, use the largest */ + if (unlikely(nodes_empty(interleave_nodes))) + node_set(prefer, interleave_nodes); + + if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) + pr_err("%s: interleaving failed\n", __func__); + + check_numabalancing_enable(); +} + +/* Reset policy of current process to default */ +void numa_default_policy(void) +{ + do_set_mempolicy(MPOL_DEFAULT, 0, NULL); +} + +/* + * Parse and format mempolicy from/to strings + */ + +/* + * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. + */ +static const char * const policy_modes[] = +{ + [MPOL_DEFAULT] = "default", + [MPOL_PREFERRED] = "prefer", + [MPOL_BIND] = "bind", + [MPOL_INTERLEAVE] = "interleave", + [MPOL_LOCAL] = "local", +}; + + +#ifdef CONFIG_TMPFS +/** + * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. + * @str: string containing mempolicy to parse + * @mpol: pointer to struct mempolicy pointer, returned on success. 
+ * + * Format of input: + * [=][:] + * + * On success, returns 0, else 1 + */ +int mpol_parse_str(char *str, struct mempolicy **mpol) +{ + struct mempolicy *new = NULL; + unsigned short mode; + unsigned short mode_flags; + nodemask_t nodes; + char *nodelist = strchr(str, ':'); + char *flags = strchr(str, '='); + int err = 1; + + if (nodelist) { + /* NUL-terminate mode or flags string */ + *nodelist++ = '\0'; + if (nodelist_parse(nodelist, nodes)) + goto out; + if (!nodes_subset(nodes, node_states[N_MEMORY])) + goto out; + } else + nodes_clear(nodes); + + if (flags) + *flags++ = '\0'; /* terminate mode string */ + + for (mode = 0; mode < MPOL_MAX; mode++) { + if (!strcmp(str, policy_modes[mode])) { + break; + } + } + if (mode >= MPOL_MAX) + goto out; + + switch (mode) { + case MPOL_PREFERRED: + /* + * Insist on a nodelist of one node only + */ + if (nodelist) { + char *rest = nodelist; + while (isdigit(*rest)) + rest++; + if (*rest) + goto out; + } + break; + case MPOL_INTERLEAVE: + /* + * Default to online nodes with memory if no nodelist + */ + if (!nodelist) + nodes = node_states[N_MEMORY]; + break; + case MPOL_LOCAL: + /* + * Don't allow a nodelist; mpol_new() checks flags + */ + if (nodelist) + goto out; + mode = MPOL_PREFERRED; + break; + case MPOL_DEFAULT: + /* + * Insist on a empty nodelist + */ + if (!nodelist) + err = 0; + goto out; + case MPOL_BIND: + /* + * Insist on a nodelist + */ + if (!nodelist) + goto out; + } + + mode_flags = 0; + if (flags) { + /* + * Currently, we only support two mutually exclusive + * mode flags. + */ + if (!strcmp(flags, "static")) + mode_flags |= MPOL_F_STATIC_NODES; + else if (!strcmp(flags, "relative")) + mode_flags |= MPOL_F_RELATIVE_NODES; + else + goto out; + } + + new = mpol_new(mode, mode_flags, &nodes); + if (IS_ERR(new)) + goto out; + + /* + * Save nodes for mpol_to_str() to show the tmpfs mount options + * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. 
+ */ + if (mode != MPOL_PREFERRED) + new->v.nodes = nodes; + else if (nodelist) + new->v.preferred_node = first_node(nodes); + else + new->flags |= MPOL_F_LOCAL; + + /* + * Save nodes for contextualization: this will be used to "clone" + * the mempolicy in a specific context [cpuset] at a later time. + */ + new->w.user_nodemask = nodes; + + err = 0; + +out: + /* Restore string for error message */ + if (nodelist) + *--nodelist = ':'; + if (flags) + *--flags = '='; + if (!err) + *mpol = new; + return err; +} +#endif /* CONFIG_TMPFS */ + +/** + * mpol_to_str - format a mempolicy structure for printing + * @buffer: to contain formatted mempolicy string + * @maxlen: length of @buffer + * @pol: pointer to mempolicy to be formatted + * + * Convert @pol into a string. If @buffer is too short, truncate the string. + * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the + * longest flag, "relative", and to display at least a few node ids. + */ +void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +{ + char *p = buffer; + nodemask_t nodes = NODE_MASK_NONE; + unsigned short mode = MPOL_DEFAULT; + unsigned short flags = 0; + + if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { + mode = pol->mode; + flags = pol->flags; + } + + switch (mode) { + case MPOL_DEFAULT: + break; + case MPOL_PREFERRED: + if (flags & MPOL_F_LOCAL) + mode = MPOL_LOCAL; + else + node_set(pol->v.preferred_node, nodes); + break; + case MPOL_BIND: + case MPOL_INTERLEAVE: + nodes = pol->v.nodes; + break; + default: + WARN_ON_ONCE(1); + snprintf(p, maxlen, "unknown"); + return; + } + + p += snprintf(p, maxlen, "%s", policy_modes[mode]); + + if (flags & MPOL_MODE_FLAGS) { + p += snprintf(p, buffer + maxlen - p, "="); + + /* + * Currently, the only defined flags are mutually exclusive + */ + if (flags & MPOL_F_STATIC_NODES) + p += snprintf(p, buffer + maxlen - p, "static"); + else if (flags & MPOL_F_RELATIVE_NODES) + p += snprintf(p, buffer + maxlen - p, 
"relative"); + } + + if (!nodes_empty(nodes)) + p += scnprintf(p, buffer + maxlen - p, ":%*pbl", + nodemask_pr_args(&nodes)); +} diff --git a/kernel/mm/mempool.c b/kernel/mm/mempool.c new file mode 100644 index 000000000..2cc08de8b --- /dev/null +++ b/kernel/mm/mempool.c @@ -0,0 +1,494 @@ +/* + * linux/mm/mempool.c + * + * memory buffer pool support. Such pools are mostly used + * for guaranteed, deadlock-free memory allocations during + * extreme VM load. + * + * started by Ingo Molnar, Copyright (C) 2001 + * debugging by David Rientjes, Copyright (C) 2015 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "slab.h" + +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +static void poison_error(mempool_t *pool, void *element, size_t size, + size_t byte) +{ + const int nr = pool->curr_nr; + const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0); + const int end = min_t(int, byte + (BITS_PER_LONG / 8), size); + int i; + + pr_err("BUG: mempool element poison mismatch\n"); + pr_err("Mempool %p size %zu\n", pool, size); + pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : ""); + for (i = start; i < end; i++) + pr_cont("%x ", *(u8 *)(element + i)); + pr_cont("%s\n", end < size ? "..." : ""); + dump_stack(); +} + +static void __check_element(mempool_t *pool, void *element, size_t size) +{ + u8 *obj = element; + size_t i; + + for (i = 0; i < size; i++) { + u8 exp = (i < size - 1) ? 
POISON_FREE : POISON_END; + + if (obj[i] != exp) { + poison_error(pool, element, size, i); + return; + } + } + memset(obj, POISON_INUSE, size); +} + +static void check_element(mempool_t *pool, void *element) +{ + /* Mempools backed by slab allocator */ + if (pool->free == mempool_free_slab || pool->free == mempool_kfree) + __check_element(pool, element, ksize(element)); + + /* Mempools backed by page allocator */ + if (pool->free == mempool_free_pages) { + int order = (int)(long)pool->pool_data; + void *addr = kmap_atomic((struct page *)element); + + __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); + kunmap_atomic(addr); + } +} + +static void __poison_element(void *element, size_t size) +{ + u8 *obj = element; + + memset(obj, POISON_FREE, size - 1); + obj[size - 1] = POISON_END; +} + +static void poison_element(mempool_t *pool, void *element) +{ + /* Mempools backed by slab allocator */ + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + __poison_element(element, ksize(element)); + + /* Mempools backed by page allocator */ + if (pool->alloc == mempool_alloc_pages) { + int order = (int)(long)pool->pool_data; + void *addr = kmap_atomic((struct page *)element); + + __poison_element(addr, 1UL << (PAGE_SHIFT + order)); + kunmap_atomic(addr); + } +} +#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ +static inline void check_element(mempool_t *pool, void *element) +{ +} +static inline void poison_element(mempool_t *pool, void *element) +{ +} +#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ + +static void kasan_poison_element(mempool_t *pool, void *element) +{ + if (pool->alloc == mempool_alloc_slab) + kasan_slab_free(pool->pool_data, element); + if (pool->alloc == mempool_kmalloc) + kasan_kfree(element); + if (pool->alloc == mempool_alloc_pages) + kasan_free_pages(element, (unsigned long)pool->pool_data); +} + +static void kasan_unpoison_element(mempool_t *pool, void *element) +{ + if (pool->alloc == mempool_alloc_slab) + 
kasan_slab_alloc(pool->pool_data, element); + if (pool->alloc == mempool_kmalloc) + kasan_krealloc(element, (size_t)pool->pool_data); + if (pool->alloc == mempool_alloc_pages) + kasan_alloc_pages(element, (unsigned long)pool->pool_data); +} + +static void add_element(mempool_t *pool, void *element) +{ + BUG_ON(pool->curr_nr >= pool->min_nr); + poison_element(pool, element); + kasan_poison_element(pool, element); + pool->elements[pool->curr_nr++] = element; +} + +static void *remove_element(mempool_t *pool) +{ + void *element = pool->elements[--pool->curr_nr]; + + BUG_ON(pool->curr_nr < 0); + check_element(pool, element); + kasan_unpoison_element(pool, element); + return element; +} + +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * Free all reserved elements in @pool and @pool itself. This function + * only sleeps if the free_fn() function sleeps. + */ +void mempool_destroy(mempool_t *pool) +{ + while (pool->curr_nr) { + void *element = remove_element(pool); + pool->free(element, pool->pool_data); + } + kfree(pool->elements); + kfree(pool); +} +EXPORT_SYMBOL(mempool_destroy); + +/** + * mempool_create - create a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * this function creates and allocates a guaranteed size, preallocated + * memory pool. The pool can be used from the mempool_alloc() and mempool_free() + * functions. This function might sleep. Both the alloc_fn() and the free_fn() + * functions might sleep - as long as the mempool_alloc() function is not called + * from IRQ contexts. 
+ */ +mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data, + GFP_KERNEL, NUMA_NO_NODE); +} +EXPORT_SYMBOL(mempool_create); + +mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id) +{ + mempool_t *pool; + pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); + if (!pool) + return NULL; + pool->elements = kmalloc_node(min_nr * sizeof(void *), + gfp_mask, node_id); + if (!pool->elements) { + kfree(pool); + return NULL; + } + spin_lock_init(&pool->lock); + pool->min_nr = min_nr; + pool->pool_data = pool_data; + init_waitqueue_head(&pool->wait); + pool->alloc = alloc_fn; + pool->free = free_fn; + + /* + * First pre-allocate the guaranteed number of buffers. + */ + while (pool->curr_nr < pool->min_nr) { + void *element; + + element = pool->alloc(gfp_mask, pool->pool_data); + if (unlikely(!element)) { + mempool_destroy(pool); + return NULL; + } + add_element(pool, element); + } + return pool; +} +EXPORT_SYMBOL(mempool_create_node); + +/** + * mempool_resize - resize an existing memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @new_min_nr: the new minimum number of elements guaranteed to be + * allocated for this pool. + * + * This function shrinks/grows the pool. In the case of growing, + * it cannot be guaranteed that the pool will be grown to the new + * size immediately, but new mempool_free() calls will refill it. + * This function may sleep. + * + * Note, the caller must guarantee that no mempool_destroy is called + * while this function is running. mempool_alloc() & mempool_free() + * might be called (eg. from IRQ contexts) while this function executes. 
+ */ +int mempool_resize(mempool_t *pool, int new_min_nr) +{ + void *element; + void **new_elements; + unsigned long flags; + + BUG_ON(new_min_nr <= 0); + might_sleep(); + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr <= pool->min_nr) { + while (new_min_nr < pool->curr_nr) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); + spin_lock_irqsave(&pool->lock, flags); + } + pool->min_nr = new_min_nr; + goto out_unlock; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* Grow the pool */ + new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), + GFP_KERNEL); + if (!new_elements) + return -ENOMEM; + + spin_lock_irqsave(&pool->lock, flags); + if (unlikely(new_min_nr <= pool->min_nr)) { + /* Raced, other resize will do our work */ + spin_unlock_irqrestore(&pool->lock, flags); + kfree(new_elements); + goto out; + } + memcpy(new_elements, pool->elements, + pool->curr_nr * sizeof(*new_elements)); + kfree(pool->elements); + pool->elements = new_elements; + pool->min_nr = new_min_nr; + + while (pool->curr_nr < pool->min_nr) { + spin_unlock_irqrestore(&pool->lock, flags); + element = pool->alloc(GFP_KERNEL, pool->pool_data); + if (!element) + goto out; + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) { + add_element(pool, element); + } else { + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); /* Raced */ + goto out; + } + } +out_unlock: + spin_unlock_irqrestore(&pool->lock, flags); +out: + return 0; +} +EXPORT_SYMBOL(mempool_resize); + +/** + * mempool_alloc - allocate an element from a specific memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @gfp_mask: the usual allocation bitmask. + * + * this function only sleeps if the alloc_fn() function sleeps or + * returns NULL. Note that due to preallocation, this function + * *never* fails when called from process contexts. 
(it might
+ * fail if called from an IRQ context.)
+ * Note: using __GFP_ZERO is not supported.
+ *
+ * Returns the element. NULL is returned only when @gfp_mask lacks
+ * __GFP_WAIT and neither alloc_fn() nor the pool's reserve could
+ * supply an element.
+ */
+void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+{
+	void *element;
+	unsigned long flags;
+	wait_queue_t wait;
+	gfp_t gfp_temp;
+
+	VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+
+	gfp_mask |= __GFP_NOMEMALLOC;	/* don't allocate emergency reserves */
+	gfp_mask |= __GFP_NORETRY;	/* don't loop in __alloc_pages */
+	gfp_mask |= __GFP_NOWARN;	/* failures are OK */
+
+	gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);	/* first attempt must not wait or do IO */
+
+repeat_alloc:
+
+	element = pool->alloc(gfp_temp, pool->pool_data);
+	if (likely(element != NULL))
+		return element;
+
+	spin_lock_irqsave(&pool->lock, flags);	/* alloc_fn failed: fall back to the reserve */
+	if (likely(pool->curr_nr)) {
+		element = remove_element(pool);
+		spin_unlock_irqrestore(&pool->lock, flags);
+		/* paired with rmb in mempool_free(), read comment there */
+		smp_wmb();
+		/*
+		 * Update the allocation stack trace as this is more useful
+		 * for debugging.
+		 */
+		kmemleak_update_trace(element);
+		return element;
+	}
+
+	/*
+	 * We use gfp mask w/o __GFP_WAIT or IO for the first round.  If
+	 * alloc failed with that and @pool was empty, retry immediately
+	 * with the caller's full mask.
+	 */
+	if (gfp_temp != gfp_mask) {
+		spin_unlock_irqrestore(&pool->lock, flags);
+		gfp_temp = gfp_mask;
+		goto repeat_alloc;
+	}
+
+	/* We must not sleep if !__GFP_WAIT */
+	if (!(gfp_mask & __GFP_WAIT)) {
+		spin_unlock_irqrestore(&pool->lock, flags);
+		return NULL;
+	}
+
+	/*
+	 * Let's wait for someone else to return an element to @pool.
+	 * The wait entry is queued while pool->lock is still held.
+	 */
+	init_wait(&wait);
+	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	/*
+	 * FIXME: this should be io_schedule().  The timeout is there as a
+	 * workaround for some DM problems in 2.6.18.
+	 */
+	io_schedule_timeout(5*HZ);
+
+	finish_wait(&pool->wait, &wait);
+	goto repeat_alloc;
+}
+EXPORT_SYMBOL(mempool_alloc);
+
+/**
+ * mempool_free - return an element to the pool.
+ * @element:   pool element pointer.
+ * @pool:      pointer to the memory pool which was allocated via
+ *             mempool_create().
+ *
+ * this function only sleeps if the free_fn() function sleeps.
+ * A NULL @element is ignored.
+ */
+void mempool_free(void *element, mempool_t *pool)
+{
+	unsigned long flags;
+
+	if (unlikely(element == NULL))
+		return;
+
+	/*
+	 * Paired with the wmb in mempool_alloc().  The preceding read is
+	 * for @element and the following @pool->curr_nr.  This ensures
+	 * that the visible value of @pool->curr_nr is from after the
+	 * allocation of @element.  This is necessary for fringe cases
+	 * where @element was passed to this task without going through
+	 * barriers.
+	 *
+	 * For example, assume @p is %NULL at the beginning and one task
+	 * performs "p = mempool_alloc(...);" while another task is doing
+	 * "while (!p) cpu_relax(); mempool_free(p, ...);".  This function
+	 * may end up using curr_nr value which is from before allocation
+	 * of @p without the following rmb.
+	 */
+	smp_rmb();
+
+	/*
+	 * For correctness, we need a test which is guaranteed to trigger
+	 * if curr_nr + #allocated == min_nr.  Testing curr_nr < min_nr
+	 * without locking achieves that and refilling as soon as possible
+	 * is desirable.
+	 *
+	 * Because curr_nr visible here is always a value after the
+	 * allocation of @element, any task which decremented curr_nr below
+	 * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
+	 * incremented to min_nr afterwards.  If curr_nr gets incremented
+	 * to min_nr after the allocation of @element, the elements
+	 * allocated after that are subject to the same guarantee.
+	 *
+	 * Waiters happen iff curr_nr is 0 and the above guarantee also
+	 * ensures that there will be frees which return elements to the
+	 * pool waking up the waiters.
+	 */
+	if (unlikely(pool->curr_nr < pool->min_nr)) {
+		spin_lock_irqsave(&pool->lock, flags);
+		if (likely(pool->curr_nr < pool->min_nr)) {	/* recheck now that the lock is held */
+			add_element(pool, element);
+			spin_unlock_irqrestore(&pool->lock, flags);
+			wake_up(&pool->wait);
+			return;
+		}
+		spin_unlock_irqrestore(&pool->lock, flags);
+	}
+	pool->free(element, pool->pool_data);	/* reserve is full: hand the element to free_fn */
+}
+EXPORT_SYMBOL(mempool_free);
+
+/*
+ * A commonly used alloc and free fn.
+ * Both take pool_data to be the struct kmem_cache the elements belong to.
+ * The alloc side asserts, via VM_BUG_ON(), that the cache has no ctor.
+ */
+void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
+{
+	struct kmem_cache *mem = pool_data;
+	VM_BUG_ON(mem->ctor);
+	return kmem_cache_alloc(mem, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_alloc_slab);
+
+void mempool_free_slab(void *element, void *pool_data)
+{
+	struct kmem_cache *mem = pool_data;
+	kmem_cache_free(mem, element);
+}
+EXPORT_SYMBOL(mempool_free_slab);
+
+/*
+ * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
+ * specified by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
+{
+	size_t size = (size_t)pool_data;	/* pool_data carries the size itself, not a pointer */
+	return kmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kmalloc);
+
+void mempool_kfree(void *element, void *pool_data)
+{
+	kfree(element);	/* pool_data is not needed to free */
+}
+EXPORT_SYMBOL(mempool_kfree);
+
+/*
+ * A simple mempool-backed page allocator that allocates pages
+ * of the order specified by pool_data.
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
+{
+	int order = (int)(long)pool_data;	/* pool_data carries the order itself, not a pointer */
+	return alloc_pages(gfp_mask, order);
+}
+EXPORT_SYMBOL(mempool_alloc_pages);
+
+void mempool_free_pages(void *element, void *pool_data)
+{
+	int order = (int)(long)pool_data;
+	__free_pages(element, order);
+}
+EXPORT_SYMBOL(mempool_free_pages);
diff --git a/kernel/mm/memtest.c b/kernel/mm/memtest.c
new file mode 100644
index 000000000..1997d934b
--- /dev/null
+++ b/kernel/mm/memtest.c
@@ -0,0 +1,118 @@
+#include	/* NOTE(review): the header names of these #include lines are missing in this copy */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static u64 patterns[] __initdata = {
+	/* The first entry has to be 0 to leave memtest with zeroed memory */
+	0,
+	0xffffffffffffffffULL,
+	0x5555555555555555ULL,
+	0xaaaaaaaaaaaaaaaaULL,
+	0x1111111111111111ULL,
+	0x2222222222222222ULL,
+	0x4444444444444444ULL,
+	0x8888888888888888ULL,
+	0x3333333333333333ULL,
+	0x6666666666666666ULL,
+	0x9999999999999999ULL,
+	0xccccccccccccccccULL,
+	0x7777777777777777ULL,
+	0xbbbbbbbbbbbbbbbbULL,
+	0xddddddddddddddddULL,
+	0xeeeeeeeeeeeeeeeeULL,
+	0x7a6c7258554e494cULL, /* yeah ;-) */
+};
+/* Log the bad range [start_bad, end_bad) and reserve it in memblock. */
+static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
+{
+	printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
+	       (unsigned long long) pattern,
+	       (unsigned long long) start_bad,
+	       (unsigned long long) end_bad);
+	memblock_reserve(start_bad, end_bad - start_bad);
+}
+/* Fill a range with @pattern, read it back and reserve the words that differ. */
+static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
+{
+	u64 *p, *start, *end;
+	phys_addr_t start_bad, last_bad;
+	phys_addr_t start_phys_aligned;
+	const size_t incr = sizeof(pattern);
+
+	start_phys_aligned = ALIGN(start_phys, incr);
+	start = __va(start_phys_aligned);
+	end = start + (size - (start_phys_aligned - start_phys)) / incr;	/* whole words left after aligning */
+	start_bad = 0;
+	last_bad = 0;
+
+	for (p = start; p < end; p++)
+		*p = pattern;
+
+	for (p = start; p < end; p++, start_phys_aligned += incr) {
+		if (*p == pattern)
+			continue;
+		if (start_phys_aligned == last_bad + incr) {	/* adjacent to the current bad run: extend it */
+			last_bad += incr;
+			continue;
+		}
+		if (start_bad)	/* a previous, non-adjacent bad run is complete */
+			reserve_bad_mem(pattern, start_bad, last_bad + incr);
+		start_bad = last_bad = start_phys_aligned;
+	}
+	if (start_bad)	/* flush the last bad run */
+		reserve_bad_mem(pattern, start_bad, last_bad + incr);
+}
+/* Run memtest() with @pattern on every free memblock range within start..end. */
+static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
+{
+	u64 i;
+	phys_addr_t this_start, this_end;
+
+	for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
+		this_start = clamp(this_start, start, end);
+		this_end = clamp(this_end, start, end);
+		if (this_start < this_end) {	/* skip ranges that fall outside start..end */
+			printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
+			       (unsigned long long)this_start,
+			       (unsigned long long)this_end,
+			       (unsigned long long)cpu_to_be64(pattern));
+			memtest(pattern, this_start, this_end - this_start);
+		}
+	}
+}
+
+/* default is disabled */
+static int memtest_pattern __initdata;
+/* "memtest" early parameter: a value sets the pass count, no value means one pass per pattern. */
+static int __init parse_memtest(char *arg)
+{
+	if (arg)
+		memtest_pattern = simple_strtoul(arg, NULL, 0);
+	else
+		memtest_pattern = ARRAY_SIZE(patterns);
+
+	return 0;
+}
+
+early_param("memtest", parse_memtest);
+/* Run memtest_pattern passes over start..end, walking patterns[] backwards (modulo its size). */
+void __init early_memtest(phys_addr_t start, phys_addr_t end)
+{
+	unsigned int i;
+	unsigned int idx = 0;
+
+	if (!memtest_pattern)
+		return;
+
+	printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
+	for (i = memtest_pattern-1; i < UINT_MAX; --i) {	/* ends when i wraps below 0 to UINT_MAX */
+		idx = i % ARRAY_SIZE(patterns);
+		do_one_pass(patterns[idx], start, end);	/* the final pass (i == 0) uses patterns[0] */
+	}
+}
diff --git a/kernel/mm/migrate.c b/kernel/mm/migrate.c
new file mode 100644
index 000000000..f53838fe3
--- /dev/null
+++ b/kernel/mm/migrate.c
@@ -0,0 +1,1855 @@
+/*
+ * Memory Migration functionality - linux/mm/migration.c
+ *
+ * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
+ *
+ * Page migration was first developed in the context of the memory hotplug
+ * project.
The main authors of the migration code are: + * + * IWAMOTO Toshihiro + * Hirokazu Takahashi + * Dave Hansen + * Christoph Lameter + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define CREATE_TRACE_POINTS +#include + +#include "internal.h" + +/* + * migrate_prep() needs to be called before we start compiling a list of pages + * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is + * undesirable, use migrate_prep_local() + */ +int migrate_prep(void) +{ + /* + * Clear the LRU lists so pages can be isolated. + * Note that pages may be moved off the LRU after we have + * drained them. Those pages will fail to migrate like other + * pages that may be busy. + */ + lru_add_drain_all(); + + return 0; +} + +/* Do the necessary work of migrate_prep but not if it involves other CPUs */ +int migrate_prep_local(void) +{ + lru_add_drain(); + + return 0; +} + +/* + * Put previously isolated pages back onto the appropriate lists + * from where they were once taken off for compaction/migration. + * + * This function shall be used whenever the isolated pageset has been + * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() + * and isolate_huge_page(). 
+ */
+void putback_movable_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		if (unlikely(PageHuge(page))) {
+			putback_active_hugepage(page);	/* hugetlb pages take their own putback path */
+			continue;
+		}
+		list_del(&page->lru);
+		dec_zone_page_state(page, NR_ISOLATED_ANON +
+				page_is_file_cache(page));	/* counter index offset by page_is_file_cache() */
+		if (unlikely(isolated_balloon_page(page)))
+			balloon_page_putback(page);
+		else
+			putback_lru_page(page);
+	}
+}
+
+/*
+ * Restore a potential migration pte to a working pte entry
+ *
+ * Used as the rmap_one callback of remove_migration_ptes(): @new is the
+ * page being walked, @old (passed through rwc.arg) is the page that the
+ * migration entry must refer to.  Entries that are not migration entries
+ * for @old are left untouched.  Every path returns SWAP_AGAIN.
+ */
+static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
+				 unsigned long addr, void *old)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	swp_entry_t entry;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+
+	if (unlikely(PageHuge(new))) {
+		ptep = huge_pte_offset(mm, addr);
+		if (!ptep)
+			goto out;
+		ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
+	} else {
+		pmd = mm_find_pmd(mm, addr);
+		if (!pmd)
+			goto out;
+
+		ptep = pte_offset_map(pmd, addr);
+
+		/*
+		 * Peek to check is_swap_pte() before taking ptlock?  No, we
+		 * can race mremap's move_ptes(), which skips anon_vma lock.
+		 */
+
+		ptl = pte_lockptr(mm, pmd);
+	}
+
+	spin_lock(ptl);
+	pte = *ptep;
+	if (!is_swap_pte(pte))
+		goto unlock;
+
+	entry = pte_to_swp_entry(pte);
+
+	if (!is_migration_entry(entry) ||
+	    migration_entry_to_page(entry) != old)
+		goto unlock;
+
+	get_page(new);	/* reference for the pte installed below */
+	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+	if (pte_swp_soft_dirty(*ptep))
+		pte = pte_mksoft_dirty(pte);
+
+	/* Recheck VMA as permissions can change since migration started */
+	if (is_write_migration_entry(entry))
+		pte = maybe_mkwrite(pte, vma);
+
+#ifdef CONFIG_HUGETLB_PAGE
+	if (PageHuge(new)) {
+		pte = pte_mkhuge(pte);
+		pte = arch_make_huge_pte(pte, vma, new, 0);
+	}
+#endif
+	flush_dcache_page(new);
+	set_pte_at(mm, addr, ptep, pte);
+
+	if (PageHuge(new)) {	/* account the new mapping in the matching rmap */
+		if (PageAnon(new))
+			hugepage_add_anon_rmap(new, vma, addr);
+		else
+			page_dup_rmap(new);
+	} else if (PageAnon(new))
+		page_add_anon_rmap(new, vma, addr);
+	else
+		page_add_file_rmap(new);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, addr, ptep);
+unlock:
+	pte_unmap_unlock(ptep, ptl);
+out:
+	return SWAP_AGAIN;
+}
+
+/*
+ * Get rid of all migration entries and replace them by
+ * references to the indicated page.
+ *
+ * Walks the reverse mappings of @new and calls remove_migration_pte()
+ * on each, with @old as the page the entries must point at.  Callers in
+ * this file pass the same page twice to undo a failed migration.
+ */
+static void remove_migration_ptes(struct page *old, struct page *new)
+{
+	struct rmap_walk_control rwc = {
+		.rmap_one = remove_migration_pte,
+		.arg = old,
+	};
+
+	rmap_walk(new, &rwc);
+}
+
+/*
+ * Something used the pte of a page under migration. We need to
+ * get to the page and wait until migration is finished.
+ * When we return from this function the fault will be retried.
+ */
+void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+				spinlock_t *ptl)
+{
+	pte_t pte;
+	swp_entry_t entry;
+	struct page *page;
+
+	spin_lock(ptl);
+	pte = *ptep;
+	if (!is_swap_pte(pte))
+		goto out;
+
+	entry = pte_to_swp_entry(pte);
+	if (!is_migration_entry(entry))
+		goto out;
+
+	page = migration_entry_to_page(entry);
+
+	/*
+	 * Once radix-tree replacement of page migration started, page_count
+	 * *must* be zero. And, we don't want to call wait_on_page_locked()
+	 * against a page without get_page().
+	 * So, we use get_page_unless_zero(), here. Even failed, page fault
+	 * will occur again.
+	 */
+	if (!get_page_unless_zero(page))
+		goto out;
+	pte_unmap_unlock(ptep, ptl);	/* drop the pte lock before sleeping on the page lock */
+	wait_on_page_locked(page);
+	put_page(page);
+	return;
+out:
+	pte_unmap_unlock(ptep, ptl);
+}
+/* Regular-page front end: derive the pte and its lock from @pmd and @address. */
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long address)
+{
+	spinlock_t *ptl = pte_lockptr(mm, pmd);
+	pte_t *ptep = pte_offset_map(pmd, address);
+	__migration_entry_wait(mm, ptep, ptl);
+}
+/* Hugetlb front end: the caller supplies the huge pte; only the lock is derived. */
+void migration_entry_wait_huge(struct vm_area_struct *vma,
+		struct mm_struct *mm, pte_t *pte)
+{
+	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
+	__migration_entry_wait(mm, pte, ptl);
+}
+
+#ifdef CONFIG_BLOCK
+/*
+ * Returns true if all buffers are successfully locked.
+ * Each locked buffer also gets a get_bh() reference.  On failure every
+ * lock and reference taken so far has been released again.
+ */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+							enum migrate_mode mode)
+{
+	struct buffer_head *bh = head;
+
+	/* Simple case, sync compaction */
+	if (mode != MIGRATE_ASYNC) {
+		do {
+			get_bh(bh);
+			lock_buffer(bh);
+			bh = bh->b_this_page;
+
+		} while (bh != head);
+
+		return true;
+	}
+
+	/* async case, we cannot block on lock_buffer so use trylock_buffer */
+	do {
+		get_bh(bh);
+		if (!trylock_buffer(bh)) {
+			/*
+			 * We failed to lock the buffer and cannot stall in
+			 * async migration. Release the taken locks
+			 */
+			struct buffer_head *failed_bh = bh;
+			put_bh(failed_bh);	/* it was referenced above but never locked */
+			bh = head;
+			while (bh != failed_bh) {
+				unlock_buffer(bh);
+				put_bh(bh);
+				bh = bh->b_this_page;
+			}
+			return false;
+		}
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+	return true;
+}
+#else
+static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
+							enum migrate_mode mode)
+{
+	return true;	/* stub for builds without CONFIG_BLOCK */
+}
+#endif /* CONFIG_BLOCK */
+
+/*
+ * Replace the page in the mapping.
+ *
+ * The number of remaining references must be:
+ * 1 for anonymous pages without a mapping
+ * 2 for pages with a mapping
+ * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
+ *
+ * @extra_count is added to each of these expected counts.
+ *
+ * Returns MIGRATEPAGE_SUCCESS, or -EAGAIN when the reference count does
+ * not match, the radix tree slot no longer holds @page, the references
+ * cannot be frozen, or (MIGRATE_ASYNC with @head) a buffer lock is busy.
+ */
+int migrate_page_move_mapping(struct address_space *mapping,
+		struct page *newpage, struct page *page,
+		struct buffer_head *head, enum migrate_mode mode,
+		int extra_count)
+{
+	int expected_count = 1 + extra_count;
+	void **pslot;
+
+	if (!mapping) {
+		/* Anonymous page without mapping */
+		if (page_count(page) != expected_count)
+			return -EAGAIN;
+		return MIGRATEPAGE_SUCCESS;
+	}
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
+
+	expected_count += 1 + page_has_private(page);
+	if (page_count(page) != expected_count ||
+		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	if (!page_freeze_refs(page, expected_count)) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	/*
+	 * In the async migration case of moving a page with buffers, lock the
+	 * buffers using trylock before the mapping is moved. If the mapping
+	 * was moved, we later failed to lock the buffers and could not move
+	 * the mapping back due to an elevated page count, we would have to
+	 * block waiting on other references to be dropped.
+	 */
+	if (mode == MIGRATE_ASYNC && head &&
+			!buffer_migrate_lock_buffers(head, mode)) {
+		page_unfreeze_refs(page, expected_count);	/* restore the full count: nothing was moved */
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Now we know that no one else is looking at the page.
+	 */
+	get_page(newpage);	/* add cache reference */
+	if (PageSwapCache(page)) {
+		SetPageSwapCache(newpage);
+		set_page_private(newpage, page_private(page));
+	}
+
+	radix_tree_replace_slot(pslot, newpage);
+
+	/*
+	 * Drop cache reference from old page by unfreezing
+	 * to one less reference.
+	 * We know this isn't the last reference.
+	 */
+	page_unfreeze_refs(page, expected_count - 1);
+
+	/*
+	 * If moved to a different zone then also account
+	 * the page for that zone. Other VM counters will be
+	 * taken care of when we establish references to the
+	 * new page and drop references to the old page.
+	 *
+	 * Note that anonymous pages are accounted for
+	 * via NR_FILE_PAGES and NR_ANON_PAGES if they
+	 * are mapped to swap space.
+	 */
+	__dec_zone_page_state(page, NR_FILE_PAGES);
+	__inc_zone_page_state(newpage, NR_FILE_PAGES);
+	if (!PageSwapCache(page) && PageSwapBacked(page)) {
+		__dec_zone_page_state(page, NR_SHMEM);
+		__inc_zone_page_state(newpage, NR_SHMEM);
+	}
+	spin_unlock_irq(&mapping->tree_lock);
+
+	return MIGRATEPAGE_SUCCESS;
+}
+
+/*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+				   struct page *newpage, struct page *page)
+{
+	int expected_count;
+	void **pslot;
+
+	if (!mapping) {
+		if (page_count(page) != 1)	/* no mapping: a single reference is expected */
+			return -EAGAIN;
+		return MIGRATEPAGE_SUCCESS;
+	}
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
+
+	expected_count = 2 + page_has_private(page);
+	if (page_count(page) != expected_count ||
+		radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	if (!page_freeze_refs(page, expected_count)) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	get_page(newpage);	/* reference held by the radix tree slot */
+
+	radix_tree_replace_slot(pslot, newpage);
+
+	page_unfreeze_refs(page, expected_count - 1);	/* one less: the old page left the radix tree */
+
+	spin_unlock_irq(&mapping->tree_lock);
+	return MIGRATEPAGE_SUCCESS;
+}
+
+/*
+ * Gigantic pages are so large that we do not guarantee that page++ pointer
+ * arithmetic will work across the entire page.  We need something more
+ * specialized.
+ */
+static void __copy_gigantic_page(struct page *dst, struct page *src,
+				int nr_pages)
+{
+	int i;
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+
+	for (i = 0; i < nr_pages; ) {
+		cond_resched();
+		copy_highpage(dst, src);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);	/* step via mem_map_next() instead of dst + i */
+		src = mem_map_next(src, src_base, i);
+	}
+}
+/* Copy every subpage of a hugetlbfs or transparent huge page from @src to @dst. */
+static void copy_huge_page(struct page *dst, struct page *src)
+{
+	int i;
+	int nr_pages;
+
+	if (PageHuge(src)) {
+		/* hugetlbfs page */
+		struct hstate *h = page_hstate(src);
+		nr_pages = pages_per_huge_page(h);
+
+		if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
+			__copy_gigantic_page(dst, src, nr_pages);
+			return;
+		}
+	} else {
+		/* thp page */
+		BUG_ON(!PageTransHuge(src));
+		nr_pages = hpage_nr_pages(src);
+	}
+
+	for (i = 0; i < nr_pages; i++) {
+		cond_resched();
+		copy_highpage(dst + i, src + i);
+	}
+}
+
+/*
+ * Copy the page to its new location
+ *
+ * Besides the data this transfers the page flags tested below (error,
+ * referenced, uptodate, active/unevictable, checked, mapped-to-disk and
+ * dirty) and the cpupid value, then clears the swapcache and private
+ * state of the old page.
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+	int cpupid;
+
+	if (PageHuge(page) || PageTransHuge(page))
+		copy_huge_page(newpage, page);
+	else
+		copy_highpage(newpage, page);
+
+	if (PageError(page))
+		SetPageError(newpage);
+	if (PageReferenced(page))
+		SetPageReferenced(newpage);
+	if (PageUptodate(page))
+		SetPageUptodate(newpage);
+	if (TestClearPageActive(page)) {	/* active and unevictable are moved, not copied */
+		VM_BUG_ON_PAGE(PageUnevictable(page), page);
+		SetPageActive(newpage);
+	} else if (TestClearPageUnevictable(page))
+		SetPageUnevictable(newpage);
+	if (PageChecked(page))
+		SetPageChecked(newpage);
+	if (PageMappedToDisk(page))
+		SetPageMappedToDisk(newpage);
+
+	if (PageDirty(page)) {
+		clear_page_dirty_for_io(page);
+		/*
+		 * Want to mark the page and the radix tree as dirty, and
+		 * redo the accounting that clear_page_dirty_for_io undid,
+		 * but we can't use set_page_dirty because that function
+		 * is actually a signal that all of the page has become dirty.
+		 * Whereas only part of our page may be dirty.
+		 */
+		if (PageSwapBacked(page))
+			SetPageDirty(newpage);
+		else
+			__set_page_dirty_nobuffers(newpage);
+	}
+
+	/*
+	 * Copy NUMA information to the new page, to prevent over-eager
+	 * future migrations of this same page.
+	 */
+	cpupid = page_cpupid_xchg_last(page, -1);
+	page_cpupid_xchg_last(newpage, cpupid);
+
+	mlock_migrate_page(newpage, page);
+	ksm_migrate_page(newpage, page);
+	/*
+	 * Please do not reorder this without considering how mm/ksm.c's
+	 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
+	 */
+	if (PageSwapCache(page))
+		ClearPageSwapCache(page);
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+
+	/*
+	 * If any waiters have accumulated on the new page then
+	 * wake them up.
+	 */
+	if (PageWriteback(newpage))
+		end_page_writeback(newpage);
+}
+
+/************************************************************
+ *                    Migration functions
+ ***********************************************************/
+
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate/PagePrivate2.
+ *
+ * Pages are locked upon entry and exit.
+ *
+ * Returns MIGRATEPAGE_SUCCESS, or the error returned by
+ * migrate_page_move_mapping(), in which case nothing has been copied.
+ */
+int migrate_page(struct address_space *mapping,
+		struct page *newpage, struct page *page,
+		enum migrate_mode mode)
+{
+	int rc;
+
+	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
+
+	rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+
+	if (rc != MIGRATEPAGE_SUCCESS)
+		return rc;
+
+	migrate_page_copy(newpage, page);
+	return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL(migrate_page);
+
+#ifdef CONFIG_BLOCK
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist.
+ */
+int buffer_migrate_page(struct address_space *mapping,
+		struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+	struct buffer_head *bh, *head;
+	int rc;
+
+	if (!page_has_buffers(page))
+		return migrate_page(mapping, newpage, page, mode);
+
+	head = page_buffers(page);
+
+	rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
+
+	if (rc != MIGRATEPAGE_SUCCESS)
+		return rc;
+
+	/*
+	 * In the async case, migrate_page_move_mapping locked the buffers
+	 * with an IRQ-safe spinlock held. In the sync case, the buffers
+	 * need to be locked now
+	 */
+	if (mode != MIGRATE_ASYNC)
+		BUG_ON(!buffer_migrate_lock_buffers(head, mode));
+
+	ClearPagePrivate(page);	/* hand the buffer ring and its page reference to newpage */
+	set_page_private(newpage, page_private(page));
+	set_page_private(page, 0);
+	put_page(page);
+	get_page(newpage);
+
+	bh = head;
+	do {
+		set_bh_page(bh, newpage, bh_offset(bh));
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	SetPagePrivate(newpage);
+
+	migrate_page_copy(newpage, page);
+
+	bh = head;
+	do {	/* release the locks and references taken by buffer_migrate_lock_buffers() */
+		unlock_buffer(bh);
+		put_bh(bh);
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL(buffer_migrate_page);
+#endif
+
+/*
+ * Writeback a page to clean the dirty state
+ *
+ * Never reports success: the result is -EINVAL when the mapping has no
+ * writepage method, -EIO when writepage fails, and -EAGAIN otherwise.
+ */
+static int writeout(struct address_space *mapping, struct page *page)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = 1,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+		.for_reclaim = 1
+	};
+	int rc;
+
+	if (!mapping->a_ops->writepage)
+		/* No write method for the address space */
+		return -EINVAL;
+
+	if (!clear_page_dirty_for_io(page))
+		/* Someone else already triggered a write */
+		return -EAGAIN;
+
+	/*
+	 * A dirty page may imply that the underlying filesystem has
+	 * the page on some queue. So the page must be clean for
+	 * migration. Writeout may mean we lose the lock and the
+	 * page state is no longer what we checked for earlier.
+	 * At this point we know that the migration attempt cannot
+	 * be successful.
+	 */
+	remove_migration_ptes(page, page);
+
+	rc = mapping->a_ops->writepage(page, &wbc);
+
+	if (rc != AOP_WRITEPAGE_ACTIVATE)
+		/* unlocked. Relock */
+		lock_page(page);
+
+	return (rc < 0) ? -EIO : -EAGAIN;
+}
+
+/*
+ * Default handling if a filesystem does not provide a migration function.
+ */
+static int fallback_migrate_page(struct address_space *mapping,
+	struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+	if (PageDirty(page)) {
+		/* Only writeback pages in full synchronous migration */
+		if (mode != MIGRATE_SYNC)
+			return -EBUSY;
+		return writeout(mapping, page);
+	}
+
+	/*
+	 * Buffers may be managed in a filesystem specific way.
+	 * We must have no buffers or drop them.
+	 */
+	if (page_has_private(page) &&
+	    !try_to_release_page(page, GFP_KERNEL))
+		return -EAGAIN;
+
+	return migrate_page(mapping, newpage, page, mode);
+}
+
+/*
+ * Move a page to a newly allocated page
+ * The page is locked and all ptes have been successfully removed.
+ *
+ * The new page will have replaced the old page if this function
+ * is successful.
+ *
+ * @page_was_mapped tells whether migration ptes were installed for
+ * @page; if so they are pointed at @newpage on success.
+ *
+ * Return value:
+ *   < 0 - error code
+ *  MIGRATEPAGE_SUCCESS - success
+ */
+static int move_to_new_page(struct page *newpage, struct page *page,
+				int page_was_mapped, enum migrate_mode mode)
+{
+	struct address_space *mapping;
+	int rc;
+
+	/*
+	 * Block others from accessing the page when we get around to
+	 * establishing additional references. We are the only one
+	 * holding a reference to the new page at this point.
+	 */
+	if (!trylock_page(newpage))
+		BUG();
+
+	/* Prepare mapping for the new page.*/
+	newpage->index = page->index;
+	newpage->mapping = page->mapping;
+	if (PageSwapBacked(page))
+		SetPageSwapBacked(newpage);
+
+	mapping = page_mapping(page);
+	if (!mapping)
+		rc = migrate_page(mapping, newpage, page, mode);
+	else if (mapping->a_ops->migratepage)
+		/*
+		 * Most pages have a mapping and most filesystems provide a
+		 * migratepage callback. Anonymous pages are part of swap
+		 * space which also has its own migratepage callback. This
+		 * is the most common path for page migration.
+		 */
+		rc = mapping->a_ops->migratepage(mapping,
+						newpage, page, mode);
+	else
+		rc = fallback_migrate_page(mapping, newpage, page, mode);
+
+	if (rc != MIGRATEPAGE_SUCCESS) {
+		newpage->mapping = NULL;	/* failed: undo the mapping set up above */
+	} else {
+		mem_cgroup_migrate(page, newpage, false);
+		if (page_was_mapped)
+			remove_migration_ptes(page, newpage);
+		page->mapping = NULL;
+	}
+
+	unlock_page(newpage);
+
+	return rc;
+}
+/*
+ * Lock @page, replace its ptes by migration entries and move it to @newpage.
+ * Returns the result of the move, -EBUSY when writeback cannot be waited
+ * for in this @mode, or -EAGAIN (the initial value of rc) when the page
+ * could not be locked or unmapped.
+ */
+static int __unmap_and_move(struct page *page, struct page *newpage,
+				int force, enum migrate_mode mode)
+{
+	int rc = -EAGAIN;
+	int page_was_mapped = 0;
+	struct anon_vma *anon_vma = NULL;
+
+	if (!trylock_page(page)) {
+		if (!force || mode == MIGRATE_ASYNC)
+			goto out;
+
+		/*
+		 * It's not safe for direct compaction to call lock_page.
+		 * For example, during page readahead pages are added locked
+		 * to the LRU. Later, when the IO completes the pages are
+		 * marked uptodate and unlocked. However, the queueing
+		 * could be merging multiple pages for one bio (e.g.
+		 * mpage_readpages). If an allocation happens for the
+		 * second or third page, the process can end up locking
+		 * the same page twice and deadlocking. Rather than
+		 * trying to be clever about what pages can be locked,
+		 * avoid the use of lock_page for direct compaction
+		 * altogether.
+		 */
+		if (current->flags & PF_MEMALLOC)
+			goto out;
+
+		lock_page(page);
+	}
+
+	if (PageWriteback(page)) {
+		/*
+		 * Only in the case of a full synchronous migration is it
+		 * necessary to wait for PageWriteback. In the async case,
+		 * the retry loop is too short and in the sync-light case,
+		 * the overhead of stalling is too much
+		 */
+		if (mode != MIGRATE_SYNC) {
+			rc = -EBUSY;
+			goto out_unlock;
+		}
+		if (!force)
+			goto out_unlock;
+		wait_on_page_writeback(page);
+	}
+	/*
+	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
+	 * we cannot notice that anon_vma is freed while we migrate a page.
+	 * This get_anon_vma() delays freeing anon_vma pointer until the end
+	 * of migration. File cache pages are no problem because of page_lock()
+	 * File Caches may use write_page() or lock_page() in migration, then,
+	 * just care Anon page here.
+	 */
+	if (PageAnon(page) && !PageKsm(page)) {
+		/*
+		 * Only page_lock_anon_vma_read() understands the subtleties of
+		 * getting a hold on an anon_vma from outside one of its mms.
+		 */
+		anon_vma = page_get_anon_vma(page);
+		if (anon_vma) {
+			/*
+			 * Anon page
+			 */
+		} else if (PageSwapCache(page)) {
+			/*
+			 * We cannot be sure that the anon_vma of an unmapped
+			 * swapcache page is safe to use because we don't
+			 * know in advance if the VMA that this page belonged
+			 * to still exists. If the VMA and others sharing the
+			 * data have been freed, then the anon_vma could
+			 * already be invalid.
+			 *
+			 * To avoid this possibility, swapcache pages get
+			 * migrated but are not remapped when migration
+			 * completes
+			 */
+		} else {
+			goto out_unlock;
+		}
+	}
+
+	if (unlikely(isolated_balloon_page(page))) {
+		/*
+		 * A ballooned page does not need any special attention from
+		 * physical to virtual reverse mapping procedures.
+		 * Skip any attempt to unmap PTEs or to remap swap cache,
+		 * in order to avoid burning cycles at rmap level, and perform
+		 * the page migration right away (protected by page lock).
+		 */
+		rc = balloon_page_migrate(newpage, page, mode);
+		goto out_unlock;
+	}
+
+	/*
+	 * Corner case handling:
+	 * 1. When a new swap-cache page is read into, it is added to the LRU
+	 * and treated as swapcache but it has no rmap yet.
+	 * Calling try_to_unmap() against a page->mapping==NULL page will
+	 * trigger a BUG.  So handle it here.
+	 * 2. An orphaned page (see truncate_complete_page) might have
+	 * fs-private metadata. The page can be picked up due to memory
+	 * offlining.  Everywhere else except page reclaim, the page is
+	 * invisible to the vm, so the page can not be migrated.  So try to
+	 * free the metadata, so the page can be freed.
+	 */
+	if (!page->mapping) {
+		VM_BUG_ON_PAGE(PageAnon(page), page);
+		if (page_has_private(page)) {
+			try_to_free_buffers(page);
+			goto out_unlock;
+		}
+		goto skip_unmap;
+	}
+
+	/* Establish migration ptes or remove ptes */
+	if (page_mapped(page)) {
+		try_to_unmap(page,
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+		page_was_mapped = 1;
+	}
+
+skip_unmap:
+	if (!page_mapped(page))	/* only move once no mapping is left */
+		rc = move_to_new_page(newpage, page, page_was_mapped, mode);
+
+	if (rc && page_was_mapped)	/* failed: point the migration ptes back at the old page */
+		remove_migration_ptes(page, page);
+
+	/* Drop an anon_vma reference if we took one */
+	if (anon_vma)
+		put_anon_vma(anon_vma);
+
+out_unlock:
+	unlock_page(page);
+out:
+	return rc;
+}
+
+/*
+ * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move().  Work
+ * around it.
+ */
+#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
+#define ICE_noinline noinline
+#else
+#define ICE_noinline
+#endif
+
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static ICE_noinline int unmap_and_move(new_page_t get_new_page,
+				   free_page_t put_new_page,
+				   unsigned long private, struct page *page,
+				   int force, enum migrate_mode mode)
+{
+	int rc = 0;
+	int *result = NULL;
+	struct page *newpage = get_new_page(page, private, &result);
+
+	if (!newpage)
+		return -ENOMEM;
+
+	if (page_count(page) == 1) {
+		/* page was freed from under us. So we are done. */
+		goto out;
+	}
+
+	if (unlikely(PageTransHuge(page)))
+		if (unlikely(split_huge_page(page)))
+			goto out;	/* NOTE(review): rc is still 0 here, so a failed split is reported as success - confirm intended */
+
+	rc = __unmap_and_move(page, newpage, force, mode);
+
+out:
+	if (rc != -EAGAIN) {
+		/*
+		 * A page that has been migrated has all references
+		 * removed and will be freed. A page that has not been
+		 * migrated will have kept its references and be
+		 * restored.
+		 */
+		list_del(&page->lru);
+		dec_zone_page_state(page, NR_ISOLATED_ANON +
+				page_is_file_cache(page));
+		putback_lru_page(page);
+	}
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, putback_lru_page() will drop the reference grabbed
+	 * during isolation.
+	 */
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
+		ClearPageSwapBacked(newpage);
+		put_new_page(newpage, private);
+	} else if (unlikely(__is_movable_balloon_page(newpage))) {
+		/* drop our reference, page already in the balloon */
+		put_page(newpage);
+	} else
+		putback_lru_page(newpage);
+
+	if (result) {	/* report the error code, or on success the node of the new page */
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(newpage);
+	}
+	return rc;
+}
+
+/*
+ * Counterpart of unmap_and_move() for hugepage migration.
+ *
+ * This function doesn't wait the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e.
if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ *
+ * Returns -ENOSYS when hugepage_migration_supported() rejects the page's
+ * hstate, -ENOMEM when get_new_page() returns NULL, -EAGAIN when the page
+ * lock could not be taken or the page is still mapped after try_to_unmap(),
+ * otherwise the result of move_to_new_page().  When get_new_page() supplied
+ * a result pointer it receives the error code, or the new page's node id on
+ * success.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+				free_page_t put_new_page, unsigned long private,
+				struct page *hpage, int force,
+				enum migrate_mode mode)
+{
+	int rc = 0;
+	int *result = NULL;
+	int page_was_mapped = 0;
+	struct page *new_hpage;
+	struct anon_vma *anon_vma = NULL;
+
+	/*
+	 * Movability of hugepages depends on architectures and hugepage size.
+	 * This check is necessary because some callers of hugepage migration
+	 * like soft offline and memory hotremove don't walk through page
+	 * tables or check whether the hugepage is pmd-based or not before
+	 * kicking migration.
+	 */
+	if (!hugepage_migration_supported(page_hstate(hpage))) {
+		putback_active_hugepage(hpage);
+		return -ENOSYS;
+	}
+
+	new_hpage = get_new_page(hpage, private, &result);
+	if (!new_hpage)
+		return -ENOMEM;
+
+	rc = -EAGAIN;	/* default until move_to_new_page() says otherwise */
+
+	if (!trylock_page(hpage)) {
+		/* only block on the page lock for forced, synchronous migration */
+		if (!force || mode != MIGRATE_SYNC)
+			goto out;
+		lock_page(hpage);
+	}
+
+	if (PageAnon(hpage))
+		anon_vma = page_get_anon_vma(hpage);
+
+	if (page_mapped(hpage)) {
+		try_to_unmap(hpage,
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+		page_was_mapped = 1;
+	}
+
+	if (!page_mapped(hpage))
+		rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
+
+	/* migration failed: put the original mappings back */
+	if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
+		remove_migration_ptes(hpage, hpage);
+
+	if (anon_vma)
+		put_anon_vma(anon_vma);
+
+	if (rc == MIGRATEPAGE_SUCCESS)
+		hugetlb_cgroup_migrate(hpage, new_hpage);
+
+	unlock_page(hpage);
+out:
+	/* on -EAGAIN the page is left alone so the caller can retry it */
+	if (rc != -EAGAIN)
+		putback_active_hugepage(hpage);
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it.  Otherwise, put_page() will drop the reference grabbed during
+	 * isolation.
+	 */
+	if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+		put_new_page(new_hpage, private);
+	else
+		put_page(new_hpage);
+
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(new_hpage);
+	}
+	return rc;
+}
+
+/*
+ * migrate_pages - migrate the pages specified in a list, to the free pages
+ *		   supplied as the target for the page migration
+ *
+ * @from:		The list of pages to be migrated.
+ * @get_new_page:	The function used to allocate free pages to be used
+ *			as the target of the page migration.
+ * @put_new_page:	The function used to free target pages if migration
+ *			fails, or NULL if no special handling is necessary.
+ * @private:		Private data to be passed on to get_new_page()
+ * @mode:		The migration mode that specifies the constraints for
+ *			page migration, if any.
+ * @reason:		The reason for page migration.
+ *
+ * The function returns after 10 attempts or if no pages are movable any more
+ * because the list has become empty or no retryable pages exist any more.
+ * The caller should call putback_lru_pages() to return pages to the LRU
+ * or free list only if ret != 0.
+ * NOTE(review): do_move_page_to_node_array() in this file calls
+ * putback_movable_pages() for that purpose; the name above looks stale.
+ *
+ * From the fourth pass on (pass > 2) the per-page helpers are called with
+ * force set.  PF_SWAPWRITE is set on current for the duration of the call
+ * and cleared again if it was not set on entry.
+ *
+ * Returns the number of pages that were not migrated, or an error code.
+ */
+int migrate_pages(struct list_head *from, new_page_t get_new_page,
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason)
+{
+	int retry = 1;
+	int nr_failed = 0;
+	int nr_succeeded = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int swapwrite = current->flags & PF_SWAPWRITE;
+	int rc;
+
+	if (!swapwrite)
+		current->flags |= PF_SWAPWRITE;
+
+	for(pass = 0; pass < 10 && retry; pass++) {
+		retry = 0;	/* counts the -EAGAIN pages of this pass */
+
+		list_for_each_entry_safe(page, page2, from, lru) {
+			cond_resched();
+
+			if (PageHuge(page))
+				rc = unmap_and_move_huge_page(get_new_page,
+						put_new_page, private, page,
+						pass > 2, mode);
+			else
+				rc = unmap_and_move(get_new_page, put_new_page,
+						private, page, pass > 2, mode);
+
+			switch(rc) {
+			case -ENOMEM:
+				/* give up at once; -ENOMEM is what gets returned */
+				goto out;
+			case -EAGAIN:
+				retry++;
+				break;
+			case MIGRATEPAGE_SUCCESS:
+				nr_succeeded++;
+				break;
+			default:
+				/*
+				 * Permanent failure (-EBUSY, -ENOSYS, etc.):
+				 * unlike -EAGAIN case, the failed page is
+				 * removed from migration page list and not
+				 * retried in the next outer loop.
+				 */
+				nr_failed++;
+				break;
+			}
+		}
+	}
+	/* pages still pending retry after the last pass count as not migrated */
+	rc = nr_failed + retry;
+out:
+	if (nr_succeeded)
+		count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+	if (nr_failed)
+		count_vm_events(PGMIGRATE_FAIL, nr_failed);
+	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+
+	if (!swapwrite)
+		current->flags &= ~PF_SWAPWRITE;
+
+	return rc;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Move a list of individual pages
+ */
+struct page_to_node {
+	unsigned long addr;	/* user virtual address of the page */
+	struct page *page;	/* page found at addr, set by do_move_page_to_node_array() */
+	int node;		/* target node; MAX_NUMNODES marks the end of the array */
+	int status;		/* per-page result: negative errno or a node id */
+};
+/*
+ * get_new_page() callback for the move_pages() path.  @private is the
+ * page_to_node array; the entry whose ->page equals @p selects the target
+ * node and its ->status is handed back through @result.  Returns NULL when
+ * the end marker is reached without a match.
+ */
+static struct page *new_page_node(struct page *p, unsigned long private,
+		int **result)
+{
+	struct page_to_node *pm = (struct page_to_node *)private;
+
+	while (pm->node != MAX_NUMNODES && pm->page != p)
+		pm++;
+
+	if (pm->node == MAX_NUMNODES)
+		return NULL;
+
+	*result = &pm->status;
+
+	if (PageHuge(p))
+		return alloc_huge_page_node(page_hstate(compound_head(p)),
+					pm->node);
+	else
+		return alloc_pages_exact_node(pm->node,
+				GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
+}
+
+/*
+ * Move a set of pages as indicated in the pm array. The addr
+ * field must be set to the virtual address of the page to be moved
+ * and the node number must contain a valid target node.
+ * The pm array ends with node = MAX_NUMNODES.
+ *
+ * mm->mmap_sem is held for read for the whole call.  Every entry gets its
+ * status field written: a negative errno, or the node id when the page is
+ * already on the requested node.  Pages that are isolated are handed to
+ * migrate_pages() with new_page_node() as allocator, which exposes the
+ * entry's status field as the result slot.
+ *
+ * Returns 0 when nothing had to be migrated, otherwise the return value of
+ * migrate_pages(); on a non-zero value the remaining pages are put back
+ * with putback_movable_pages().
+ */
+static int do_move_page_to_node_array(struct mm_struct *mm,
+				      struct page_to_node *pm,
+				      int migrate_all)
+{
+	int err;
+	struct page_to_node *pp;
+	LIST_HEAD(pagelist);
+
+	down_read(&mm->mmap_sem);
+
+	/*
+	 * Build a list of pages to migrate
+	 */
+	for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
+		struct vm_area_struct *vma;
+		struct page *page;
+
+		err = -EFAULT;
+		vma = find_vma(mm, pp->addr);
+		if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
+			goto set_status;
+
+		page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
+
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto set_status;
+
+		err = -ENOENT;
+		if (!page)
+			goto set_status;
+
+		/* Use PageReserved to check for zero page */
+		if (PageReserved(page))	/* reported as -ENOENT */
+			goto put_and_set;
+
+		pp->page = page;
+		err = page_to_nid(page);
+
+		if (err == pp->node)
+			/*
+			 * Node already in the right place
+			 */
+			goto put_and_set;
+
+		err = -EACCES;
+		if (page_mapcount(page) > 1 &&
+				!migrate_all)
+			goto put_and_set;
+
+		if (PageHuge(page)) {
+			/*
+			 * NOTE(review): err is still -EACCES on this path; a
+			 * tail page keeps that status, a head page only gets
+			 * a new status if migrate_pages() writes the result
+			 * slot - confirm this is intended.
+			 */
+			if (PageHead(page))
+				isolate_huge_page(page, &pagelist);
+			goto put_and_set;
+		}
+
+		err = isolate_lru_page(page);
+		if (!err) {
+			list_add_tail(&page->lru, &pagelist);
+			inc_zone_page_state(page, NR_ISOLATED_ANON +
+					    page_is_file_cache(page));
+		}
+put_and_set:
+		/*
+		 * Either remove the duplicate refcount from
+		 * isolate_lru_page() or drop the page ref if it was
+		 * not isolated.
+		 */
+		put_page(page);
+set_status:
+		pp->status = err;
+	}
+
+	err = 0;
+	if (!list_empty(&pagelist)) {
+		err = migrate_pages(&pagelist, new_page_node, NULL,
+				(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
+		if (err)
+			putback_movable_pages(&pagelist);
+	}
+
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * Migrate an array of page address onto an array of nodes and fill
+ * the corresponding array of status.
+ *
+ * The user arrays @pages and @nodes are processed in chunks that fit a
+ * single page of struct page_to_node entries (one slot is kept for the end
+ * marker).  After each chunk the per-page status values are copied to
+ * @status.
+ *
+ * Returns 0 on success, -ENOMEM if the scratch page cannot be allocated,
+ * -EFAULT on a failed user copy, -ENODEV for a node that is out of range
+ * or has no memory, -EACCES for a node outside @task_nodes, or a negative
+ * value from do_move_page_to_node_array().
+ */
+static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
+			 unsigned long nr_pages,
+			 const void __user * __user *pages,
+			 const int __user *nodes,
+			 int __user *status, int flags)
+{
+	struct page_to_node *pm;
+	unsigned long chunk_nr_pages;
+	unsigned long chunk_start;
+	int err;
+
+	err = -ENOMEM;
+	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
+	if (!pm)
+		goto out;
+
+	migrate_prep();
+
+	/*
+	 * Store a chunk of page_to_node array in a page,
+	 * but keep the last one as a marker
+	 */
+	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
+
+	for (chunk_start = 0;
+	     chunk_start < nr_pages;
+	     chunk_start += chunk_nr_pages) {
+		int j;
+
+		/* the last chunk may be shorter */
+		if (chunk_start + chunk_nr_pages > nr_pages)
+			chunk_nr_pages = nr_pages - chunk_start;
+
+		/* fill the chunk pm with addrs and nodes from user-space */
+		for (j = 0; j < chunk_nr_pages; j++) {
+			const void __user *p;
+			int node;
+
+			err = -EFAULT;
+			if (get_user(p, pages + j + chunk_start))
+				goto out_pm;
+			pm[j].addr = (unsigned long) p;
+
+			if (get_user(node, nodes + j + chunk_start))
+				goto out_pm;
+
+			err = -ENODEV;
+			if (node < 0 || node >= MAX_NUMNODES)
+				goto out_pm;
+
+			if (!node_state(node, N_MEMORY))
+				goto out_pm;
+
+			err = -EACCES;
+			if (!node_isset(node, task_nodes))
+				goto out_pm;
+
+			pm[j].node = node;
+		}
+
+		/* End marker for this chunk */
+		pm[chunk_nr_pages].node = MAX_NUMNODES;
+
+		/* Migrate this chunk */
+		err = do_move_page_to_node_array(mm, pm,
+						 flags & MPOL_MF_MOVE_ALL);
+		if (err < 0)
+			goto out_pm;
+
+		/* Return status information */
+		for (j = 0; j < chunk_nr_pages; j++)
+			if (put_user(pm[j].status, status + j + chunk_start)) {
+				err = -EFAULT;
+				goto out_pm;
+			}
+	}
+	/* a positive "pages not migrated" count from the last chunk is not an error here */
+	err = 0;
+
+out_pm:
+	free_page((unsigned long)pm);
+out:
+	return err;
+}
+
+/*
+ * Determine the nodes of an array of pages and store it in an array of status.
+ *
+ * Both arrays are kernel copies holding @nr_pages entries.  For each
+ * address the status is the node id of the page, -EFAULT when no vma covers
+ * the address, the error encoded by follow_page(), or -ENOENT when there is
+ * no page or the page is reserved.  mm->mmap_sem is held for read.
+ */
+static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
+				const void __user **pages, int *status)
+{
+	unsigned long i;
+
+	down_read(&mm->mmap_sem);
+
+	for (i = 0; i < nr_pages; i++) {
+		unsigned long addr = (unsigned long)(*pages);
+		struct vm_area_struct *vma;
+		struct page *page;
+		int err = -EFAULT;
+
+		vma = find_vma(mm, addr);
+		if (!vma || addr < vma->vm_start)
+			goto set_status;
+
+		page = follow_page(vma, addr, 0);
+
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto set_status;
+
+		err = -ENOENT;
+		/* Use PageReserved to check for zero page */
+		if (!page || PageReserved(page))
+			goto set_status;
+
+		err = page_to_nid(page);
+set_status:
+		*status = err;
+
+		/* both cursors advance together, one entry per address */
+		pages++;
+		status++;
+	}
+
+	up_read(&mm->mmap_sem);
+}
+
+/*
+ * Determine the nodes of a user array of pages and store it in
+ * a user array of status.
+ *
+ * The work is done in chunks of at most DO_PAGES_STAT_CHUNK_NR entries that
+ * are staged in on-stack buffers.  Returns 0 when all @nr_pages entries
+ * were processed, -EFAULT when a user copy failed part-way.
+ */
+static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
+			 const void __user * __user *pages,
+			 int __user *status)
+{
+#define DO_PAGES_STAT_CHUNK_NR 16
+	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
+	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
+
+	while (nr_pages) {
+		unsigned long chunk_nr;
+
+		chunk_nr = nr_pages;
+		if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
+			chunk_nr = DO_PAGES_STAT_CHUNK_NR;
+
+		if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
+			break;
+
+		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
+
+		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
+			break;
+
+		pages += chunk_nr;
+		status += chunk_nr;
+		nr_pages -= chunk_nr;
+	}
+	/* a non-zero remainder means one of the copies above failed */
+	return nr_pages ? -EFAULT : 0;
+}
+
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ *
+ * When @pid is non-zero the target task is looked up with
+ * find_task_by_vpid() instead of using current.  With @nodes == NULL no
+ * page is moved and only @status is filled in (do_pages_stat()).
+ *
+ * Returns -EINVAL for unknown flag bits or a task without an mm, -EPERM
+ * when MPOL_MF_MOVE_ALL is requested without CAP_SYS_NICE or the caller
+ * may not act on the target task, -ESRCH when @pid matches no task, an
+ * error from security_task_movememory(), otherwise the result of
+ * do_pages_move() / do_pages_stat().
+ */
+SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
+		const void __user * __user *, pages,
+		const int __user *, nodes,
+		int __user *, status, int, flags)
+{
+	const struct cred *cred = current_cred(), *tcred;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int err;
+	nodemask_t task_nodes;
+
+	/* Check flags */
+	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+		return -EINVAL;
+
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	/* Find the mm_struct */
+	rcu_read_lock();
+	task = pid ? find_task_by_vpid(pid) : current;
+	if (!task) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+	get_task_struct(task);
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	tcred = __task_cred(task);
+	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
+	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
+	    !capable(CAP_SYS_NICE)) {
+		rcu_read_unlock();
+		err = -EPERM;
+		goto out;
+	}
+	rcu_read_unlock();
+
+	err = security_task_movememory(task);
+	if (err)
+		goto out;
+
+	task_nodes = cpuset_mems_allowed(task);
+	mm = get_task_mm(task);
+	put_task_struct(task);	/* task reference no longer needed past this point */
+
+	if (!mm)
+		return -EINVAL;
+
+	if (nodes)
+		err = do_pages_move(mm, task_nodes, nr_pages, pages,
+				    nodes, status, flags);
+	else
+		err = do_pages_stat(mm, nr_pages, pages, status);
+
+	mmput(mm);
+	return err;
+
+out:
+	/* error exit for paths that still hold the task reference */
+	put_task_struct(task);
+	return err;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if this is a safe migration target node for misplaced NUMA
+ * pages.
Currently it only checks the watermarks, which is crude.
+ * A node qualifies as soon as one populated, reclaimable zone stays above
+ * its high watermark after accounting for @nr_migrate_pages more pages.
+ */
+static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
+				   unsigned long nr_migrate_pages)
+{
+	int z;
+	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+		struct zone *zone = pgdat->node_zones + z;
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (!zone_reclaimable(zone))
+			continue;
+
+		/* Avoid waking kswapd by allocating nr_migrate_pages pages. */
+		if (!zone_watermark_ok(zone, 0,
+				       high_wmark_pages(zone) +
+				       nr_migrate_pages,
+				       0, 0))
+			continue;
+		return true;
+	}
+	return false;
+}
+/*
+ * get_new_page() callback for migrate_misplaced_page(): allocates one
+ * order-0 page on the node passed in @data.  @page and @result are unused.
+ */
+static struct page *alloc_misplaced_dst_page(struct page *page,
+					   unsigned long data,
+					   int **result)
+{
+	int nid = (int) data;
+	struct page *newpage;
+
+	newpage = alloc_pages_exact_node(nid,
+					 (GFP_HIGHUSER_MOVABLE |
+					  __GFP_THISNODE | __GFP_NOMEMALLOC |
+					  __GFP_NORETRY | __GFP_NOWARN) &
+					 ~GFP_IOFS, 0);
+
+	return newpage;
+}
+
+/*
+ * page migration rate limiting control.
+ * Do not migrate more than @ratelimit_pages in a @migrate_interval_millisecs
+ * window of time. Default here says do not migrate more than 1280M per second.
+ */
+static unsigned int migrate_interval_millisecs __read_mostly = 100;
+static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
+
+/* Returns true if the node is migrate rate-limited after the update */
+static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
+					unsigned long nr_pages)
+{
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+	 */
+	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+		/* window expired: restart the page count and open a new window */
+		spin_lock(&pgdat->numabalancing_migrate_lock);
+		pgdat->numabalancing_migrate_nr_pages = 0;
+		pgdat->numabalancing_migrate_next_window = jiffies +
+			msecs_to_jiffies(migrate_interval_millisecs);
+		spin_unlock(&pgdat->numabalancing_migrate_lock);
+	}
+	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
+		trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
+								nr_pages);
+		return true;
+	}
+
+	/*
+	 * This is an unlocked non-atomic update so errors are possible.
+	 * The consequences are failing to migrate when we potentially should
+	 * have which is not severe enough to warrant locking. If it is ever
+	 * a problem, it can be converted to a per-cpu counter.
+	 */
+	pgdat->numabalancing_migrate_nr_pages += nr_pages;
+	return false;
+}
+/*
+ * Take @page off the LRU in preparation for migrating it to @pgdat.
+ * Returns 1 when the page was isolated (the caller's page reference has
+ * then been dropped here), 0 when the target node is too full, the page
+ * could not be isolated, or a huge page has an unexpected reference count.
+ */
+static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+	int page_lru;
+
+	VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+
+	/* Avoid migrating to a node that is nearly full */
+	if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+		return 0;
+
+	if (isolate_lru_page(page))
+		return 0;
+
+	/*
+	 * migrate_misplaced_transhuge_page() skips page migration's usual
+	 * check on page_count(), so we must do it here, now that the page
+	 * has been isolated: a GUP pin, or any other pin, prevents migration.
+	 * The expected page count is 3: 1 for page's mapcount and 1 for the
+	 * caller's pin and 1 for the reference taken by isolate_lru_page().
+	 */
+	if (PageTransHuge(page) && page_count(page) != 3) {
+		putback_lru_page(page);
+		return 0;
+	}
+
+	page_lru = page_is_file_cache(page);
+	mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
+				hpage_nr_pages(page));
+
+	/*
+	 * Isolating the page has taken another reference, so the
+	 * caller's reference can be safely dropped without the page
+	 * disappearing underneath us during migration.
+	 */
+	put_page(page);
+	return 1;
+}
+/*
+ * Returns whether the page that @pmd maps currently has its page lock held.
+ */
+bool pmd_trans_migrating(pmd_t pmd)
+{
+	struct page *page = pmd_page(pmd);
+	return PageLocked(page);
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ *
+ * Returns the non-zero value from numamigrate_isolate_page() when the page
+ * was migrated, 0 otherwise.
+ */
+int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+			   int node)
+{
+	pg_data_t *pgdat = NODE_DATA(node);
+	int isolated;
+	int nr_remaining;
+	LIST_HEAD(migratepages);
+
+	/*
+	 * Don't migrate file pages that are mapped in multiple processes
+	 * with execute permissions as they are probably shared libraries.
+	 */
+	if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+	    (vma->vm_flags & VM_EXEC))
+		goto out;
+
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+	 */
+	if (numamigrate_update_ratelimit(pgdat, 1))
+		goto out;
+
+	isolated = numamigrate_isolate_page(pgdat, page);
+	if (!isolated)
+		goto out;
+
+	list_add(&page->lru, &migratepages);
+	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+				     NULL, node, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED);
+	if (nr_remaining) {
+		/* the page may still be on our private list; put it back on the LRU */
+		if (!list_empty(&migratepages)) {
+			list_del(&page->lru);
+			dec_zone_page_state(page, NR_ISOLATED_ANON +
+					page_is_file_cache(page));
+			putback_lru_page(page);
+		}
+		isolated = 0;
+	} else
+		count_vm_numa_event(NUMA_PAGE_MIGRATE);
+	BUG_ON(!list_empty(&migratepages));
+	return isolated;
+
+out:
+	/* not isolated: drop the caller's reference ourselves */
+	put_page(page);
+	return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+/*
+ * Migrates a THP to a given target node. page must be locked and is unlocked
+ * before returning.
+ *
+ * @pmd/@entry: the pmd slot and the value the caller observed in it; the
+ *		migration is abandoned if *pmd no longer matches @entry.
+ * @address:	faulting address; the huge-page aligned range around it is
+ *		what gets flushed and notified.
+ *
+ * Returns the non-zero value from numamigrate_isolate_page() on success and
+ * 0 on any failure.  On every path @page is unlocked and one reference to
+ * it is dropped.
+ */
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				pmd_t *pmd, pmd_t entry,
+				unsigned long address,
+				struct page *page, int node)
+{
+	spinlock_t *ptl;
+	pg_data_t *pgdat = NODE_DATA(node);
+	int isolated = 0;
+	struct page *new_page = NULL;
+	int page_lru = page_is_file_cache(page);
+	unsigned long mmun_start = address & HPAGE_PMD_MASK;
+	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+	pmd_t orig_entry;
+
+	/*
+	 * Rate-limit the amount of data that is being migrated to a node.
+	 * Optimal placement is no good if the memory bus is saturated and
+	 * all the time is being spent migrating!
+	 */
+	if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
+		goto out_dropref;
+
+	new_page = alloc_pages_node(node,
+		(GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
+		HPAGE_PMD_ORDER);
+	if (!new_page)
+		goto out_fail;
+
+	isolated = numamigrate_isolate_page(pgdat, page);
+	if (!isolated) {
+		put_page(new_page);
+		goto out_fail;
+	}
+
+	if (mm_tlb_flush_pending(mm))
+		flush_tlb_range(vma, mmun_start, mmun_end);
+
+	/* Prepare a page as a migration target */
+	__set_page_locked(new_page);
+	SetPageSwapBacked(new_page);
+
+	/* anon mapping, we can simply copy page->mapping to the new page: */
+	new_page->mapping = page->mapping;
+	new_page->index = page->index;
+	migrate_page_copy(new_page, page);
+	WARN_ON(PageLRU(new_page));
+
+	/* Recheck the target PMD */
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+	ptl = pmd_lock(mm, pmd);
+	if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+fail_putback:	/* also entered from the second page_count() check below */
+		spin_unlock(ptl);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+		/* Reverse changes made by migrate_page_copy() */
+		if (TestClearPageActive(new_page))
+			SetPageActive(page);
+		if (TestClearPageUnevictable(new_page))
+			SetPageUnevictable(page);
+		mlock_migrate_page(page, new_page);
+
+		unlock_page(new_page);
+		put_page(new_page);		/* Free it */
+
+		/* Retake the callers reference and putback on LRU */
+		get_page(page);
+		putback_lru_page(page);
+		mod_zone_page_state(page_zone(page),
+			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
+
+		goto out_unlock;
+	}
+
+	orig_entry = *pmd;	/* kept so the mapping can be rolled back below */
+	entry = mk_pmd(new_page, vma->vm_page_prot);
+	entry = pmd_mkhuge(entry);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+
+	/*
+	 * Clear the old entry under pagetable lock and establish the new PTE.
+	 * Any parallel GUP will either observe the old page blocking on the
+	 * page lock, block on the page table lock or observe the new page.
+	 * The SetPageUptodate on the new page and page_add_new_anon_rmap
+	 * guarantee the copy is visible before the pagetable update.
+	 */
+	flush_cache_range(vma, mmun_start, mmun_end);
+	page_add_anon_rmap(new_page, vma, mmun_start);
+	pmdp_clear_flush_notify(vma, mmun_start, pmd);
+	set_pmd_at(mm, mmun_start, pmd, entry);
+	flush_tlb_range(vma, mmun_start, mmun_end);
+	update_mmu_cache_pmd(vma, address, &entry);
+
+	if (page_count(page) != 2) {
+		/* an extra reference appeared: restore the original pmd and bail out */
+		set_pmd_at(mm, mmun_start, pmd, orig_entry);
+		flush_tlb_range(vma, mmun_start, mmun_end);
+		mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
+		update_mmu_cache_pmd(vma, address, &entry);	/* NOTE(review): passes entry, not orig_entry - confirm */
+		page_remove_rmap(new_page);
+		goto fail_putback;
+	}
+
+	mem_cgroup_migrate(page, new_page, false);
+
+	page_remove_rmap(page);
+
+	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+	/* Take an "isolate" reference and put new page on the LRU. */
+	get_page(new_page);
+	putback_lru_page(new_page);
+
+	unlock_page(new_page);
+	unlock_page(page);
+	put_page(page);			/* Drop the rmap reference */
+	put_page(page);			/* Drop the LRU isolation reference */
+
+	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+	mod_zone_page_state(page_zone(page),
+			NR_ISOLATED_ANON + page_lru,
+			-HPAGE_PMD_NR);
+	return isolated;
+
+out_fail:
+	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+out_dropref:
+	/* nothing was changed: re-apply the vma protections if the pmd is unchanged */
+	ptl = pmd_lock(mm, pmd);
+	if (pmd_same(*pmd, entry)) {
+		entry = pmd_modify(entry, vma->vm_page_prot);
+		set_pmd_at(mm, mmun_start, pmd, entry);
+		update_mmu_cache_pmd(vma, address, &entry);
+	}
+	spin_unlock(ptl);
+
+out_unlock:
+	unlock_page(page);
+	put_page(page);
+	return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#endif /* CONFIG_NUMA */
diff --git a/kernel/mm/mincore.c b/kernel/mm/mincore.c
new file mode 100644
index 000000000..be25efde6
--- /dev/null
+++ b/kernel/mm/mincore.c
@@ -0,0 +1,269 @@
+/*
+ *	linux/mm/mincore.c
+ *
+ * Copyright (C) 1994-2006  Linus Torvalds
+ */
+
+/*
+ * The mincore() system call.
+ *
+ * NOTE(review): the header names of the #include lines below are missing
+ * (the angle-bracket text was lost); restore them before using this patch.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+/*
+ * hugetlb_entry callback of the mincore page walk: writes one byte per
+ * PAGE_SIZE step of [addr, end) into the vector at walk->private (1 if the
+ * huge pte is populated, else 0) and advances walk->private past them.
+ */
+static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
+			unsigned long end, struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+	unsigned char present;
+	unsigned char *vec = walk->private;
+
+	/*
+	 * Hugepages under user process are always in RAM and never
+	 * swapped out, but theoretically it needs to be checked.
+	 */
+	present = pte && !huge_pte_none(huge_ptep_get(pte));
+	for (; addr != end; vec++, addr += PAGE_SIZE)
+		*vec = present;
+	walk->private = vec;
+#else
+	BUG();
+#endif
+	return 0;
+}
+
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e.
that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ *
+ * Returns the PageUptodate() state of the page found at @pgoff in
+ * @mapping, or 0 when no page is found.
+ */
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+{
+	unsigned char present = 0;
+	struct page *page;
+
+	/*
+	 * When tmpfs swaps out a page from a file, any process mapping that
+	 * file will not get a swp_entry_t in its pte, but rather it is like
+	 * any other file mapping (ie. marked !present and faulted in with
+	 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
+	 */
+#ifdef CONFIG_SWAP
+	if (shmem_mapping(mapping)) {
+		page = find_get_entry(mapping, pgoff);
+		/*
+		 * shmem/tmpfs may return swap: account for swapcache
+		 * page too.
+		 */
+		if (radix_tree_exceptional_entry(page)) {
+			swp_entry_t swp = radix_to_swp_entry(page);
+			page = find_get_page(swap_address_space(swp), swp.val);
+		}
+	} else
+		page = find_get_page(mapping, pgoff);
+#else
+	page = find_get_page(mapping, pgoff);
+#endif
+	if (page) {
+		present = PageUptodate(page);
+		page_cache_release(page);	/* drop the reference taken by the lookup */
+	}
+
+	return present;
+}
+/*
+ * Fill @vec for a range that has no page table entries: file-backed vmas
+ * are looked up in the page cache via mincore_page(), anything else is
+ * reported as not resident.  Returns the number of entries written.
+ */
+static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
+				struct vm_area_struct *vma, unsigned char *vec)
+{
+	unsigned long nr = (end - addr) >> PAGE_SHIFT;
+	int i;
+
+	if (vma->vm_file) {
+		pgoff_t pgoff;
+
+		pgoff = linear_page_index(vma, addr);
+		for (i = 0; i < nr; i++, pgoff++)
+			vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+	} else {
+		for (i = 0; i < nr; i++)
+			vec[i] = 0;
+	}
+	return nr;
+}
+/*
+ * pte_hole callback of the mincore page walk: handles [addr, end) through
+ * __mincore_unmapped_range() and advances walk->private by the number of
+ * entries written.
+ */
+static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+				   struct mm_walk *walk)
+{
+	walk->private += __mincore_unmapped_range(addr, end,
+						  walk->vma, walk->private);
+	return 0;
+}
+/*
+ * pmd_entry callback of the mincore page walk: writes one byte per page of
+ * [addr, end) into the vector at walk->private and advances it by that
+ * many entries.  Always returns 0.
+ */
+static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			struct mm_walk *walk)
+{
+	spinlock_t *ptl;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *ptep;
+	unsigned char *vec = walk->private;
+	int nr = (end - addr) >> PAGE_SHIFT;
+
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+		/* huge pmd: every page of the range is reported resident */
+		memset(vec, 1, nr);
+		spin_unlock(ptl);
+		goto out;
+	}
+
+	if (pmd_trans_unstable(pmd)) {
+		__mincore_unmapped_range(addr, end, vma, vec);
+		goto out;
+	}
+
+	ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	for (; addr != end; ptep++, addr += PAGE_SIZE) {
+		pte_t pte = *ptep;
+
+		if (pte_none(pte))
+			__mincore_unmapped_range(addr, addr + PAGE_SIZE,
+						 vma, vec);
+		else if (pte_present(pte))
+			*vec = 1;
+		else { /* pte is a swap entry */
+			swp_entry_t entry = pte_to_swp_entry(pte);
+
+			if (non_swap_entry(entry)) {
+				/*
+				 * migration or hwpoison entries are always
+				 * uptodate
+				 */
+				*vec = 1;
+			} else {
+#ifdef CONFIG_SWAP
+				*vec = mincore_page(swap_address_space(entry),
+						    entry.val);
+#else
+				WARN_ON(1);
+				*vec = 1;
+#endif
+			}
+		}
+		vec++;
+	}
+	pte_unmap_unlock(ptep - 1, ptl);	/* ptep was advanced one past the last entry */
+out:
+	walk->private += nr;
+	cond_resched();
+	return 0;
+}
+
+/*
+ * Do a chunk of "sys_mincore()". We've already checked
+ * all the arguments, we hold the mmap semaphore: we should
+ * just return the amount of info we're asked for.
+ *
+ * Only the vma containing @addr is walked, so fewer than @pages entries
+ * may be produced.  Returns the number of entries written to @vec,
+ * -ENOMEM when @addr is not inside a vma, or a negative value from
+ * walk_page_range().
+ */
+static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
+{
+	struct vm_area_struct *vma;
+	unsigned long end;
+	int err;
+	struct mm_walk mincore_walk = {
+		.pmd_entry = mincore_pte_range,
+		.pte_hole = mincore_unmapped_range,
+		.hugetlb_entry = mincore_hugetlb,
+		.private = vec,
+	};
+
+	vma = find_vma(current->mm, addr);
+	if (!vma || addr < vma->vm_start)
+		return -ENOMEM;
+	mincore_walk.mm = vma->vm_mm;
+	end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+	err = walk_page_range(addr, end, &mincore_walk);
+	if (err < 0)
+		return err;
+	return (end - addr) >> PAGE_SHIFT;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes.
The least significant + * bit of each byte is 1 if the referenced page is in memory, otherwise + * it is zero. + * + * Because the status of a page can change after mincore() checks it + * but before it returns to the application, the returned vector may + * contain stale information. Only locked pages are guaranteed to + * remain in memory. + * + * return values: + * zero - success + * -EFAULT - vec points to an illegal address + * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE + * -ENOMEM - Addresses in the range [addr, addr + len] are + * invalid for the address space of this process, or + * specify one or more pages which are not currently + * mapped + * -EAGAIN - A kernel resource was temporarily unavailable. + */ +SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, + unsigned char __user *, vec) +{ + long retval; + unsigned long pages; + unsigned char *tmp; + + /* Check the start address: needs to be page-aligned.. */ + if (start & ~PAGE_CACHE_MASK) + return -EINVAL; + + /* ..and we need to be passed a valid user-space range */ + if (!access_ok(VERIFY_READ, (void __user *) start, len)) + return -ENOMEM; + + /* This also avoids any overflows on PAGE_CACHE_ALIGN */ + pages = len >> PAGE_SHIFT; + pages += (len & ~PAGE_MASK) != 0; + + if (!access_ok(VERIFY_WRITE, vec, pages)) + return -EFAULT; + + tmp = (void *) __get_free_page(GFP_USER); + if (!tmp) + return -EAGAIN; + + retval = 0; + while (pages) { + /* + * Do at most PAGE_SIZE entries per iteration, due to + * the temporary buffer size. 
+ */ + down_read(¤t->mm->mmap_sem); + retval = do_mincore(start, min(pages, PAGE_SIZE), tmp); + up_read(¤t->mm->mmap_sem); + + if (retval <= 0) + break; + if (copy_to_user(vec, tmp, retval)) { + retval = -EFAULT; + break; + } + pages -= retval; + vec += retval; + start += retval << PAGE_SHIFT; + retval = 0; + } + free_page((unsigned long) tmp); + return retval; +} diff --git a/kernel/mm/mlock.c b/kernel/mm/mlock.c new file mode 100644 index 000000000..6fd2cf15e --- /dev/null +++ b/kernel/mm/mlock.c @@ -0,0 +1,758 @@ +/* + * linux/mm/mlock.c + * + * (C) Copyright 1995 Linus Torvalds + * (C) Copyright 2002 Christoph Hellwig + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +int can_do_mlock(void) +{ + if (rlimit(RLIMIT_MEMLOCK) != 0) + return 1; + if (capable(CAP_IPC_LOCK)) + return 1; + return 0; +} +EXPORT_SYMBOL(can_do_mlock); + +/* + * Mlocked pages are marked with PageMlocked() flag for efficient testing + * in vmscan and, possibly, the fault path; and to support semi-accurate + * statistics. + * + * An mlocked page [PageMlocked(page)] is unevictable. As such, it will + * be placed on the LRU "unevictable" list, rather than the [in]active lists. + * The unevictable list is an LRU sibling list to the [in]active lists. + * PageUnevictable is set to indicate the unevictable state. + * + * When lazy mlocking via vmscan, it is important to ensure that the + * vma's VM_LOCKED status is not concurrently being modified, otherwise we + * may have mlocked a page that is being munlocked. So lazy mlock must take + * the mmap_sem for read, and verify that the vma really is locked + * (see mm/rmap.c). 
+ */
+
+/*
+ *  LRU accounting for clear_page_mlock()
+ *
+ * Clears PageMlocked on @page if it was set, lowers NR_MLOCK by the number
+ * of base pages and, when the page can be isolated, cycles it through
+ * putback_lru_page().  Does nothing if the page was not mlocked.
+ */
+void clear_page_mlock(struct page *page)
+{
+	if (!TestClearPageMlocked(page))
+		return;
+
+	mod_zone_page_state(page_zone(page), NR_MLOCK,
+			    -hpage_nr_pages(page));
+	count_vm_event(UNEVICTABLE_PGCLEARED);
+	if (!isolate_lru_page(page)) {
+		putback_lru_page(page);
+	} else {
+		/*
+		 * We lost the race. the page already moved to evictable list.
+		 */
+		if (PageUnevictable(page))
+			count_vm_event(UNEVICTABLE_PGSTRANDED);
+	}
+}
+
+/*
+ * Mark page as mlocked if not already.
+ * If page on LRU, isolate and putback to move to unevictable list.
+ * The page must be locked by the caller (enforced with BUG_ON).
+ */
+void mlock_vma_page(struct page *page)
+{
+	/* Serialize with page migration */
+	BUG_ON(!PageLocked(page));
+
+	if (!TestSetPageMlocked(page)) {
+		mod_zone_page_state(page_zone(page), NR_MLOCK,
+				    hpage_nr_pages(page));
+		count_vm_event(UNEVICTABLE_PGMLOCKED);
+		if (!isolate_lru_page(page))
+			putback_lru_page(page);
+	}
+}
+
+/*
+ * Isolate a page from LRU with optional get_page() pin.
+ * Assumes lru_lock already held and page already pinned.
+ * Returns true if the page was on the LRU and has been removed from it.
+ */
+static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+{
+	if (PageLRU(page)) {
+		struct lruvec *lruvec;
+
+		lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+		if (getpage)
+			get_page(page);
+		ClearPageLRU(page);
+		del_page_from_lru_list(page, lruvec, page_lru(page));
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Finish munlock after successful page isolation
+ *
+ * Page must be locked. This is a wrapper for try_to_munlock()
+ * and putback_lru_page() with munlock accounting.
+ */
+static void __munlock_isolated_page(struct page *page)
+{
+	int ret = SWAP_AGAIN;
+
+	/*
+	 * Optimization: if the page was mapped just once, that's our mapping
+	 * and we don't need to check all the other vmas.
+	 */
+	if (page_mapcount(page) > 1)
+		ret = try_to_munlock(page);
+
+	/* Did try_to_munlock() succeed or punt? */
+	if (ret != SWAP_MLOCK)
+		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+
+	putback_lru_page(page);
+}
+
+/*
+ * Accounting for page isolation fail during munlock
+ *
+ * Performs accounting when page isolation fails in munlock. There is nothing
+ * else to do because it means some other task has already removed the page
+ * from the LRU. putback_lru_page() will take care of removing the page from
+ * the unevictable list, if necessary. vmscan [page_referenced()] will move
+ * the page back to the unevictable list if some other vma has it mlocked.
+ */
+static void __munlock_isolation_failed(struct page *page)
+{
+	if (PageUnevictable(page))
+		__count_vm_event(UNEVICTABLE_PGSTRANDED);
+	else
+		__count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+}
+
+/**
+ * munlock_vma_page - munlock a vma page
+ * @page - page to be unlocked, either a normal page or THP page head
+ *
+ * returns the size of the page as a page mask (0 for normal page,
+ *         HPAGE_PMD_NR - 1 for THP head page)
+ *
+ * called from munlock()/munmap() path with page supposedly on the LRU.
+ * When we munlock a page, because the vma where we found the page is being
+ * munlock()ed or munmap()ed, we want to check whether other vmas hold the
+ * page locked so that we can leave it on the unevictable lru list and not
+ * bother vmscan with it.  However, to walk the page's rmap list in
+ * try_to_munlock() we must isolate the page from the LRU.  If some other
+ * task has removed the page from the LRU, we won't be able to do that.
+ * So we clear the PageMlocked as we might not get another chance.  If we
+ * can't isolate the page, we leave it for putback_lru_page() and vmscan
+ * [page_referenced()/try_to_unmap()] to deal with.
+ */ +unsigned int munlock_vma_page(struct page *page) +{ + unsigned int nr_pages; + struct zone *zone = page_zone(page); + + /* For try_to_munlock() and to serialize with page migration */ + BUG_ON(!PageLocked(page)); + + /* + * Serialize with any parallel __split_huge_page_refcount() which + * might otherwise copy PageMlocked to part of the tail pages before + * we clear it in the head page. It also stabilizes hpage_nr_pages(). + */ + spin_lock_irq(&zone->lru_lock); + + nr_pages = hpage_nr_pages(page); + if (!TestClearPageMlocked(page)) + goto unlock_out; + + __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); + + if (__munlock_isolate_lru_page(page, true)) { + spin_unlock_irq(&zone->lru_lock); + __munlock_isolated_page(page); + goto out; + } + __munlock_isolation_failed(page); + +unlock_out: + spin_unlock_irq(&zone->lru_lock); + +out: + return nr_pages - 1; +} + +/* + * convert get_user_pages() return value to posix mlock() error + */ +static int __mlock_posix_error_return(long retval) +{ + if (retval == -EFAULT) + retval = -ENOMEM; + else if (retval == -ENOMEM) + retval = -EAGAIN; + return retval; +} + +/* + * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec() + * + * The fast path is available only for evictable pages with single mapping. + * Then we can bypass the per-cpu pvec and get better performance. + * when mapcount > 1 we need try_to_munlock() which can fail. + * when !page_evictable(), we need the full redo logic of putback_lru_page to + * avoid leaving evictable page in unevictable list. + * + * In case of success, @page is added to @pvec and @pgrescued is incremented + * in case that the page was previously unevictable. @page is also unlocked. 
+ */ +static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, + int *pgrescued) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (page_mapcount(page) <= 1 && page_evictable(page)) { + pagevec_add(pvec, page); + if (TestClearPageUnevictable(page)) + (*pgrescued)++; + unlock_page(page); + return true; + } + + return false; +} + +/* + * Putback multiple evictable pages to the LRU + * + * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of + * the pages might have meanwhile become unevictable but that is OK. + */ +static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) +{ + count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); + /* + *__pagevec_lru_add() calls release_pages() so we don't call + * put_page() explicitly + */ + __pagevec_lru_add(pvec); + count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); +} + +/* + * Munlock a batch of pages from the same zone + * + * The work is split to two main phases. First phase clears the Mlocked flag + * and attempts to isolate the pages, all under a single zone lru lock. + * The second phase finishes the munlock only for pages where isolation + * succeeded. + * + * Note that the pagevec may be modified during the process. + */ +static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) +{ + int i; + int nr = pagevec_count(pvec); + int delta_munlocked; + struct pagevec pvec_putback; + int pgrescued = 0; + + pagevec_init(&pvec_putback, 0); + + /* Phase 1: page isolation */ + spin_lock_irq(&zone->lru_lock); + for (i = 0; i < nr; i++) { + struct page *page = pvec->pages[i]; + + if (TestClearPageMlocked(page)) { + /* + * We already have pin from follow_page_mask() + * so we can spare the get_page() here. 
+ */ + if (__munlock_isolate_lru_page(page, false)) + continue; + else + __munlock_isolation_failed(page); + } + + /* + * We won't be munlocking this page in the next phase + * but we still need to release the follow_page_mask() + * pin. We cannot do it under lru_lock however. If it's + * the last pin, __page_cache_release() would deadlock. + */ + pagevec_add(&pvec_putback, pvec->pages[i]); + pvec->pages[i] = NULL; + } + delta_munlocked = -nr + pagevec_count(&pvec_putback); + __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + spin_unlock_irq(&zone->lru_lock); + + /* Now we can release pins of pages that we are not munlocking */ + pagevec_release(&pvec_putback); + + /* Phase 2: page munlock */ + for (i = 0; i < nr; i++) { + struct page *page = pvec->pages[i]; + + if (page) { + lock_page(page); + if (!__putback_lru_fast_prepare(page, &pvec_putback, + &pgrescued)) { + /* + * Slow path. We don't want to lose the last + * pin before unlock_page() + */ + get_page(page); /* for putback_lru_page() */ + __munlock_isolated_page(page); + unlock_page(page); + put_page(page); /* from follow_page_mask() */ + } + } + } + + /* + * Phase 3: page putback for pages that qualified for the fast path + * This will also call put_page() to return pin from follow_page_mask() + */ + if (pagevec_count(&pvec_putback)) + __putback_lru_fast(&pvec_putback, pgrescued); +} + +/* + * Fill up pagevec for __munlock_pagevec using pte walk + * + * The function expects that the struct page corresponding to @start address is + * a non-THP page already pinned and in the @pvec, and that it belongs to @zone. + * + * The rest of @pvec is filled by subsequent pages within the same pmd and same + * zone, as long as the pte's are present and vm_normal_page() succeeds. These + * pages also get pinned. + * + * Returns the address of the next page that should be scanned. This equals + * @start + PAGE_SIZE when no page could be added by the pte walk. 
+ */ +static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, + struct vm_area_struct *vma, int zoneid, unsigned long start, + unsigned long end) +{ + pte_t *pte; + spinlock_t *ptl; + + /* + * Initialize pte walk starting at the already pinned page where we + * are sure that there is a pte, as it was pinned under the same + * mmap_sem write op. + */ + pte = get_locked_pte(vma->vm_mm, start, &ptl); + /* Make sure we do not cross the page table boundary */ + end = pgd_addr_end(start, end); + end = pud_addr_end(start, end); + end = pmd_addr_end(start, end); + + /* The page next to the pinned page is the first we will try to get */ + start += PAGE_SIZE; + while (start < end) { + struct page *page = NULL; + pte++; + if (pte_present(*pte)) + page = vm_normal_page(vma, start, *pte); + /* + * Break if page could not be obtained or the page's node+zone does not + * match + */ + if (!page || page_zone_id(page) != zoneid) + break; + + get_page(page); + /* + * Increase the address that will be returned *before* the + * eventual break due to pvec becoming full by adding the page + */ + start += PAGE_SIZE; + if (pagevec_add(pvec, page) == 0) + break; + } + pte_unmap_unlock(pte, ptl); + return start; +} + +/* + * munlock_vma_pages_range() - munlock all pages in the vma range. + * @vma - vma containing range to be munlock()ed. + * @start - start address in @vma of the range + * @end - end of range in @vma. + * + * For mremap(), munmap() and exit(). + * + * Called with @vma VM_LOCKED. + * + * Returns with VM_LOCKED cleared. Callers must be prepared to + * deal with this. + * + * We don't save and restore VM_LOCKED here because pages are + * still on lru. In unmap path, pages might be scanned by reclaim + * and re-mlocked by try_to_{munlock|unmap} before we unmap and + * free them. This will result in freeing mlocked pages. 
+ */ +void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + vma->vm_flags &= ~VM_LOCKED; + + while (start < end) { + struct page *page = NULL; + unsigned int page_mask; + unsigned long page_increm; + struct pagevec pvec; + struct zone *zone; + int zoneid; + + pagevec_init(&pvec, 0); + /* + * Although FOLL_DUMP is intended for get_dump_page(), + * it just so happens that its special treatment of the + * ZERO_PAGE (returning an error instead of doing get_page) + * suits munlock very well (and if somehow an abnormal page + * has sneaked into the range, we won't oops here: great). + */ + page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, + &page_mask); + + if (page && !IS_ERR(page)) { + if (PageTransHuge(page)) { + lock_page(page); + /* + * Any THP page found by follow_page_mask() may + * have gotten split before reaching + * munlock_vma_page(), so we need to recompute + * the page_mask here. + */ + page_mask = munlock_vma_page(page); + unlock_page(page); + put_page(page); /* follow_page_mask() */ + } else { + /* + * Non-huge pages are handled in batches via + * pagevec. The pin from follow_page_mask() + * prevents them from collapsing by THP. + */ + pagevec_add(&pvec, page); + zone = page_zone(page); + zoneid = page_zone_id(page); + + /* + * Try to fill the rest of pagevec using fast + * pte walk. This will also update start to + * the next page to process. Then munlock the + * pagevec. + */ + start = __munlock_pagevec_fill(&pvec, vma, + zoneid, start, end); + __munlock_pagevec(&pvec, zone); + goto next; + } + } + /* It's a bug to munlock in the middle of a THP page */ + VM_BUG_ON((start >> PAGE_SHIFT) & page_mask); + page_increm = 1 + page_mask; + start += page_increm * PAGE_SIZE; +next: + cond_resched(); + } +} + +/* + * mlock_fixup - handle mlock[all]/munlock[all] requests. + * + * Filters out "special" vmas -- VM_LOCKED never gets set for these, and + * munlock is a no-op. 
However, for some special vmas, we go ahead and + * populate the ptes. + * + * For vmas that pass the filters, merge/split as appropriate. + */ +static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, vm_flags_t newflags) +{ + struct mm_struct *mm = vma->vm_mm; + pgoff_t pgoff; + int nr_pages; + int ret = 0; + int lock = !!(newflags & VM_LOCKED); + + if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) + goto out; /* don't set VM_LOCKED, don't count */ + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma)); + if (*prev) { + vma = *prev; + goto success; + } + + if (start != vma->vm_start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + goto out; + } + + if (end != vma->vm_end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + goto out; + } + +success: + /* + * Keep track of amount of locked VM. + */ + nr_pages = (end - start) >> PAGE_SHIFT; + if (!lock) + nr_pages = -nr_pages; + mm->locked_vm += nr_pages; + + /* + * vm_flags is protected by the mmap_sem held in write mode. + * It's okay if try_to_unmap_one unmaps a page just after we + * set VM_LOCKED, populate_vma_page_range will bring it back. 
+ */ + + if (lock) + vma->vm_flags = newflags; + else + munlock_vma_pages_range(vma, start, end); + +out: + *prev = vma; + return ret; +} + +static int do_mlock(unsigned long start, size_t len, int on) +{ + unsigned long nstart, end, tmp; + struct vm_area_struct * vma, * prev; + int error; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + if (end < start) + return -EINVAL; + if (end == start) + return 0; + vma = find_vma(current->mm, start); + if (!vma || vma->vm_start > start) + return -ENOMEM; + + prev = vma->vm_prev; + if (start > vma->vm_start) + prev = vma; + + for (nstart = start ; ; ) { + vm_flags_t newflags; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + + newflags = vma->vm_flags & ~VM_LOCKED; + if (on) + newflags |= VM_LOCKED; + + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + error = mlock_fixup(vma, &prev, nstart, tmp, newflags); + if (error) + break; + nstart = tmp; + if (nstart < prev->vm_end) + nstart = prev->vm_end; + if (nstart >= end) + break; + + vma = prev->vm_next; + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + break; + } + } + return error; +} + +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) +{ + unsigned long locked; + unsigned long lock_limit; + int error = -ENOMEM; + + if (!can_do_mlock()) + return -EPERM; + + lru_add_drain_all(); /* flush pagevec */ + + len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + start &= PAGE_MASK; + + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = len >> PAGE_SHIFT; + + down_write(&current->mm->mmap_sem); + + locked += current->mm->locked_vm; + + /* check against resource limits */ + if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) + error = do_mlock(start, len, 1); + + up_write(&current->mm->mmap_sem); + if (error) + return error; + + error = __mm_populate(start, len, 0); + if (error) + return __mlock_posix_error_return(error); + return 0; +} + +SYSCALL_DEFINE2(munlock, unsigned long, start, 
size_t, len) +{ + int ret; + + len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + start &= PAGE_MASK; + + down_write(&current->mm->mmap_sem); + ret = do_mlock(start, len, 0); + up_write(&current->mm->mmap_sem); + + return ret; +} + +static int do_mlockall(int flags) +{ + struct vm_area_struct * vma, * prev = NULL; + + if (flags & MCL_FUTURE) + current->mm->def_flags |= VM_LOCKED; + else + current->mm->def_flags &= ~VM_LOCKED; + if (flags == MCL_FUTURE) + goto out; + + for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { + vm_flags_t newflags; + + newflags = vma->vm_flags & ~VM_LOCKED; + if (flags & MCL_CURRENT) + newflags |= VM_LOCKED; + + /* Ignore errors */ + mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + cond_resched_rcu_qs(); + } +out: + return 0; +} + +SYSCALL_DEFINE1(mlockall, int, flags) +{ + unsigned long lock_limit; + int ret = -EINVAL; + + if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) + goto out; + + ret = -EPERM; + if (!can_do_mlock()) + goto out; + + if (flags & MCL_CURRENT) + lru_add_drain_all(); /* flush pagevec */ + + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + + ret = -ENOMEM; + down_write(&current->mm->mmap_sem); + + if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || + capable(CAP_IPC_LOCK)) + ret = do_mlockall(flags); + up_write(&current->mm->mmap_sem); + if (!ret && (flags & MCL_CURRENT)) + mm_populate(0, TASK_SIZE); +out: + return ret; +} + +SYSCALL_DEFINE0(munlockall) +{ + int ret; + + down_write(&current->mm->mmap_sem); + ret = do_mlockall(0); + up_write(&current->mm->mmap_sem); + return ret; +} + +/* + * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB + * shm segments) get accounted against the user_struct instead. 
+ */ +static DEFINE_SPINLOCK(shmlock_user_lock); + +int user_shm_lock(size_t size, struct user_struct *user) +{ + unsigned long lock_limit, locked; + int allowed = 0; + + locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + lock_limit = rlimit(RLIMIT_MEMLOCK); + if (lock_limit == RLIM_INFINITY) + allowed = 1; + lock_limit >>= PAGE_SHIFT; + spin_lock(&shmlock_user_lock); + if (!allowed && + locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) + goto out; + get_uid(user); + user->locked_shm += locked; + allowed = 1; +out: + spin_unlock(&shmlock_user_lock); + return allowed; +} + +void user_shm_unlock(size_t size, struct user_struct *user) +{ + spin_lock(&shmlock_user_lock); + user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + spin_unlock(&shmlock_user_lock); + free_uid(user); +} diff --git a/kernel/mm/mm_init.c b/kernel/mm/mm_init.c new file mode 100644 index 000000000..5f420f7fa --- /dev/null +++ b/kernel/mm/mm_init.c @@ -0,0 +1,205 @@ +/* + * mm_init.c - Memory initialisation verification and debugging + * + * Copyright 2008 IBM Corporation, 2008 + * Author Mel Gorman + * + */ +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#ifdef CONFIG_DEBUG_MEMORY_INIT +int __meminitdata mminit_loglevel; + +#ifndef SECTIONS_SHIFT +#define SECTIONS_SHIFT 0 +#endif + +/* The zonelists are simply reported, validation is manual. 
*/ +void __init mminit_verify_zonelist(void) +{ + int nid; + + if (mminit_loglevel < MMINIT_VERIFY) + return; + + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + struct zoneref *z; + struct zonelist *zonelist; + int i, listid, zoneid; + + BUG_ON(MAX_ZONELISTS > 2); + for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { + + /* Identify the zone and nodelist */ + zoneid = i % MAX_NR_ZONES; + listid = i / MAX_NR_ZONES; + zonelist = &pgdat->node_zonelists[listid]; + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Print information about the zonelist */ + printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ", + listid > 0 ? "thisnode" : "general", nid, + zone->name); + + /* Iterate the zonelist */ + for_each_zone_zonelist(zone, z, zonelist, zoneid) { +#ifdef CONFIG_NUMA + printk(KERN_CONT "%d:%s ", + zone->node, zone->name); +#else + printk(KERN_CONT "0:%s ", zone->name); +#endif /* CONFIG_NUMA */ + } + printk(KERN_CONT "\n"); + } + } +} + +void __init mminit_verify_pageflags_layout(void) +{ + int shift, width; + unsigned long or_mask, add_mask; + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", + "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + LAST_CPUPID_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d Lastcpupid %d\n", + SECTIONS_SHIFT, + NODES_SHIFT, + ZONES_SHIFT, + LAST_CPUPID_SHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", + "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", + (unsigned long)SECTIONS_PGSHIFT, + (unsigned long)NODES_PGSHIFT, + (unsigned long)ZONES_PGSHIFT, + (unsigned long)LAST_CPUPID_PGSHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", + "Node/Zone ID: %lu -> %lu\n", + (unsigned long)(ZONEID_PGOFF 
+ ZONEID_SHIFT), + (unsigned long)ZONEID_PGOFF); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", + "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n", + shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); +#ifdef NODE_NOT_IN_PAGE_FLAGS + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", + "Node not in page flags"); +#endif +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", + "Last cpupid not in page flags"); +#endif + + if (SECTIONS_WIDTH) { + shift -= SECTIONS_WIDTH; + BUG_ON(shift != SECTIONS_PGSHIFT); + } + if (NODES_WIDTH) { + shift -= NODES_WIDTH; + BUG_ON(shift != NODES_PGSHIFT); + } + if (ZONES_WIDTH) { + shift -= ZONES_WIDTH; + BUG_ON(shift != ZONES_PGSHIFT); + } + + /* Check for bitmask overlaps */ + or_mask = (ZONES_MASK << ZONES_PGSHIFT) | + (NODES_MASK << NODES_PGSHIFT) | + (SECTIONS_MASK << SECTIONS_PGSHIFT); + add_mask = (ZONES_MASK << ZONES_PGSHIFT) + + (NODES_MASK << NODES_PGSHIFT) + + (SECTIONS_MASK << SECTIONS_PGSHIFT); + BUG_ON(or_mask != add_mask); +} + +void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, + unsigned long nid, unsigned long pfn) +{ + BUG_ON(page_to_nid(page) != nid); + BUG_ON(page_zonenum(page) != zone); + BUG_ON(page_to_pfn(page) != pfn); +} + +static __init int set_mminit_loglevel(char *str) +{ + get_option(&str, &mminit_loglevel); + return 0; +} +early_param("mminit_loglevel", set_mminit_loglevel); +#endif /* CONFIG_DEBUG_MEMORY_INIT */ + +struct kobject *mm_kobj; +EXPORT_SYMBOL_GPL(mm_kobj); + +#ifdef CONFIG_SMP +s32 vm_committed_as_batch = 32; + +static void __meminit mm_compute_batch(void) +{ + u64 memsized_batch; + s32 nr = num_present_cpus(); + s32 batch = max_t(s32, nr*2, 32); + + /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ + memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); + + vm_committed_as_batch = max_t(s32, memsized_batch, batch); +} + +static int __meminit 
mm_compute_batch_notifier(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_ONLINE: + case MEM_OFFLINE: + mm_compute_batch(); + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block compute_batch_nb __meminitdata = { + .notifier_call = mm_compute_batch_notifier, + .priority = IPC_CALLBACK_PRI, /* use lowest priority */ +}; + +static int __init mm_compute_batch_init(void) +{ + mm_compute_batch(); + register_hotmemory_notifier(&compute_batch_nb); + + return 0; +} + +__initcall(mm_compute_batch_init); + +#endif + +static int __init mm_sysfs_init(void) +{ + mm_kobj = kobject_create_and_add("mm", kernel_kobj); + if (!mm_kobj) + return -ENOMEM; + + return 0; +} +postcore_initcall(mm_sysfs_init); diff --git a/kernel/mm/mmap.c b/kernel/mm/mmap.c new file mode 100644 index 000000000..bb50cacc3 --- /dev/null +++ b/kernel/mm/mmap.c @@ -0,0 +1,3396 @@ +/* + * mm/mmap.c + * + * Written by obz. + * + * Address space accounting code + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "internal.h" + +#ifndef arch_mmap_check +#define arch_mmap_check(addr, len, flags) (0) +#endif + +#ifndef arch_rebalance_pgtables +#define arch_rebalance_pgtables(addr, len) (addr) +#endif + +static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end); + +/* description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. 
The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + */ +pgprot_t protection_map[16] = { + __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, + __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 +}; + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + return __pgprot(pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | + pgprot_val(arch_vm_get_page_prot(vm_flags))); +} +EXPORT_SYMBOL(vm_get_page_prot); + +static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) +{ + return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); +} + +/* Update vma->vm_page_prot to reflect vma->vm_flags. */ +void vma_set_page_prot(struct vm_area_struct *vma) +{ + unsigned long vm_flags = vma->vm_flags; + + vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); + if (vma_wants_writenotify(vma)) { + vm_flags &= ~VM_SHARED; + vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, + vm_flags); + } +} + + +int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ +int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ +/* + * Make sure vm_committed_as in one cacheline and not cacheline shared with + * other variables. It can be updated by several CPUs frequently. 
+ */ +struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; + +/* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. + */ +unsigned long vm_memory_committed(void) +{ + return percpu_counter_read_positive(&vm_committed_as); +} +EXPORT_SYMBOL_GPL(vm_memory_committed); + +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. + * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. 
+ */ +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +{ + long free, allowed, reserve; + + VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus(), + "memory commitment underflow"); + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); + + free += get_nr_swap_pages(); + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += global_page_state(NR_SLAB_RECLAIMABLE); + + /* + * Leave reserved pages. The pages are not for anonymous pages. 
+ */ + if (free <= totalreserve_pages) + goto error; + else + free -= totalreserve_pages; + + /* + * Reserve some for root + */ + if (!cap_sys_admin) + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + if (free > pages) + return 0; + + goto error; + } + + allowed = vm_commit_limit(); + /* + * Reserve some for root + */ + if (!cap_sys_admin) + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min_t(long, mm->total_vm / 32, reserve); + } + + if (percpu_counter_read_positive(&vm_committed_as) < allowed) + return 0; +error: + vm_unacct_memory(pages); + + return -ENOMEM; +} + +/* + * Requires inode->i_mapping->i_mmap_rwsem + */ +static void __remove_shared_vm_struct(struct vm_area_struct *vma, + struct file *file, struct address_space *mapping) +{ + if (vma->vm_flags & VM_DENYWRITE) + atomic_inc(&file_inode(file)->i_writecount); + if (vma->vm_flags & VM_SHARED) + mapping_unmap_writable(mapping); + + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * Unlink a file-based vm structure from its interval tree, to hide + * vma from rmap and vmtruncate before freeing its page tables. + */ +void unlink_file_vma(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + + if (file) { + struct address_space *mapping = file->f_mapping; + i_mmap_lock_write(mapping); + __remove_shared_vm_struct(vma, file, mapping); + i_mmap_unlock_write(mapping); + } +} + +/* + * Close a vm structure and free it, returning the next. 
+ */ +static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) +{ + struct vm_area_struct *next = vma->vm_next; + + might_sleep(); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); + kmem_cache_free(vm_area_cachep, vma); + return next; +} + +static unsigned long do_brk(unsigned long addr, unsigned long len); + +SYSCALL_DEFINE1(brk, unsigned long, brk) +{ + unsigned long retval; + unsigned long newbrk, oldbrk; + struct mm_struct *mm = current->mm; + unsigned long min_brk; + bool populate; + + down_write(&mm->mmap_sem); + +#ifdef CONFIG_COMPAT_BRK + /* + * CONFIG_COMPAT_BRK can still be overridden by setting + * randomize_va_space to 2, which will still cause mm->start_brk + * to be arbitrarily shifted + */ + if (current->brk_randomized) + min_brk = mm->start_brk; + else + min_brk = mm->end_data; +#else + min_brk = mm->start_brk; +#endif + if (brk < min_brk) + goto out; + + /* + * Check against rlimit here. If this check is done later after the test + * of oldbrk with newbrk then it can escape the test and let the data + * segment grow beyond its set limit in the case where the limit is + * not page aligned -Ram Gupta + */ + if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, + mm->end_data, mm->start_data)) + goto out; + + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) + goto set_brk; + + /* Always allow shrinking brk. */ + if (brk <= mm->brk) { + if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + goto set_brk; + goto out; + } + + /* Check against existing mmap mappings. */ + if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + goto out; + + /* Ok, looks good - let it rip. 
*/ + if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + goto out; + +set_brk: + mm->brk = brk; + populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; + up_write(&mm->mmap_sem); + if (populate) + mm_populate(oldbrk, newbrk - oldbrk); + return brk; + +out: + retval = mm->brk; + up_write(&mm->mmap_sem); + return retval; +} + +static long vma_compute_subtree_gap(struct vm_area_struct *vma) +{ + unsigned long max, subtree_gap; + max = vma->vm_start; + if (vma->vm_prev) + max -= vma->vm_prev->vm_end; + if (vma->vm_rb.rb_left) { + subtree_gap = rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb)->rb_subtree_gap; + if (subtree_gap > max) + max = subtree_gap; + } + if (vma->vm_rb.rb_right) { + subtree_gap = rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb)->rb_subtree_gap; + if (subtree_gap > max) + max = subtree_gap; + } + return max; +} + +#ifdef CONFIG_DEBUG_VM_RB +static int browse_rb(struct rb_root *root) +{ + int i = 0, j, bug = 0; + struct rb_node *nd, *pn = NULL; + unsigned long prev = 0, pend = 0; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + struct vm_area_struct *vma; + vma = rb_entry(nd, struct vm_area_struct, vm_rb); + if (vma->vm_start < prev) { + pr_emerg("vm_start %lx < prev %lx\n", + vma->vm_start, prev); + bug = 1; + } + if (vma->vm_start < pend) { + pr_emerg("vm_start %lx < pend %lx\n", + vma->vm_start, pend); + bug = 1; + } + if (vma->vm_start > vma->vm_end) { + pr_emerg("vm_start %lx > vm_end %lx\n", + vma->vm_start, vma->vm_end); + bug = 1; + } + if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { + pr_emerg("free gap %lx, correct %lx\n", + vma->rb_subtree_gap, + vma_compute_subtree_gap(vma)); + bug = 1; + } + i++; + pn = nd; + prev = vma->vm_start; + pend = vma->vm_end; + } + j = 0; + for (nd = pn; nd; nd = rb_prev(nd)) + j++; + if (i != j) { + pr_emerg("backwards %d, forwards %d\n", j, i); + bug = 1; + } + return bug ? 
-1 : i; +} + +static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) +{ + struct rb_node *nd; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + struct vm_area_struct *vma; + vma = rb_entry(nd, struct vm_area_struct, vm_rb); + VM_BUG_ON_VMA(vma != ignore && + vma->rb_subtree_gap != vma_compute_subtree_gap(vma), + vma); + } +} + +static void validate_mm(struct mm_struct *mm) +{ + int bug = 0; + int i = 0; + unsigned long highest_address = 0; + struct vm_area_struct *vma = mm->mmap; + + while (vma) { + struct anon_vma_chain *avc; + + vma_lock_anon_vma(vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + vma_unlock_anon_vma(vma); + highest_address = vma->vm_end; + vma = vma->vm_next; + i++; + } + if (i != mm->map_count) { + pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); + bug = 1; + } + if (highest_address != mm->highest_vm_end) { + pr_emerg("mm->highest_vm_end %lx, found %lx\n", + mm->highest_vm_end, highest_address); + bug = 1; + } + i = browse_rb(&mm->mm_rb); + if (i != mm->map_count) { + if (i != -1) + pr_emerg("map_count %d rb %d\n", mm->map_count, i); + bug = 1; + } + VM_BUG_ON_MM(bug, mm); +} +#else +#define validate_mm_rb(root, ignore) do { } while (0) +#define validate_mm(mm) do { } while (0) +#endif + +RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, + unsigned long, rb_subtree_gap, vma_compute_subtree_gap) + +/* + * Update augmented rbtree rb_subtree_gap values after vma->vm_start or + * vma->vm_prev->vm_end values changed, without modifying the vma's position + * in the rbtree. + */ +static void vma_gap_update(struct vm_area_struct *vma) +{ + /* + * As it turns out, RB_DECLARE_CALLBACKS() already created a callback + * function that does exacltly what we want. 
+ */ + vma_gap_callbacks_propagate(&vma->vm_rb, NULL); +} + +static inline void vma_rb_insert(struct vm_area_struct *vma, + struct rb_root *root) +{ + /* All rb_subtree_gap values must be consistent prior to insertion */ + validate_mm_rb(root, NULL); + + rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); +} + +static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) +{ + /* + * All rb_subtree_gap values must be consistent prior to erase, + * with the possible exception of the vma being erased. + */ + validate_mm_rb(root, vma); + + /* + * Note rb_erase_augmented is a fairly large inline function, + * so make sure we instantiate it only once with our desired + * augmented rbtree callbacks. + */ + rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); +} + +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_sem and by + * the root anon_vma's mutex. 
+ */
+static inline void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+	struct anon_vma_chain *avc;
+
+	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+		anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static inline void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+	struct anon_vma_chain *avc;
+
+	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
+/*
+ * Walk the vma rbtree to find where a new vma covering [addr, end) would be
+ * linked.
+ *
+ * Returns -ENOMEM if an existing vma overlaps that range.  Otherwise returns
+ * 0 with *pprev set to the last vma passed on the way down whose vm_end is
+ * at or below @addr (NULL if there is none), and *rb_link / *rb_parent set
+ * to the empty link slot and its parent node.
+ */
+static int find_vma_links(struct mm_struct *mm, unsigned long addr,
+		unsigned long end, struct vm_area_struct **pprev,
+		struct rb_node ***rb_link, struct rb_node **rb_parent)
+{
+	struct rb_node **__rb_link, *__rb_parent, *rb_prev;
+
+	__rb_link = &mm->mm_rb.rb_node;
+	rb_prev = __rb_parent = NULL;
+
+	while (*__rb_link) {
+		struct vm_area_struct *vma_tmp;
+
+		__rb_parent = *__rb_link;
+		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
+
+		if (vma_tmp->vm_end > addr) {
+			/* Fail if an existing vma overlaps the area */
+			if (vma_tmp->vm_start < end)
+				return -ENOMEM;
+			__rb_link = &__rb_parent->rb_left;
+		} else {
+			rb_prev = __rb_parent;
+			__rb_link = &__rb_parent->rb_right;
+		}
+	}
+
+	*pprev = NULL;
+	if (rb_prev)
+		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+	*rb_link = __rb_link;
+	*rb_parent = __rb_parent;
+	return 0;
+}
+
+/*
+ * Count how many pages of [addr, end) are covered by existing vmas of @mm.
+ */
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+		unsigned long addr, unsigned long end)
+{
+	unsigned long nr_pages = 0;
+	struct vm_area_struct *vma;
+
+	/* Find first overlapping mapping */
+	vma = find_vma_intersection(mm, addr, end);
+	if (!vma)
+		return 0;
+
+	nr_pages = (min(end, vma->vm_end) -
+		max(addr, vma->vm_start)) >> PAGE_SHIFT;
+
+	/* Iterate over the rest of the overlaps */
+	for (vma = vma->vm_next; vma; vma = vma->vm_next) {
+		unsigned long overlap_len;
+
+		/* A vma starting exactly at @end contributes zero below. */
+		if (vma->vm_start > end)
+			break;
+
+		overlap_len = min(end, vma->vm_end) - vma->vm_start;
+		nr_pages += overlap_len >> PAGE_SHIFT;
+	}
+
+	return nr_pages;
+}
+
+/*
+ * Link @vma into the mm's rbtree at the given rb_link/rb_parent slot and
+ * bring the gap tracking (rb_subtree_gap, mm->highest_vm_end) up to date.
+ * vma->vm_next is read here, so the list linkage must already be set.
+ */
+void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct rb_node **rb_link, struct rb_node *rb_parent)
+{
+	/* Update tracking information for the gap following the new vma. */
+	if (vma->vm_next)
+		vma_gap_update(vma->vm_next);
+	else
+		mm->highest_vm_end = vma->vm_end;
+
+	/*
+	 * vma->vm_prev wasn't known when we followed the rbtree to find the
+	 * correct insertion point for that vma. As a result, we could not
+	 * update the vma vm_rb parents rb_subtree_gap values on the way down.
+	 * So, we first insert the vma with a zero rb_subtree_gap value
+	 * (to be consistent with what we did on the way down), and then
+	 * immediately update the gap to the correct value. Finally we
+	 * rebalance the rbtree after all augmented values have been set.
+	 */
+	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
+	vma->rb_subtree_gap = 0;
+	vma_gap_update(vma);
+	vma_rb_insert(vma, &mm->mm_rb);
+}
+
+/*
+ * For a file-backed vma: decrement the inode's i_writecount if the vma is
+ * VM_DENYWRITE, increment the mapping's i_mmap_writable if it is VM_SHARED,
+ * and insert the vma into the mapping's i_mmap interval tree.  Does nothing
+ * when the vma has no vm_file.
+ */
+static void __vma_link_file(struct vm_area_struct *vma)
+{
+	struct file *file;
+
+	file = vma->vm_file;
+	if (file) {
+		struct address_space *mapping = file->f_mapping;
+
+		if (vma->vm_flags & VM_DENYWRITE)
+			atomic_dec(&file_inode(file)->i_writecount);
+		if (vma->vm_flags & VM_SHARED)
+			atomic_inc(&mapping->i_mmap_writable);
+
+		flush_dcache_mmap_lock(mapping);
+		vma_interval_tree_insert(vma, &mapping->i_mmap);
+		flush_dcache_mmap_unlock(mapping);
+	}
+}
+
+/* Link @vma into the mm via __vma_link_list() and then __vma_link_rb(). */
+static void
+__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
+	struct vm_area_struct *prev, struct rb_node **rb_link,
+	struct rb_node *rb_parent)
+{
+	__vma_link_list(mm, vma, prev, rb_parent);
+	__vma_link_rb(mm, vma, rb_link, rb_parent);
+}
+
+/*
+ * Link a new vma into the mm and, if it is file-backed, into the file's
+ * i_mmap tree, holding the mapping's i_mmap write lock across both steps.
+ * Increments mm->map_count afterwards.
+ */
+static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
+			struct vm_area_struct *prev, struct rb_node **rb_link,
+			struct rb_node *rb_parent)
+{
+	struct address_space *mapping = NULL;
+
+	if (vma->vm_file) {
+		mapping = vma->vm_file->f_mapping;
+		i_mmap_lock_write(mapping);
+	}
+
+	__vma_link(mm, vma, prev, rb_link, rb_parent);
+	__vma_link_file(vma);
+
+	if (mapping)
+		i_mmap_unlock_write(mapping);
+
+	mm->map_count++;
+	validate_mm(mm);
+}
+
+/*
+ * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
+ * mm's list and rbtree.  It has already been inserted into the interval tree.
+ */
+static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	struct vm_area_struct *prev;
+	struct rb_node **rb_link, *rb_parent;
+
+	/* An overlap with an existing vma here is treated as a fatal bug. */
+	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+			   &prev, &rb_link, &rb_parent))
+		BUG();
+	__vma_link(mm, vma, prev, rb_link, rb_parent);
+	mm->map_count++;
+}
+
+/*
+ * Remove @vma from the rbtree and from the vm_next/vm_prev list following
+ * @prev, then invalidate the vma cache.  @prev is dereferenced
+ * unconditionally, so it must not be NULL.
+ */
+static inline void
+__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct vm_area_struct *prev)
+{
+	struct vm_area_struct *next;
+
+	vma_rb_erase(vma, &mm->mm_rb);
+	prev->vm_next = next = vma->vm_next;
+	if (next)
+		next->vm_prev = prev;
+
+	/* Kill the cache */
+	vmacache_invalidate(mm);
+}
+
+/*
+ * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
+ * is already present in an i_mmap tree without adjusting the tree.
+ * The following helper function should be used when such adjustments
+ * are necessary.  The "insert" vma (if any) is to be inserted
+ * before we drop the necessary locks.
+ *
+ * Returns 0 on success, or the error from anon_vma_clone() if the
+ * importing vma could not be given the exporter's anon_vma; in that case
+ * nothing else has been modified yet.
+ */
+int vma_adjust(struct vm_area_struct *vma, unsigned long start,
+	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct *next = vma->vm_next;
+	struct vm_area_struct *importer = NULL;
+	struct address_space *mapping = NULL;
+	struct rb_root *root = NULL;
+	struct anon_vma *anon_vma = NULL;
+	struct file *file = vma->vm_file;
+	bool start_changed = false, end_changed = false;
+	long adjust_next = 0;
+	int remove_next = 0;
+
+	if (next && !insert) {
+		struct vm_area_struct *exporter = NULL;
+
+		if (end >= next->vm_end) {
+			/*
+			 * vma expands, overlapping all the next, and
+			 * perhaps the one after too (mprotect case 6).
+			 * remove_next becomes 2 in that latter case.
+			 */
+again:			remove_next = 1 + (end > next->vm_end);
+			end = next->vm_end;
+			exporter = next;
+			importer = vma;
+		} else if (end > next->vm_start) {
+			/*
+			 * vma expands, overlapping part of the next:
+			 * mprotect case 5 shifting the boundary up.
+			 */
+			adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
+			exporter = next;
+			importer = vma;
+		} else if (end < vma->vm_end) {
+			/*
+			 * vma shrinks, and !insert tells it's not
+			 * split_vma inserting another: so it must be
+			 * mprotect case 4 shifting the boundary down.
+			 */
+			adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
+			exporter = vma;
+			importer = next;
+		}
+
+		/*
+		 * Easily overlooked: when mprotect shifts the boundary,
+		 * make sure the expanding vma has anon_vma set if the
+		 * shrinking vma had, to cover any anon pages imported.
+		 */
+		if (exporter && exporter->anon_vma && !importer->anon_vma) {
+			int error;
+
+			importer->anon_vma = exporter->anon_vma;
+			error = anon_vma_clone(importer, exporter);
+			if (error)
+				return error;
+		}
+	}
+
+	if (file) {
+		mapping = file->f_mapping;
+		root = &mapping->i_mmap;
+		uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+
+		if (adjust_next)
+			uprobe_munmap(next, next->vm_start, next->vm_end);
+
+		i_mmap_lock_write(mapping);
+		if (insert) {
+			/*
+			 * Put into interval tree now, so instantiated pages
+			 * are visible to arm/parisc __flush_dcache_page
+			 * throughout; but we cannot insert into address
+			 * space until vma start or end is updated.
+			 */
+			__vma_link_file(insert);
+		}
+	}
+
+	vma_adjust_trans_huge(vma, start, end, adjust_next);
+
+	anon_vma = vma->anon_vma;
+	if (!anon_vma && adjust_next)
+		anon_vma = next->anon_vma;
+	if (anon_vma) {
+		VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
+			  anon_vma != next->anon_vma, next);
+		anon_vma_lock_write(anon_vma);
+		anon_vma_interval_tree_pre_update_vma(vma);
+		if (adjust_next)
+			anon_vma_interval_tree_pre_update_vma(next);
+	}
+
+	/* Take the vmas out of the file's interval tree while they change. */
+	if (root) {
+		flush_dcache_mmap_lock(mapping);
+		vma_interval_tree_remove(vma, root);
+		if (adjust_next)
+			vma_interval_tree_remove(next, root);
+	}
+
+	if (start != vma->vm_start) {
+		vma->vm_start = start;
+		start_changed = true;
+	}
+	if (end != vma->vm_end) {
+		vma->vm_end = end;
+		end_changed = true;
+	}
+	vma->vm_pgoff = pgoff;
+	if (adjust_next) {
+		next->vm_start += adjust_next << PAGE_SHIFT;
+		next->vm_pgoff += adjust_next;
+	}
+
+	if (root) {
+		if (adjust_next)
+			vma_interval_tree_insert(next, root);
+		vma_interval_tree_insert(vma, root);
+		flush_dcache_mmap_unlock(mapping);
+	}
+
+	if (remove_next) {
+		/*
+		 * vma_merge has merged next into vma, and needs
+		 * us to remove next before dropping the locks.
+		 */
+		__vma_unlink(mm, next, vma);
+		if (file)
+			__remove_shared_vm_struct(next, file, mapping);
+	} else if (insert) {
+		/*
+		 * split_vma has split insert from vma, and needs
+		 * us to insert it before dropping the locks
+		 * (it may either follow vma or precede it).
+		 */
+		__insert_vm_struct(mm, insert);
+	} else {
+		if (start_changed)
+			vma_gap_update(vma);
+		if (end_changed) {
+			if (!next)
+				mm->highest_vm_end = end;
+			else if (!adjust_next)
+				vma_gap_update(next);
+		}
+	}
+
+	if (anon_vma) {
+		anon_vma_interval_tree_post_update_vma(vma);
+		if (adjust_next)
+			anon_vma_interval_tree_post_update_vma(next);
+		anon_vma_unlock_write(anon_vma);
+	}
+	if (mapping)
+		i_mmap_unlock_write(mapping);
+
+	if (root) {
+		uprobe_mmap(vma);
+
+		if (adjust_next)
+			uprobe_mmap(next);
+	}
+
+	if (remove_next) {
+		if (file) {
+			uprobe_munmap(next, next->vm_start, next->vm_end);
+			fput(file);
+		}
+		if (next->anon_vma)
+			anon_vma_merge(vma, next);
+		mm->map_count--;
+		mpol_put(vma_policy(next));
+		kmem_cache_free(vm_area_cachep, next);
+		/*
+		 * In mprotect's case 6 (see comments on vma_merge),
+		 * we must remove another next too. It would clutter
+		 * up the code too much to do both in one go.
+		 */
+		next = vma->vm_next;
+		if (remove_next == 2)
+			goto again;
+		else if (next)
+			vma_gap_update(next);
+		else
+			mm->highest_vm_end = end;
+	}
+	if (insert && file)
+		uprobe_mmap(insert);
+
+	validate_mm(mm);
+
+	return 0;
+}
+
+/*
+ * If the vma has a ->close operation then the driver probably needs to release
+ * per-vma resources, so we don't attempt to merge those.
+ */
+static inline int is_mergeable_vma(struct vm_area_struct *vma,
+			struct file *file, unsigned long vm_flags)
+{
+	/*
+	 * VM_SOFTDIRTY should not prevent from VMA merging, if we
+	 * match the flags but dirty bit -- the caller should mark
+	 * merged VMA as dirty. If dirty bit won't be excluded from
+	 * comparison, we increase pressure on the memory system forcing
+	 * the kernel to generate new VMAs when old one could be
+	 * extended instead.
+	 */
+	if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
+		return 0;
+	if (vma->vm_file != file)
+		return 0;
+	if (vma->vm_ops && vma->vm_ops->close)
+		return 0;
+	return 1;
+}
+
+static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
+					struct anon_vma *anon_vma2,
+					struct vm_area_struct *vma)
+{
+	/*
+	 * The list_is_singular() test is to avoid merging VMA cloned from
+	 * parents. This can improve scalability caused by anon_vma lock.
+	 */
+	if ((!anon_vma1 || !anon_vma2) && (!vma ||
+		list_is_singular(&vma->anon_vma_chain)))
+		return 1;
+	return anon_vma1 == anon_vma2;
+}
+
+/*
+ * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
+ * in front of (at a lower virtual address and file offset than) the vma.
+ *
+ * We cannot merge two vmas if they have differently assigned (non-NULL)
+ * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ *
+ * We don't check here for the merged mmap wrapping around the end of pagecache
+ * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
+ * wrap, nor mmaps which cover the final page at index -1UL.
+ */
+static int
+can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+{
+	if (is_mergeable_vma(vma, file, vm_flags) &&
+	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
+		if (vma->vm_pgoff == vm_pgoff)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
+ * beyond (at a higher virtual address and file offset than) the vma.
+ *
+ * We cannot merge two vmas if they have differently assigned (non-NULL)
+ * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ */
+static int
+can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+{
+	/* Flags, backing file and ->close must permit merging at all. */
+	if (!is_mergeable_vma(vma, file, vm_flags))
+		return 0;
+
+	/* The anon_vmas must be compatible as well. */
+	if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma))
+		return 0;
+
+	/* The request must begin at the page offset where @vma ends. */
+	return vma->vm_pgoff + vma_pages(vma) == vm_pgoff;
+}
+
+/*
+ * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
+ * whether that can be merged with its predecessor or its successor.
+ * Or both (it neatly fills a hole).
+ *
+ * In most cases - when called for mmap, brk or mremap - [addr,end) is
+ * certain not to be mapped by the time vma_merge is called; but when
+ * called for mprotect, it is certain to be already mapped (either at
+ * an offset within prev, or at the start of next), and the flags of
+ * this area are about to be changed to vm_flags - and the no-change
+ * case has already been eliminated.
+ *
+ * The following mprotect cases have to be considered, where AAAA is
+ * the area passed down from mprotect_fixup, never extending beyond one
+ * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
+ *
+ *     AAAA             AAAA                AAAA          AAAA
+ *    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPPPNNNNNN    PPPPNNNNXXXX
+ *    cannot merge    might become    might become    might become
+ *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
+ *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
+ *    mremap move:                                    PPPPNNNNNNNN 8
+ *        AAAA
+ *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
+ *    might become    case 1 below    case 2 below    case 3 below
+ *
+ * Odd one out?  Case 8, because it extends NNNN but needs flags of XXXX:
+ * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
+ *
+ * Returns the vma that now covers the request (prev, or the vma that
+ * followed prev on entry), or NULL if no merge was possible or
+ * vma_adjust() failed.
+ */
+struct vm_area_struct *vma_merge(struct mm_struct *mm,
+			struct vm_area_struct *prev, unsigned long addr,
+			unsigned long end, unsigned long vm_flags,
+			struct anon_vma *anon_vma, struct file *file,
+			pgoff_t pgoff, struct mempolicy *policy)
+{
+	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
+	struct vm_area_struct *area, *next;
+	int err;
+
+	/*
+	 * We later require that vma->vm_flags == vm_flags,
+	 * so this tests vma->vm_flags & VM_SPECIAL, too.
+	 */
+	if (vm_flags & VM_SPECIAL)
+		return NULL;
+
+	if (prev)
+		next = prev->vm_next;
+	else
+		next = mm->mmap;
+	/* area keeps the vma right after prev; next may step past it below. */
+	area = next;
+	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
+		next = next->vm_next;
+
+	/*
+	 * Can it merge with the predecessor?
+	 */
+	if (prev && prev->vm_end == addr &&
+			mpol_equal(vma_policy(prev), policy) &&
+			can_vma_merge_after(prev, vm_flags,
+						anon_vma, file, pgoff)) {
+		/*
+		 * OK, it can.  Can we now merge in the successor as well?
+		 */
+		if (next && end == next->vm_start &&
+				mpol_equal(policy, vma_policy(next)) &&
+				can_vma_merge_before(next, vm_flags,
+					anon_vma, file, pgoff+pglen) &&
+				is_mergeable_anon_vma(prev->anon_vma,
+						      next->anon_vma, NULL)) {
+							/* cases 1, 6 */
+			err = vma_adjust(prev, prev->vm_start,
+				next->vm_end, prev->vm_pgoff, NULL);
+		} else					/* cases 2, 5, 7 */
+			err = vma_adjust(prev, prev->vm_start,
+				end, prev->vm_pgoff, NULL);
+		if (err)
+			return NULL;
+		khugepaged_enter_vma_merge(prev, vm_flags);
+		return prev;
+	}
+
+	/*
+	 * Can this new request be merged in front of next?
+	 */
+	if (next && end == next->vm_start &&
+			mpol_equal(policy, vma_policy(next)) &&
+			can_vma_merge_before(next, vm_flags,
+					anon_vma, file, pgoff+pglen)) {
+		if (prev && addr < prev->vm_end)	/* case 4 */
+			err = vma_adjust(prev, prev->vm_start,
+				addr, prev->vm_pgoff, NULL);
+		else					/* cases 3, 8 */
+			err = vma_adjust(area, addr, next->vm_end,
+				next->vm_pgoff - pglen, NULL);
+		if (err)
+			return NULL;
+		khugepaged_enter_vma_merge(area, vm_flags);
+		return area;
+	}
+
+	return NULL;
+}
+
+/*
+ * Rough compatibility check to quickly see if it's even worth looking
+ * at sharing an anon_vma.
+ *
+ * They need to have the same vm_file, and the flags can only differ
+ * in things that mprotect may change.
+ *
+ * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
+ * we can merge the two vma's. For example, we refuse to merge a vma if
+ * there is a vm_ops->close() function, because that indicates that the
+ * driver is doing some kind of reference counting. But that doesn't
+ * really matter for the anon_vma sharing case.
+ *
+ * @a must end exactly where @b starts, and @b's vm_pgoff must continue
+ * @a's linearly.
+ */
+static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
+{
+	return a->vm_end == b->vm_start &&
+		mpol_equal(vma_policy(a), vma_policy(b)) &&
+		a->vm_file == b->vm_file &&
+		!((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
+		b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
+}
+
+/*
+ * Do some basic sanity checking to see if we can re-use the anon_vma
+ * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
+ * the same as 'old', the other will be the new one that is trying
+ * to share the anon_vma.
+ *
+ * NOTE! This runs with mm_sem held for reading, so it is possible that
+ * the anon_vma of 'old' is concurrently in the process of being set up
+ * by another page fault trying to merge _that_.
But that's ok: if it
+ * is being set up, that automatically means that it will be a singleton
+ * acceptable for merging, so we can do all of this optimistically. But
+ * we do that READ_ONCE() to make sure that we never re-load the pointer.
+ *
+ * IOW: that the "list_is_singular()" test on the anon_vma_chain only
+ * matters for the 'stable anon_vma' case (ie the thing we want to avoid
+ * is to return an anon_vma that is "complex" due to having gone through
+ * a fork).
+ *
+ * We also make sure that the two vma's are compatible (adjacent,
+ * and with the same memory policies). That's all stable, even with just
+ * a read lock on the mm_sem.
+ *
+ * Returns old's anon_vma if it can be shared, NULL otherwise.
+ */
+static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
+{
+	if (anon_vma_compatible(a, b)) {
+		struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
+
+		if (anon_vma && list_is_singular(&old->anon_vma_chain))
+			return anon_vma;
+	}
+	return NULL;
+}
+
+/*
+ * find_mergeable_anon_vma is used by anon_vma_prepare, to check
+ * neighbouring vmas for a suitable anon_vma, before it goes off
+ * to allocate a new anon_vma.  It checks because a repetitive
+ * sequence of mprotects and faults may otherwise lead to distinct
+ * anon_vmas being allocated, preventing vma merge in subsequent
+ * mprotect.
+ *
+ * The following vma is tried first, then the preceding one; NULL is
+ * returned when neither offers a reusable anon_vma.
+ */
+struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
+{
+	struct anon_vma *anon_vma;
+	struct vm_area_struct *near;
+
+	near = vma->vm_next;
+	if (!near)
+		goto try_prev;
+
+	anon_vma = reusable_anon_vma(near, vma, near);
+	if (anon_vma)
+		return anon_vma;
+try_prev:
+	near = vma->vm_prev;
+	if (!near)
+		goto none;
+
+	anon_vma = reusable_anon_vma(near, near, vma);
+	if (anon_vma)
+		return anon_vma;
+none:
+	/*
+	 * There's no absolute need to look only at touching neighbours:
+	 * we could search further afield for "compatible" anon_vmas.
+	 * But it would probably just be a waste of time searching,
+	 * or lead to too many vmas hanging off the same anon_vma.
+	 * We're trying to allow mprotect remerging later on,
+	 * not trying to minimize memory used for anon_vmas.
+	 */
+	return NULL;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Add @pages (a signed page count) to mm->total_vm, and additionally to
+ * shared_vm for file mappings (plus exec_vm when the flags are executable
+ * and not writable) or to stack_vm for anonymous mappings carrying the
+ * stack growth flag.
+ */
+void vm_stat_account(struct mm_struct *mm, unsigned long flags,
+						struct file *file, long pages)
+{
+	const unsigned long stack_flags
+		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
+
+	mm->total_vm += pages;
+
+	if (file) {
+		mm->shared_vm += pages;
+		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
+			mm->exec_vm += pages;
+	} else if (flags & stack_flags)
+		mm->stack_vm += pages;
+}
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * If a hint addr is less than mmap_min_addr change hint to be as
+ * low as possible but still greater than mmap_min_addr
+ *
+ * The hint is first rounded down to a page boundary; a hint of 0 is
+ * returned unchanged.
+ */
+static inline unsigned long round_hint_to_min(unsigned long hint)
+{
+	hint &= PAGE_MASK;
+	if (((void *)hint != NULL) &&
+	    (hint < mmap_min_addr))
+		return PAGE_ALIGN(mmap_min_addr);
+	return hint;
+}
+
+/*
+ * When @flags has VM_LOCKED, check that locking @len more bytes keeps
+ * mm->locked_vm within RLIMIT_MEMLOCK.  Returns -EAGAIN if the limit would
+ * be exceeded and the caller lacks CAP_IPC_LOCK, 0 otherwise.
+ */
+static inline int mlock_future_check(struct mm_struct *mm,
+				     unsigned long flags,
+				     unsigned long len)
+{
+	unsigned long locked, lock_limit;
+
+	/*  mlock MCL_FUTURE? */
+	if (flags & VM_LOCKED) {
+		locked = len >> PAGE_SHIFT;
+		locked += mm->locked_vm;
+		lock_limit = rlimit(RLIMIT_MEMLOCK);
+		lock_limit >>= PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			return -EAGAIN;
+	}
+	return 0;
+}
+
+/*
+ * Validate an mmap request, compute its vm_flags and hand it to
+ * mmap_region().
+ *
+ * Returns the mapped address, or a negative errno value as an unsigned
+ * long.  *populate is set to the page-aligned length when the caller should
+ * populate the range (VM_LOCKED, or MAP_POPULATE without MAP_NONBLOCK) and
+ * to 0 otherwise.
+ *
+ * The caller must hold down_write(&current->mm->mmap_sem).
+ */
+
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+			unsigned long len, unsigned long prot,
+			unsigned long flags, unsigned long pgoff,
+			unsigned long *populate)
+{
+	struct mm_struct *mm = current->mm;
+	vm_flags_t vm_flags;
+
+	*populate = 0;
+
+	/*
+	 * Does the application expect PROT_READ to imply PROT_EXEC?
+	 *
+	 * (the exception is when the underlying filesystem is noexec
+	 *  mounted, in which case we dont add PROT_EXEC.)
+	 */
+	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
+		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
+			prot |= PROT_EXEC;
+
+	if (!len)
+		return -EINVAL;
+
+	if (!(flags & MAP_FIXED))
+		addr = round_hint_to_min(addr);
+
+	/* Careful about overflows.. */
+	len = PAGE_ALIGN(len);
+	if (!len)
+		return -ENOMEM;
+
+	/* offset overflow? */
+	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
+		return -EOVERFLOW;
+
+	/* Too many mappings? */
+	if (mm->map_count > sysctl_max_map_count)
+		return -ENOMEM;
+
+	/* Obtain the address to map to. we verify (or select) it and ensure
+	 * that it represents a valid section of the address space.
+	 * A result that is not page aligned is passed back to the caller
+	 * unchanged.
+	 */
+	addr = get_unmapped_area(file, addr, len, pgoff, flags);
+	if (addr & ~PAGE_MASK)
+		return addr;
+
+	/* Do simple checking here so the lower-level routines won't have
+	 * to. we assume access permissions have been handled by the open
+	 * of the memory object, so we don't do any here.
+	 */
+	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+	if (flags & MAP_LOCKED)
+		if (!can_do_mlock())
+			return -EPERM;
+
+	if (mlock_future_check(mm, vm_flags, len))
+		return -EAGAIN;
+
+	if (file) {
+		struct inode *inode = file_inode(file);
+
+		switch (flags & MAP_TYPE) {
+		case MAP_SHARED:
+			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
+				return -EACCES;
+
+			/*
+			 * Make sure we don't allow writing to an append-only
+			 * file..
+			 */
+			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
+				return -EACCES;
+
+			/*
+			 * Make sure there are no mandatory locks on the file.
+			 */
+			if (locks_verify_locked(file))
+				return -EAGAIN;
+
+			vm_flags |= VM_SHARED | VM_MAYSHARE;
+			if (!(file->f_mode & FMODE_WRITE))
+				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
+
+			/* fall through */
+		case MAP_PRIVATE:
+			if (!(file->f_mode & FMODE_READ))
+				return -EACCES;
+			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+				if (vm_flags & VM_EXEC)
+					return -EPERM;
+				vm_flags &= ~VM_MAYEXEC;
+			}
+
+			if (!file->f_op->mmap)
+				return -ENODEV;
+			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+				return -EINVAL;
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	} else {
+		switch (flags & MAP_TYPE) {
+		case MAP_SHARED:
+			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+				return -EINVAL;
+			/*
+			 * Ignore pgoff.
+			 */
+			pgoff = 0;
+			vm_flags |= VM_SHARED | VM_MAYSHARE;
+			break;
+		case MAP_PRIVATE:
+			/*
+			 * Set pgoff according to addr for anon_vma.
+			 */
+			pgoff = addr >> PAGE_SHIFT;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Set 'VM_NORESERVE' if we should not account for the
+	 * memory use of this mapping.
+	 */
+	if (flags & MAP_NORESERVE) {
+		/* We honor MAP_NORESERVE if allowed to overcommit */
+		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+			vm_flags |= VM_NORESERVE;
+
+		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
+		if (file && is_file_hugepages(file))
+			vm_flags |= VM_NORESERVE;
+	}
+
+	addr = mmap_region(file, addr, len, vm_flags, pgoff);
+	if (!IS_ERR_VALUE(addr) &&
+	    ((vm_flags & VM_LOCKED) ||
+	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
+		*populate = len;
+	return addr;
+}
+
+/*
+ * mmap_pgoff() system call: resolve @fd to a file, or set up an anonymous
+ * hugetlb file for MAP_ANONYMOUS with MAP_HUGETLB, round @len up to the
+ * huge page size where one applies, and call vm_mmap_pgoff().  Any file
+ * reference obtained here is dropped before returning.
+ */
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, unsigned long, pgoff)
+{
+	struct file *file = NULL;
+	unsigned long retval = -EBADF;
+
+	if (!(flags & MAP_ANONYMOUS)) {
+		audit_mmap_fd(fd, flags);
+		file = fget(fd);
+		if (!file)
+			goto out;
+		if (is_file_hugepages(file))
+			len = ALIGN(len, huge_page_size(hstate_file(file)));
+		retval = -EINVAL;
+		if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+			goto out_fput;
+	} else if (flags & MAP_HUGETLB) {
+		struct user_struct *user = NULL;
+		struct hstate *hs;
+
+		/*
+		 * NOTE(review): this masks with SHM_HUGE_MASK while the
+		 * hugetlb_file_setup() call below masks the same field with
+		 * MAP_HUGE_MASK -- confirm the two masks are equal.
+		 */
+		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
+		if (!hs)
+			return -EINVAL;
+
+		len = ALIGN(len, huge_page_size(hs));
+		/*
+		 * VM_NORESERVE is used because the reservations will be
+		 * taken when vm_ops->mmap() is called
+		 * A dummy user value is used because we are not locking
+		 * memory so no accounting is necessary
+		 */
+		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
+				VM_NORESERVE,
+				&user, HUGETLB_ANONHUGE_INODE,
+				(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+	}
+
+	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+
+	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+out_fput:
+	if (file)
+		fput(file);
+out:
+	return retval;
+}
+
+#ifdef __ARCH_WANT_SYS_OLD_MMAP
+struct mmap_arg_struct {
+	unsigned long addr;
+	unsigned long len;
+	unsigned long prot;
+	unsigned long flags;
+	unsigned long fd;
+	unsigned long offset;
+};
+
+/*
+ * old_mmap() system call: the arguments arrive in a user-space struct and
+ * the offset is in bytes; it must be page aligned and is converted to a
+ * page offset for sys_mmap_pgoff().
+ */
+SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
+{
+	struct mmap_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	if (a.offset & ~PAGE_MASK)
+		return -EINVAL;
+
+	return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
+			      a.offset >> PAGE_SHIFT);
+}
+#endif /* __ARCH_WANT_SYS_OLD_MMAP */
+
+/*
+ * Some shared mappings will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ *
+ * Returns nonzero if write notification is wanted, 0 otherwise.
+ */
+int vma_wants_writenotify(struct vm_area_struct *vma)
+{
+	vm_flags_t vm_flags = vma->vm_flags;
+
+	/* If it was private or non-writable, the write bit is already clear */
+	if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
+		return 0;
+
+	/* The backer wishes to know when pages are first written to? */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		return 1;
+
+	/* The open routine did something to the protections that pgprot_modify
+	 * won't preserve? */
+	if (pgprot_val(vma->vm_page_prot) !=
+	    pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags)))
+		return 0;
+
+	/* Do we need to track softdirty? */
+	if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
+		return 1;
+
+	/* Specialty mapping? */
+	if (vm_flags & VM_PFNMAP)
+		return 0;
+
+	/* Can the mapping track the dirty pages? */
+	return vma->vm_file && vma->vm_file->f_mapping &&
+		mapping_cap_account_dirty(vma->vm_file->f_mapping);
+}
+
+/*
+ * We account for memory if it's a private writeable mapping,
+ * not hugepages and VM_NORESERVE wasn't set.
+ */
+static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
+{
+	/*
+	 * hugetlb has its own accounting separate from the core VM
+	 * VM_HUGETLB may not be set yet so we cannot check for that flag.
+	 */
+	if (file && is_file_hugepages(file))
+		return 0;
+
+	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
+}
+
+/*
+ * Create (or extend an existing vma to cover) the mapping of
+ * [addr, addr + len) in current->mm.
+ *
+ * @file:     backing file, or NULL for an anonymous mapping
+ * @addr:     start address of the mapping
+ * @len:      length in bytes
+ * @vm_flags: VM_* flags for the vma (these are never MAP_* flags)
+ * @pgoff:    page offset stored in the vma
+ *
+ * Existing mappings that overlap the range are unmapped first.  Returns the
+ * start address of the mapping (taken from the vma after the file's ->mmap
+ * has run) or a negative errno value as an unsigned long.
+ */
+unsigned long mmap_region(struct file *file, unsigned long addr,
+		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma, *prev;
+	int error;
+	struct rb_node **rb_link, *rb_parent;
+	unsigned long charged = 0;
+
+	/* Check against address space limit. */
+	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+		unsigned long nr_pages;
+
+		/*
+		 * The new mapping may replace mappings that intersect with
+		 * the requested range (they are unmapped below), so only the
+		 * net growth counts against the limit.  vm_flags holds VM_*
+		 * bits, so it must not be tested against MAP_FIXED here.
+		 */
+		nr_pages = count_vma_pages_range(mm, addr, addr + len);
+
+		if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+			return -ENOMEM;
+	}
+
+	/* Clear old maps */
+	error = -ENOMEM;
+	while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
+			      &rb_parent)) {
+		if (do_munmap(mm, addr, len))
+			return -ENOMEM;
+	}
+
+	/*
+	 * Private writable mapping: check memory availability
+	 */
+	if (accountable_mapping(file, vm_flags)) {
+		charged = len >> PAGE_SHIFT;
+		if (security_vm_enough_memory_mm(mm, charged))
+			return -ENOMEM;
+		vm_flags |= VM_ACCOUNT;
+	}
+
+	/*
+	 * Can we just expand an old mapping?
+	 */
+	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
+			NULL);
+	if (vma)
+		goto out;
+
+	/*
+	 * Determine the object being mapped and call the appropriate
+	 * specific mapper. the address has already been validated, but
+	 * not unmapped, but the maps are removed from the list.
+	 */
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	if (!vma) {
+		error = -ENOMEM;
+		goto unacct_error;
+	}
+
+	vma->vm_mm = mm;
+	vma->vm_start = addr;
+	vma->vm_end = addr + len;
+	vma->vm_flags = vm_flags;
+	vma->vm_page_prot = vm_get_page_prot(vm_flags);
+	vma->vm_pgoff = pgoff;
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
+
+	if (file) {
+		if (vm_flags & VM_DENYWRITE) {
+			error = deny_write_access(file);
+			if (error)
+				goto free_vma;
+		}
+		if (vm_flags & VM_SHARED) {
+			error = mapping_map_writable(file->f_mapping);
+			if (error)
+				goto allow_write_and_free_vma;
+		}
+
+		/* ->mmap() can change vma->vm_file, but must guarantee that
+		 * vma_link() below can deny write-access if VM_DENYWRITE is set
+		 * and map writably if VM_SHARED is set. This usually means the
+		 * new file must not have been exposed to user-space, yet.
+		 */
+		vma->vm_file = get_file(file);
+		error = file->f_op->mmap(file, vma);
+		if (error)
+			goto unmap_and_free_vma;
+
+		/* Can addr have changed??
+		 *
+		 * Answer: Yes, several device drivers can do it in their
+		 *         f_op->mmap method. -DaveM
+		 * Bug: If addr is changed, prev, rb_link, rb_parent should
+		 *      be updated for vma_link()
+		 */
+		WARN_ON_ONCE(addr != vma->vm_start);
+
+		addr = vma->vm_start;
+		vm_flags = vma->vm_flags;
+	} else if (vm_flags & VM_SHARED) {
+		error = shmem_zero_setup(vma);
+		if (error)
+			goto free_vma;
+	}
+
+	vma_link(mm, vma, prev, rb_link, rb_parent);
+	/* Once vma denies write, undo our temporary denial count */
+	if (file) {
+		if (vm_flags & VM_SHARED)
+			mapping_unmap_writable(file->f_mapping);
+		if (vm_flags & VM_DENYWRITE)
+			allow_write_access(file);
+	}
+	/* From here on account against whatever file the vma ended up with. */
+	file = vma->vm_file;
+out:
+	perf_event_mmap(vma);
+
+	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+	if (vm_flags & VM_LOCKED) {
+		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
+					vma == get_gate_vma(current->mm)))
+			mm->locked_vm += (len >> PAGE_SHIFT);
+		else
+			vma->vm_flags &= ~VM_LOCKED;
+	}
+
+	if (file)
+		uprobe_mmap(vma);
+
+	/*
+	 * New (or expanded) vma always get soft dirty status.
+	 * Otherwise user-space soft-dirty page tracker won't
+	 * be able to distinguish situation when vma area unmapped,
+	 * then new mapped in-place (which must be aimed as
+	 * a completely new data area).
+	 */
+	vma->vm_flags |= VM_SOFTDIRTY;
+
+	vma_set_page_prot(vma);
+
+	return addr;
+
+unmap_and_free_vma:
+	vma->vm_file = NULL;
+	fput(file);
+
+	/* Undo any partial mapping done by a device driver. */
+	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
+	/* The accounting charge is not returned on this path. */
+	charged = 0;
+	if (vm_flags & VM_SHARED)
+		mapping_unmap_writable(file->f_mapping);
+allow_write_and_free_vma:
+	if (vm_flags & VM_DENYWRITE)
+		allow_write_access(file);
+free_vma:
+	kmem_cache_free(vm_area_cachep, vma);
+unacct_error:
+	if (charged)
+		vm_unacct_memory(charged);
+	return error;
+}
+
+unsigned long unmapped_area(struct vm_unmapped_area_info *info)
+{
+	/*
+	 * We implement the search by looking for an rbtree node that
+	 * immediately follows a suitable gap.
That is, + * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; + * - gap_end = vma->vm_start >= info->low_limit + length; + * - gap_end - gap_start >= length + */ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long length, low_limit, high_limit, gap_start, gap_end; + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask; + if (length < info->length) + return -ENOMEM; + + /* Adjust search limits by the desired length */ + if (info->high_limit < length) + return -ENOMEM; + high_limit = info->high_limit - length; + + if (info->low_limit > high_limit) + return -ENOMEM; + low_limit = info->low_limit + length; + + /* Check if rbtree root looks promising */ + if (RB_EMPTY_ROOT(&mm->mm_rb)) + goto check_highest; + vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); + if (vma->rb_subtree_gap < length) + goto check_highest; + + while (true) { + /* Visit left subtree if it looks promising */ + gap_end = vma->vm_start; + if (gap_end >= low_limit && vma->vm_rb.rb_left) { + struct vm_area_struct *left = + rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb); + if (left->rb_subtree_gap >= length) { + vma = left; + continue; + } + } + + gap_start = vma->vm_prev ? 
vma->vm_prev->vm_end : 0; +check_current: + /* Check if current node has a suitable gap */ + if (gap_start > high_limit) + return -ENOMEM; + if (gap_end >= low_limit && gap_end - gap_start >= length) + goto found; + + /* Visit right subtree if it looks promising */ + if (vma->vm_rb.rb_right) { + struct vm_area_struct *right = + rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb); + if (right->rb_subtree_gap >= length) { + vma = right; + continue; + } + } + + /* Go back up the rbtree to find next candidate node */ + while (true) { + struct rb_node *prev = &vma->vm_rb; + if (!rb_parent(prev)) + goto check_highest; + vma = rb_entry(rb_parent(prev), + struct vm_area_struct, vm_rb); + if (prev == vma->vm_rb.rb_left) { + gap_start = vma->vm_prev->vm_end; + gap_end = vma->vm_start; + goto check_current; + } + } + } + +check_highest: + /* Check highest gap, which does not precede any rbtree node */ + gap_start = mm->highest_vm_end; + gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ + if (gap_start > high_limit) + return -ENOMEM; + +found: + /* We found a suitable gap. Clip it with the original low_limit. */ + if (gap_start < info->low_limit) + gap_start = info->low_limit; + + /* Adjust gap address to the desired alignment */ + gap_start += (info->align_offset - gap_start) & info->align_mask; + + VM_BUG_ON(gap_start + info->length > info->high_limit); + VM_BUG_ON(gap_start + info->length > gap_end); + return gap_start; +} + +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long length, low_limit, high_limit, gap_start, gap_end; + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask; + if (length < info->length) + return -ENOMEM; + + /* + * Adjust search limits by the desired length. + * See implementation comment at top of unmapped_area(). 
+ */ + gap_end = info->high_limit; + if (gap_end < length) + return -ENOMEM; + high_limit = gap_end - length; + + if (info->low_limit > high_limit) + return -ENOMEM; + low_limit = info->low_limit + length; + + /* Check highest gap, which does not precede any rbtree node */ + gap_start = mm->highest_vm_end; + if (gap_start <= high_limit) + goto found_highest; + + /* Check if rbtree root looks promising */ + if (RB_EMPTY_ROOT(&mm->mm_rb)) + return -ENOMEM; + vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); + if (vma->rb_subtree_gap < length) + return -ENOMEM; + + while (true) { + /* Visit right subtree if it looks promising */ + gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; + if (gap_start <= high_limit && vma->vm_rb.rb_right) { + struct vm_area_struct *right = + rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb); + if (right->rb_subtree_gap >= length) { + vma = right; + continue; + } + } + +check_current: + /* Check if current node has a suitable gap */ + gap_end = vma->vm_start; + if (gap_end < low_limit) + return -ENOMEM; + if (gap_start <= high_limit && gap_end - gap_start >= length) + goto found; + + /* Visit left subtree if it looks promising */ + if (vma->vm_rb.rb_left) { + struct vm_area_struct *left = + rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb); + if (left->rb_subtree_gap >= length) { + vma = left; + continue; + } + } + + /* Go back up the rbtree to find next candidate node */ + while (true) { + struct rb_node *prev = &vma->vm_rb; + if (!rb_parent(prev)) + return -ENOMEM; + vma = rb_entry(rb_parent(prev), + struct vm_area_struct, vm_rb); + if (prev == vma->vm_rb.rb_right) { + gap_start = vma->vm_prev ? + vma->vm_prev->vm_end : 0; + goto check_current; + } + } + } + +found: + /* We found a suitable gap. Clip it with the original high_limit. 
*/ + if (gap_end > info->high_limit) + gap_end = info->high_limit; + +found_highest: + /* Compute highest gap address at the desired alignment */ + gap_end -= info->length; + gap_end -= (gap_end - info->align_offset) & info->align_mask; + + VM_BUG_ON(gap_end < info->low_limit); + VM_BUG_ON(gap_end < gap_start); + return gap_end; +} + +/* Get an address range which is currently unmapped. + * For shmat() with addr=0. + * + * Ugly calling convention alert: + * Return value with the low bits set means error value, + * ie + * if (ret & ~PAGE_MASK) + * error = ret; + * + * This function "knows" that -ENOMEM has the bits set. + */ +#ifndef HAVE_ARCH_UNMAPPED_AREA +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct vm_unmapped_area_info info; + + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = 0; + return vm_unmapped_area(&info); +} +#endif + +/* + * This mmap-allocator allocates new areas top-down from below the + * stack's low limit (the base): + */ +#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return 
addr; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = mm->mmap_base; + info.align_mask = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } + + return addr; +} +#endif + +unsigned long +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long (*get_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + + unsigned long error = arch_mmap_check(addr, len, flags); + if (error) + return error; + + /* Careful about overflows.. */ + if (len > TASK_SIZE) + return -ENOMEM; + + get_area = current->mm->get_unmapped_area; + if (file && file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + addr = get_area(file, addr, len, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (addr & ~PAGE_MASK) + return -EINVAL; + + addr = arch_rebalance_pgtables(addr, len); + error = security_mmap_addr(addr); + return error ? error : addr; +} + +EXPORT_SYMBOL(get_unmapped_area); + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct rb_node *rb_node; + struct vm_area_struct *vma; + + /* Check the cache first. 
*/ + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; + + rb_node = mm->mm_rb.rb_node; + vma = NULL; + + while (rb_node) { + struct vm_area_struct *tmp; + + tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (tmp->vm_end > addr) { + vma = tmp; + if (tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; + } + + if (vma) + vmacache_update(addr, vma); + return vma; +} + +EXPORT_SYMBOL(find_vma); + +/* + * Same as find_vma, but also return a pointer to the previous VMA in *pprev. + */ +struct vm_area_struct * +find_vma_prev(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *vma; + + vma = find_vma(mm, addr); + if (vma) { + *pprev = vma->vm_prev; + } else { + struct rb_node *rb_node = mm->mm_rb.rb_node; + *pprev = NULL; + while (rb_node) { + *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); + rb_node = rb_node->rb_right; + } + } + return vma; +} + +/* + * Verify that the stack growth is acceptable and + * update accounting. This is shared with both the + * grow-up and grow-down cases. 
+ *
+ * size is the byte size the vma will span once grown and grow is the
+ * number of pages being added (see expand_upwards() and
+ * expand_downwards()).  Returns 0 after updating the accounting, or the
+ * negative errno of the first check that fails.
+ */
+static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct rlimit *rlim = current->signal->rlim;
+	unsigned long new_start, actual_size;
+
+	/* address space limit tests */
+	if (!may_expand_vm(mm, grow))
+		return -ENOMEM;
+
+	/*
+	 * Stack limit test: for a growable vma one page is left out of the
+	 * size compared against RLIMIT_STACK (presumably the guard page -
+	 * TODO confirm).
+	 */
+	actual_size = size;
+	if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
+		actual_size -= PAGE_SIZE;
+	if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+		return -ENOMEM;
+
+	/* mlock limit tests: RLIMIT_MEMLOCK is in bytes, locked_vm in pages */
+	if (vma->vm_flags & VM_LOCKED) {
+		unsigned long locked;
+		unsigned long limit;
+		locked = mm->locked_vm + grow;
+		limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
+		limit >>= PAGE_SHIFT;
+		if (locked > limit && !capable(CAP_IPC_LOCK))
+			return -ENOMEM;
+	}
+
+	/* Check to ensure the stack will not grow into a hugetlb-only region */
+	new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
+			vma->vm_end - size;
+	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
+		return -EFAULT;
+
+	/*
+	 * Overcommit..  This must be the final test, as it will
+	 * update security statistics.
+	 */
+	if (security_vm_enough_memory_mm(mm, grow))
+		return -ENOMEM;
+
+	/* Ok, everything looks good - let it rip */
+	if (vma->vm_flags & VM_LOCKED)
+		mm->locked_vm += grow;
+	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
+	return 0;
+}
+
+#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
+/*
+ * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
+ * vma is the last one with address > vma->vm_end.  Have to extend vma.
+ *
+ * Returns 0 on success (including when the vma already covers the
+ * address), -EFAULT if the vma is not VM_GROWSUP, -ENOMEM if the
+ * anon_vma cannot be prepared or the address would wrap, or the error
+ * from acct_stack_growth().
+ */
+int expand_upwards(struct vm_area_struct *vma, unsigned long address)
+{
+	int error;
+
+	if (!(vma->vm_flags & VM_GROWSUP))
+		return -EFAULT;
+
+	/*
+	 * We must make sure the anon_vma is allocated
+	 * so that the anon_vma locking is not a noop.
+	 */
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
+	vma_lock_anon_vma(vma);
+
+	/*
+	 * vma->vm_start/vm_end cannot change under us because the caller
+	 * is required to hold the mmap_sem in read mode.  We need the
+	 * anon_vma lock to serialize against concurrent expand_stacks.
+	 * Also guard against wrapping around to address 0.
+	 */
+	if (address < PAGE_ALIGN(address+4))
+		address = PAGE_ALIGN(address+4);
+	else {
+		vma_unlock_anon_vma(vma);
+		return -ENOMEM;
+	}
+	error = 0;
+
+	/* Somebody else might have raced and expanded it already */
+	if (address > vma->vm_end) {
+		unsigned long size, grow;
+
+		size = address - vma->vm_start;
+		grow = (address - vma->vm_end) >> PAGE_SHIFT;
+
+		error = -ENOMEM;
+		if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {	/* page offset must not wrap */
+			error = acct_stack_growth(vma, size, grow);
+			if (!error) {
+				/*
+				 * vma_gap_update() doesn't support concurrent
+				 * updates, but we only hold a shared mmap_sem
+				 * lock here, so we need to protect against
+				 * concurrent vma expansions.
+				 * vma_lock_anon_vma() doesn't help here, as
+				 * we don't guarantee that all growable vmas
+				 * in a mm share the same root anon vma.
+				 * So, we reuse mm->page_table_lock to guard
+				 * against concurrent vma expansions.
+				 */
+				spin_lock(&vma->vm_mm->page_table_lock);
+				anon_vma_interval_tree_pre_update_vma(vma);
+				vma->vm_end = address;
+				anon_vma_interval_tree_post_update_vma(vma);
+				if (vma->vm_next)
+					vma_gap_update(vma->vm_next);
+				else
+					vma->vm_mm->highest_vm_end = address;
+				spin_unlock(&vma->vm_mm->page_table_lock);
+
+				perf_event_mmap(vma);
+			}
+		}
+	}
+	vma_unlock_anon_vma(vma);
+	khugepaged_enter_vma_merge(vma, vma->vm_flags);
+	validate_mm(vma->vm_mm);
+	return error;
+}
+#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
+
+/*
+ * vma is the first one with address < vma->vm_start.  Have to extend vma.
+ *
+ * The address is rounded down to a page boundary and checked with
+ * security_mmap_addr() before any lock is taken.  Returns 0 on success
+ * (including when the vma already covers the address), -ENOMEM if the
+ * anon_vma cannot be prepared or the growth exceeds vm_pgoff, or the
+ * error from security_mmap_addr() / acct_stack_growth().
+ */
+int expand_downwards(struct vm_area_struct *vma,
+				   unsigned long address)
+{
+	int error;
+
+	/*
+	 * We must make sure the anon_vma is allocated
+	 * so that the anon_vma locking is not a noop.
+	 */
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
+
+	address &= PAGE_MASK;
+	error = security_mmap_addr(address);
+	if (error)
+		return error;
+
+	vma_lock_anon_vma(vma);
+
+	/*
+	 * vma->vm_start/vm_end cannot change under us because the caller
+	 * is required to hold the mmap_sem in read mode.  We need the
+	 * anon_vma lock to serialize against concurrent expand_stacks.
+	 */
+
+	/* Somebody else might have raced and expanded it already */
+	if (address < vma->vm_start) {
+		unsigned long size, grow;
+
+		size = vma->vm_end - address;
+		grow = (vma->vm_start - address) >> PAGE_SHIFT;
+
+		error = -ENOMEM;
+		if (grow <= vma->vm_pgoff) {	/* vm_pgoff is lowered by grow below and must not underflow */
+			error = acct_stack_growth(vma, size, grow);
+			if (!error) {
+				/*
+				 * vma_gap_update() doesn't support concurrent
+				 * updates, but we only hold a shared mmap_sem
+				 * lock here, so we need to protect against
+				 * concurrent vma expansions.
+				 * vma_lock_anon_vma() doesn't help here, as
+				 * we don't guarantee that all growable vmas
+				 * in a mm share the same root anon vma.
+				 * So, we reuse mm->page_table_lock to guard
+				 * against concurrent vma expansions.
+				 */
+				spin_lock(&vma->vm_mm->page_table_lock);
+				anon_vma_interval_tree_pre_update_vma(vma);
+				vma->vm_start = address;
+				vma->vm_pgoff -= grow;
+				anon_vma_interval_tree_post_update_vma(vma);
+				vma_gap_update(vma);
+				spin_unlock(&vma->vm_mm->page_table_lock);
+
+				perf_event_mmap(vma);
+			}
+		}
+	}
+	vma_unlock_anon_vma(vma);
+	khugepaged_enter_vma_merge(vma, vma->vm_flags);
+	validate_mm(vma->vm_mm);
+	return error;
+}
+
+/*
+ * Note how expand_stack() refuses to expand the stack all the way to
+ * abut the next virtual mapping, *unless* that mapping itself is also
+ * a stack mapping.
We want to leave room for a guard page, after all
+ * (the guard page itself is not added here, that is done by the
+ * actual page faulting logic)
+ *
+ * This matches the behavior of the guard page logic (see mm/memory.c:
+ * check_stack_guard_page()), which only allows the guard page to be
+ * removed under these circumstances.
+ *
+ * In the refused case -ENOMEM is returned; otherwise the result is that
+ * of expand_upwards() (CONFIG_STACK_GROWSUP) or expand_downwards().
+ */
+#ifdef CONFIG_STACK_GROWSUP
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+	struct vm_area_struct *next;
+
+	address &= PAGE_MASK;
+	next = vma->vm_next;
+	if (next && next->vm_start == address + PAGE_SIZE) {	/* next vma starts right after the page holding address */
+		if (!(next->vm_flags & VM_GROWSUP))
+			return -ENOMEM;
+	}
+	return expand_upwards(vma, address);
+}
+/*
+ * Return the vma containing addr (rounded down to a page boundary).  If
+ * addr lies above the preceding vma, that vma is grown via expand_stack()
+ * and, when it is VM_LOCKED, the range from addr to its end is populated.
+ * Returns NULL if there is no preceding vma or the expansion fails.
+ */
+struct vm_area_struct *
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma, *prev;
+
+	addr &= PAGE_MASK;
+	vma = find_vma_prev(mm, addr, &prev);
+	if (vma && (vma->vm_start <= addr))
+		return vma;
+	if (!prev || expand_stack(prev, addr))
+		return NULL;
+	if (prev->vm_flags & VM_LOCKED)
+		populate_vma_page_range(prev, addr, prev->vm_end, NULL);
+	return prev;
+}
+#else
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+	struct vm_area_struct *prev;
+
+	address &= PAGE_MASK;
+	prev = vma->vm_prev;
+	if (prev && prev->vm_end == address) {	/* growing to address would abut the previous vma */
+		if (!(prev->vm_flags & VM_GROWSDOWN))
+			return -ENOMEM;
+	}
+	return expand_downwards(vma, address);
+}
+/*
+ * Return the vma containing addr (rounded down to a page boundary).  If
+ * addr lies below the first vma ending above it and that vma is
+ * VM_GROWSDOWN, the vma is grown via expand_stack() and, when it is
+ * VM_LOCKED, the range from addr to its old start is populated.
+ * Returns NULL if no vma qualifies or the expansion fails.
+ */
+struct vm_area_struct *
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	unsigned long start;
+
+	addr &= PAGE_MASK;
+	vma = find_vma(mm, addr);
+	if (!vma)
+		return NULL;
+	if (vma->vm_start <= addr)
+		return vma;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		return NULL;
+	start = vma->vm_start;	/* remember the old start: expand_stack() moves vm_start */
+	if (expand_stack(vma, addr))
+		return NULL;
+	if (vma->vm_flags & VM_LOCKED)
+		populate_vma_page_range(vma, addr, start, NULL);
+	return vma;
+}
+#endif
+
+EXPORT_SYMBOL_GPL(find_extend_vma);
+
+/*
+ * Ok - we have the memory areas we should free on the vma list,
+
* so release them, and do the vma updates.
+ *
+ * Called with the mm semaphore held.
+ *
+ * Walks the detached list starting at vma, adjusting the mm statistics
+ * for each entry and freeing it through remove_vma(); the total of
+ * VM_ACCOUNT pages is unaccounted once at the end.
+ */
+static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+	unsigned long nr_accounted = 0;
+
+	/* Update high watermark before we lower total_vm */
+	update_hiwater_vm(mm);
+	do {
+		long nrpages = vma_pages(vma);
+
+		if (vma->vm_flags & VM_ACCOUNT)
+			nr_accounted += nrpages;
+		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+		vma = remove_vma(vma);
+	} while (vma);
+	vm_unacct_memory(nr_accounted);
+	validate_mm(mm);
+}
+
+/*
+ * Get rid of page table information in the indicated region.
+ *
+ * Called with the mm semaphore held.
+ *
+ * vma is the first vma of the region and prev the vma before it (NULL if
+ * none).  Page tables are freed between the end of prev and the start of
+ * the vma that follows prev in the mm's list, falling back to
+ * FIRST_USER_ADDRESS / USER_PGTABLES_CEILING when there is no neighbour.
+ */
+static void unmap_region(struct mm_struct *mm,
+		struct vm_area_struct *vma, struct vm_area_struct *prev,
+		unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
+	struct mmu_gather tlb;
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, mm, start, end);
+	update_hiwater_rss(mm);
+	unmap_vmas(&tlb, vma, start, end);
+	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+				 next ? next->vm_start : USER_PGTABLES_CEILING);
+	tlb_finish_mmu(&tlb, start, end);
+}
+
+/*
+ * Create a list of vma's touched by the unmap, removing them from the mm's
+ * vma list as we go..
+ *
+ * Every vma from 'vma' up to the first one starting at or after 'end' is
+ * erased from the rbtree and unlinked; the detached chain stays reachable
+ * through 'vma' and is NULL terminated.  prev (may be NULL) is the vma
+ * preceding the range and is relinked to whatever follows it.
+ */
+static void
+detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
+	struct vm_area_struct *prev, unsigned long end)
+{
+	struct vm_area_struct **insertion_point;
+	struct vm_area_struct *tail_vma = NULL;
+
+	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
+	vma->vm_prev = NULL;
+	do {
+		vma_rb_erase(vma, &mm->mm_rb);
+		mm->map_count--;
+		tail_vma = vma;
+		vma = vma->vm_next;
+	} while (vma && vma->vm_start < end);
+	*insertion_point = vma;
+	if (vma) {
+		vma->vm_prev = prev;
+		vma_gap_update(vma);
+	} else
+		mm->highest_vm_end = prev ? prev->vm_end : 0;
+	tail_vma->vm_next = NULL;
+
+	/* Kill the cache */
+	vmacache_invalidate(mm);
+}
+
+/*
+ * __split_vma() bypasses sysctl_max_map_count checking.  We use this on the
+ * munmap path where it doesn't make sense to fail.
+ *
+ * Splits vma at addr.  With new_below set the newly allocated vma takes
+ * the part below addr, otherwise the part from addr upwards.  Returns 0
+ * on success, -EINVAL for a hugetlb vma split at an address that is not
+ * huge-page aligned, -ENOMEM if the allocation fails, or the error from
+ * vma_dup_policy(), anon_vma_clone() or vma_adjust().
+ */
+static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+	      unsigned long addr, int new_below)
+{
+	struct vm_area_struct *new;
+	int err = -ENOMEM;
+
+	if (is_vm_hugetlb_page(vma) && (addr &
+					~(huge_page_mask(hstate_vma(vma)))))
+		return -EINVAL;
+
+	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	if (!new)
+		goto out_err;
+
+	/* most fields are the same, copy all, and then fixup */
+	*new = *vma;
+
+	INIT_LIST_HEAD(&new->anon_vma_chain);
+
+	if (new_below)
+		new->vm_end = addr;
+	else {
+		new->vm_start = addr;
+		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
+	}
+
+	err = vma_dup_policy(vma, new);
+	if (err)
+		goto out_free_vma;
+
+	err = anon_vma_clone(new, vma);
+	if (err)
+		goto out_free_mpol;
+
+	if (new->vm_file)
+		get_file(new->vm_file);
+
+	if (new->vm_ops && new->vm_ops->open)
+		new->vm_ops->open(new);
+
+	if (new_below)
+		err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
+			((addr - new->vm_start) >> PAGE_SHIFT), new);
+	else
+		err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+
+	/* Success. */
+	if (!err)
+		return 0;
+
+	/* Clean everything up if vma_adjust failed. */
+	if (new->vm_ops && new->vm_ops->close)
+		new->vm_ops->close(new);
+	if (new->vm_file)
+		fput(new->vm_file);
+	unlink_anon_vmas(new);
+ out_free_mpol:
+	mpol_put(vma_policy(new));
+ out_free_vma:
+	kmem_cache_free(vm_area_cachep, new);
+ out_err:
+	return err;
+}
+
+/*
+ * Split a vma into two pieces at address 'addr', a new vma is allocated
+ * either for the first part or the tail.
+ *
+ * Unlike __split_vma() this refuses with -ENOMEM once mm->map_count has
+ * reached sysctl_max_map_count.
+ */
+int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+	      unsigned long addr, int new_below)
+{
+	if (mm->map_count >= sysctl_max_map_count)
+		return -ENOMEM;
+
+	return __split_vma(mm, vma, addr, new_below);
+}
+
+/* Munmap is split into 2 main parts -- this part which finds
+ * what needs doing, and the areas themselves, which do the
+ * work.  This now handles partial unmappings.
+ * Jeremy Fitzhardinge <jeremy@goop.org>
+ *
+ * Returns 0 on success or when nothing overlaps the range, -EINVAL for an
+ * unaligned start, an out-of-range request or a zero length, -ENOMEM if
+ * the split would leave map_count above its limit, or the error from
+ * __split_vma().
+ */
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
+{
+	unsigned long end;
+	struct vm_area_struct *vma, *prev, *last;
+
+	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
+		return -EINVAL;
+
+	len = PAGE_ALIGN(len);
+	if (len == 0)
+		return -EINVAL;
+
+	/* Find the first overlapping VMA */
+	vma = find_vma(mm, start);
+	if (!vma)
+		return 0;
+	prev = vma->vm_prev;
+	/* we have  start < vma->vm_end  */
+
+	/* if it doesn't overlap, we have nothing.. */
+	end = start + len;
+	if (vma->vm_start >= end)
+		return 0;
+
+	/*
+	 * If we need to split any vma, do it now to save pain later.
+	 *
+	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
+	 * unmapped vm_area_struct will remain in use: so lower split_vma
+	 * places tmp vma above, and higher split_vma places tmp vma below.
+	 */
+	if (start > vma->vm_start) {
+		int error;
+
+		/*
+		 * Make sure that map_count on return from munmap() will
+		 * not exceed its limit; but let map_count go just above
+		 * its limit temporarily, to help free resources as expected.
+		 */
+		if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
+			return -ENOMEM;
+
+		error = __split_vma(mm, vma, start, 0);
+		if (error)
+			return error;
+		prev = vma;
+	}
+
+	/* Does it split the last one? */
+	last = find_vma(mm, end);
+	if (last && end > last->vm_start) {
+		int error = __split_vma(mm, last, end, 1);
+		if (error)
+			return error;
+	}
+	vma = prev ? prev->vm_next : mm->mmap;
+
+	/*
+	 * unlock any mlock()ed ranges before detaching vmas
+	 */
+	if (mm->locked_vm) {
+		struct vm_area_struct *tmp = vma;
+		while (tmp && tmp->vm_start < end) {
+			if (tmp->vm_flags & VM_LOCKED) {
+				mm->locked_vm -= vma_pages(tmp);
+				munlock_vma_pages_all(tmp);
+			}
+			tmp = tmp->vm_next;
+		}
+	}
+
+	/*
+	 * Remove the vma's, and unmap the actual pages
+	 */
+	detach_vmas_to_be_unmapped(mm, vma, prev, end);
+	unmap_region(mm, vma, prev, start, end);
+
+	arch_unmap(mm, vma, start, end);
+
+	/* Fix up all other VM information */
+	remove_vma_list(mm, vma);
+
+	return 0;
+}
+/* do_munmap() on current->mm with mmap_sem held for writing. */
+int vm_munmap(unsigned long start, size_t len)
+{
+	int ret;
+	struct mm_struct *mm = current->mm;
+
+	down_write(&mm->mmap_sem);
+	ret = do_munmap(mm, start, len);
+	up_write(&mm->mmap_sem);
+	return ret;
+}
+EXPORT_SYMBOL(vm_munmap);
+/* munmap(2): calls profile_munmap() for the address, then vm_munmap(). */
+SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
+{
+	profile_munmap(addr);
+	return vm_munmap(addr, len);
+}
+
+
+/*
+ * Emulation of deprecated remap_file_pages() syscall.
+ *
+ * The request must lie inside a single VM_SHARED vma.  If pgoff already
+ * matches the vma's linear layout nothing is done; otherwise the range is
+ * mapped again over itself with do_mmap_pgoff() (MAP_SHARED | MAP_FIXED |
+ * MAP_POPULATE) using the vma's file and protections.  Returns 0 on
+ * success, -EINVAL for a rejected request, or the error from
+ * do_mmap_pgoff().
+ */
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+		unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
+{
+
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long populate = 0;
+	unsigned long ret = -EINVAL;
+	struct file *file;
+
+	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
+			"See Documentation/vm/remap_file_pages.txt.\n",
+			current->comm, current->pid);
+
+	if (prot)
+		return ret;
+	start = start & PAGE_MASK;
+	size = size & PAGE_MASK;
+
+	if (start + size <= start)
+		return ret;
+
+	/* Does pgoff wrap? */
+	if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+		return ret;
+
+	down_write(&mm->mmap_sem);
+	vma = find_vma(mm, start);
+
+	if (!vma || !(vma->vm_flags & VM_SHARED))
+		goto out;
+
+	if (start < vma->vm_start || start + size > vma->vm_end)
+		goto out;
+
+	if (pgoff == linear_page_index(vma, start)) {
+		ret = 0;
+		goto out;
+	}
+
+	prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
+	prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+	prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+	flags &= MAP_NONBLOCK;
+	flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+	if (vma->vm_flags & VM_LOCKED) {
+		flags |= MAP_LOCKED;
+		/* drop PG_Mlocked flag for over-mapped range */
+		munlock_vma_pages_range(vma, start, start + size);
+	}
+
+	file = get_file(vma->vm_file);	/* extra reference held across do_mmap_pgoff(), dropped right after */
+	ret = do_mmap_pgoff(vma->vm_file, start, size,
+			prot, flags, pgoff, &populate);
+	fput(file);
+out:
+	up_write(&mm->mmap_sem);
+	if (populate)
+		mm_populate(ret, populate);
+	if (!IS_ERR_VALUE(ret))
+		ret = 0;
+	return ret;
+}
+/*
+ * Debug aid (CONFIG_DEBUG_VM only): warn if mmap_sem can be taken for
+ * reading, i.e. the caller does not hold it for writing.
+ */
+static inline void verify_mm_writelocked(struct mm_struct *mm)
+{
+#ifdef CONFIG_DEBUG_VM
+	if (unlikely(down_read_trylock(&mm->mmap_sem))) {
+		WARN_ON(1);
+		up_read(&mm->mmap_sem);
+	}
+#endif
+}
+
+/*
+ *  this is really a simplified "do_mmap".  it only handles
+ *  anonymous maps.  eventually we may be able to do some
+ *  brk-specific accounting here.
+ */ +static unsigned long do_brk(unsigned long addr, unsigned long len) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + unsigned long flags; + struct rb_node **rb_link, *rb_parent; + pgoff_t pgoff = addr >> PAGE_SHIFT; + int error; + + len = PAGE_ALIGN(len); + if (!len) + return addr; + + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + + error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (error & ~PAGE_MASK) + return error; + + error = mlock_future_check(mm, mm->def_flags, len); + if (error) + return error; + + /* + * mm->mmap_sem is required to protect against another thread + * changing the mappings in case we sleep. + */ + verify_mm_writelocked(mm); + + /* + * Clear old maps. this also does some error checking for us + */ + while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, + &rb_parent)) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + } + + /* Check against address space limits *after* clearing old maps... */ + if (!may_expand_vm(mm, len >> PAGE_SHIFT)) + return -ENOMEM; + + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) + return -ENOMEM; + + /* Can we just expand an old private anonymous mapping? 
*/ + vma = vma_merge(mm, prev, addr, addr + len, flags, + NULL, NULL, pgoff, NULL); + if (vma) + goto out; + + /* + * create a vma struct for an anonymous mapping + */ + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) { + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; + } + + INIT_LIST_HEAD(&vma->anon_vma_chain); + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_pgoff = pgoff; + vma->vm_flags = flags; + vma->vm_page_prot = vm_get_page_prot(flags); + vma_link(mm, vma, prev, rb_link, rb_parent); +out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) + mm->locked_vm += (len >> PAGE_SHIFT); + vma->vm_flags |= VM_SOFTDIRTY; + return addr; +} + +unsigned long vm_brk(unsigned long addr, unsigned long len) +{ + struct mm_struct *mm = current->mm; + unsigned long ret; + bool populate; + + down_write(&mm->mmap_sem); + ret = do_brk(addr, len); + populate = ((mm->def_flags & VM_LOCKED) != 0); + up_write(&mm->mmap_sem); + if (populate) + mm_populate(addr, len); + return ret; +} +EXPORT_SYMBOL(vm_brk); + +/* Release all mmaps. */ +void exit_mmap(struct mm_struct *mm) +{ + struct mmu_gather tlb; + struct vm_area_struct *vma; + unsigned long nr_accounted = 0; + + /* mm's last user has gone, and its about to be pulled down */ + mmu_notifier_release(mm); + + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { + if (vma->vm_flags & VM_LOCKED) + munlock_vma_pages_all(vma); + vma = vma->vm_next; + } + } + + arch_exit_mmap(mm); + + vma = mm->mmap; + if (!vma) /* Can happen if dup_mmap() received an OOM */ + return; + + lru_add_drain(); + flush_cache_mm(mm); + tlb_gather_mmu(&tlb, mm, 0, -1); + /* update_hiwater_rss(mm) here? 
but nobody should be looking */ + /* Use -1 here to ensure all VMAs in the mm are unmapped */ + unmap_vmas(&tlb, vma, 0, -1); + + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); + tlb_finish_mmu(&tlb, 0, -1); + + /* + * Walk the list again, actually closing and freeing it, + * with preemption enabled, without holding any MM locks. + */ + while (vma) { + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += vma_pages(vma); + vma = remove_vma(vma); + } + vm_unacct_memory(nr_accounted); +} + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap tree. If vm_file is non-NULL + * then i_mmap_rwsem is taken here. + */ +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) +{ + struct vm_area_struct *prev; + struct rb_node **rb_link, *rb_parent; + + /* + * The vm_pgoff of a purely anonymous vma should be irrelevant + * until its first write fault, when page's anon_vma and index + * are set. But now set the vm_pgoff it will almost certainly + * end up with (unless mremap moves it elsewhere before that + * first wfault), so /proc/pid/maps tells a consistent story. + * + * By setting it to reflect the virtual start address of the + * vma, merges and splits can happen in a seamless way, just + * using the existing file pgoff checks and manipulations. + * Similarly in do_mmap_pgoff and in do_brk. + */ + if (!vma->vm_file) { + BUG_ON(vma->anon_vma); + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; + } + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) + return -ENOMEM; + if ((vma->vm_flags & VM_ACCOUNT) && + security_vm_enough_memory_mm(mm, vma_pages(vma))) + return -ENOMEM; + + vma_link(mm, vma, prev, rb_link, rb_parent); + return 0; +} + +/* + * Copy the vma structure to a new location in the same mm, + * prior to moving page table entries, to effect an mremap move. 
 */
/*
 * copy_vma() either merges the destination range [addr, addr + len) into
 * an existing vma through vma_merge() or allocates a new vma that starts
 * as a structure copy of *vmap.
 *
 * @vmap:            in/out; replaced by the merged vma when the source vma
 *                   itself was absorbed by the merge.
 * @need_rmap_locks: out; written only on the paths that return a vma.
 *
 * Returns the vma covering the destination, or NULL on allocation failure
 * or when find_vma_links() reports a non-zero result for the range.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
	unsigned long addr, unsigned long len, pgoff_t pgoff,
	bool *need_rmap_locks)
{
	struct vm_area_struct *vma = *vmap;
	unsigned long vma_start = vma->vm_start;
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma, *prev;
	struct rb_node **rb_link, *rb_parent;
	bool faulted_in_anon_vma = true;

	/*
	 * If anonymous vma has not yet been faulted, update new pgoff
	 * to match new location, to increase its chance of merging.
	 */
	if (unlikely(!vma->vm_file && !vma->anon_vma)) {
		pgoff = addr >> PAGE_SHIFT;
		faulted_in_anon_vma = false;
	}

	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
		return NULL;	/* should never get here */
	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
	if (new_vma) {
		/*
		 * Source vma may have been merged into new_vma
		 */
		if (unlikely(vma_start >= new_vma->vm_start &&
			     vma_start < new_vma->vm_end)) {
			/*
			 * The only way we can get a vma_merge with
			 * self during an mremap is if the vma hasn't
			 * been faulted in yet and we were allowed to
			 * reset the dst vma->vm_pgoff to the
			 * destination address of the mremap to allow
			 * the merge to happen. mremap must change the
			 * vm_pgoff linearity between src and dst vmas
			 * (in turn preventing a vma_merge) to be
			 * safe. It is only safe to keep the vm_pgoff
			 * linear if there are no pages mapped yet.
			 */
			VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
			*vmap = vma = new_vma;
		}
		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
	} else {
		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
		if (new_vma) {
			/* Start from a full copy, then fix up the range. */
			*new_vma = *vma;
			new_vma->vm_start = addr;
			new_vma->vm_end = addr + len;
			new_vma->vm_pgoff = pgoff;
			if (vma_dup_policy(vma, new_vma))
				goto out_free_vma;
			INIT_LIST_HEAD(&new_vma->anon_vma_chain);
			if (anon_vma_clone(new_vma, vma))
				goto out_free_mempol;
			/* The copy holds its own reference on the file. */
			if (new_vma->vm_file)
				get_file(new_vma->vm_file);
			if (new_vma->vm_ops && new_vma->vm_ops->open)
				new_vma->vm_ops->open(new_vma);
			vma_link(mm, new_vma, prev, rb_link, rb_parent);
			*need_rmap_locks = false;
		}
	}
	return new_vma;

	/* Error unwinding for the allocation path, in reverse order. */
 out_free_mempol:
	mpol_put(vma_policy(new_vma));
 out_free_vma:
	kmem_cache_free(vm_area_cachep, new_vma);
	return NULL;
}

/*
 * Return true if the calling process may expand its vm space by the passed
 * number of pages
 *
 * The check compares mm->total_vm + npages (both in pages) against
 * RLIMIT_AS converted to pages; 1 means allowed, 0 means over the limit.
 */
int may_expand_vm(struct mm_struct *mm, unsigned long npages)
{
	unsigned long cur = mm->total_vm;	/* pages */
	unsigned long lim;

	lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;

	if (cur + npages > lim)
		return 0;
	return 1;
}

/* Declared early because the vm_operations tables below refer to it. */
static int special_mapping_fault(struct vm_area_struct *vma,
				struct vm_fault *vmf);

/*
 * Having a close hook prevents vma merging regardless of flags.
 */
/* Intentionally empty: only the presence of the hook matters (see above). */
static void special_mapping_close(struct vm_area_struct *vma)
{
}

/* ->name hook: vm_private_data points at a struct vm_special_mapping. */
static const char *special_mapping_name(struct vm_area_struct *vma)
{
	return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}

/* Ops for mappings described by a struct vm_special_mapping. */
static const struct vm_operations_struct special_mapping_vmops = {
	.close = special_mapping_close,
	.fault = special_mapping_fault,
	.name = special_mapping_name,
};

/* Ops for legacy mappings whose private data is a bare struct page * array. */
static const struct vm_operations_struct legacy_special_mapping_vmops = {
	.close = special_mapping_close,
	.fault = special_mapping_fault,
};

/*
 * Fault handler shared by both kinds of special mapping.
 *
 * Walks the NULL-terminated page array to the faulting offset.  Returns 0
 * with a referenced page in vmf->page, or VM_FAULT_SIGBUS when the array
 * ends before the offset is reached.
 */
static int special_mapping_fault(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	pgoff_t pgoff;
	struct page **pages;

	/*
	 * special mappings have no vm_file, and in that case, the mm
	 * uses vm_pgoff internally. So we have to subtract it from here.
	 * We are allowed to do this because we are the mm; do not copy
	 * this code into drivers!
	 */
	pgoff = vmf->pgoff - vma->vm_pgoff;

	/* The ops table in use tells which private-data layout applies. */
	if (vma->vm_ops == &legacy_special_mapping_vmops)
		pages = vma->vm_private_data;
	else
		pages = ((struct vm_special_mapping *)vma->vm_private_data)->
			pages;

	/* Advance pgoff entries, stopping early at the NULL terminator. */
	for (; pgoff && *pages; ++pages)
		pgoff--;

	if (*pages) {
		struct page *page = *pages;
		get_page(page);
		vmf->page = page;
		return 0;
	}

	return VM_FAULT_SIGBUS;
}

/*
 * Allocate a vma for [addr, addr + len), attach @ops and @priv and insert
 * it into @mm.  VM_DONTEXPAND, VM_SOFTDIRTY and mm->def_flags are always
 * added to @vm_flags.
 *
 * Returns the new vma, or an ERR_PTR() carrying -ENOMEM or the error from
 * insert_vm_struct(); the vma is freed again on failure.
 */
static struct vm_area_struct *__install_special_mapping(
	struct mm_struct *mm,
	unsigned long addr, unsigned long len,
	unsigned long vm_flags, const struct vm_operations_struct *ops,
	void *priv)
{
	int ret;
	struct vm_area_struct *vma;

	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (unlikely(vma == NULL))
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&vma->anon_vma_chain);
	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;

	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

	vma->vm_ops = ops;
	vma->vm_private_data = priv;

	ret = insert_vm_struct(mm, vma);
	if (ret)
		goto out;

	mm->total_vm += len >> PAGE_SHIFT;

	perf_event_mmap(vma);

	return vma;

out:
	kmem_cache_free(vm_area_cachep, vma);
	return ERR_PTR(ret);
}

/*
 * Called with mm->mmap_sem held for writing.
 * Insert a new vma covering the given region, with the given flags.
 * Its pages are supplied by the given array of struct page *.
 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
 * The region past the last page supplied will always produce SIGBUS.
 * The array pointer and the pages it points to are assumed to stay alive
 * for as long as this mapping might exist.
 */
struct vm_area_struct *_install_special_mapping(
	struct mm_struct *mm,
	unsigned long addr, unsigned long len,
	unsigned long vm_flags, const struct vm_special_mapping *spec)
{
	return __install_special_mapping(mm, addr, len, vm_flags,
					 &special_mapping_vmops, (void *)spec);
}

/*
 * Legacy variant taking a bare page array.  Returns 0 on success or the
 * negative error extracted from __install_special_mapping()'s result.
 */
int install_special_mapping(struct mm_struct *mm,
			    unsigned long addr, unsigned long len,
			    unsigned long vm_flags, struct page **pages)
{
	struct vm_area_struct *vma = __install_special_mapping(
		mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
		(void *)pages);

	return PTR_ERR_OR_ZERO(vma);
}

/* Serializes mm_take_all_locks() callers against each other. */
static DEFINE_MUTEX(mm_all_locks_mutex);

/*
 * Take anon_vma->root->rwsem for writing unless it was already taken on
 * behalf of this mm_take_all_locks() pass.  Bit 0 of root->rb_root.rb_node
 * is the "already locked" marker.
 */
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
	if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
		/*
		 * The LSB of rb_root.rb_node can't change from under us
		 * because we hold the mm_all_locks_mutex.
		 */
		down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
		/*
		 * We can safely modify rb_root.rb_node after taking the
		 * anon_vma->root->rwsem. If some other vma in this mm shares
		 * the same anon_vma we won't take it again.
		 *
		 * No need of atomic instructions here, rb_root.rb_node
		 * can't change from under us thanks to the
		 * anon_vma->root->rwsem.
		 */
		if (__test_and_set_bit(0, (unsigned long *)
				       &anon_vma->root->rb_root.rb_node))
			BUG();
	}
}

/*
 * Take mapping->i_mmap_rwsem for writing unless AS_MM_ALL_LOCKS shows it
 * was already taken during this mm_take_all_locks() pass.
 */
static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
		/*
		 * AS_MM_ALL_LOCKS can't change from under us because
		 * we hold the mm_all_locks_mutex.
		 *
		 * Operations on ->flags have to be atomic because
		 * even if AS_MM_ALL_LOCKS is stable thanks to the
		 * mm_all_locks_mutex, there may be other cpus
		 * changing other bitflags in parallel to us.
		 */
		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
			BUG();
		down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
	}
}

/*
 * This operation locks against the VM for all pte/vma/mm related
 * operations that could ever happen on a certain mm. This includes
 * vmtruncate, try_to_unmap, and all page faults.
 *
 * The caller must take the mmap_sem in write mode before calling
 * mm_take_all_locks(). The caller isn't allowed to release the
 * mmap_sem until mm_drop_all_locks() returns.
 *
 * mmap_sem in write mode is required in order to block all operations
 * that could modify pagetables and free pages without need of
 * altering the vma layout. It's also needed in write mode to avoid new
 * anon_vmas to be associated with existing vmas.
 *
 * A single task can't take more than one mm_take_all_locks() in a row
 * or it would deadlock.
 *
 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
 * mapping->flags avoid to take the same lock twice, if more than one
 * vma in this mm is backed by the same anon_vma or address_space.
 *
 * We can take all the locks in random order because the VM code
 * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
 * takes more than one of them in a row. Secondly we're protected
 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
 *
 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
 * that may have to take thousand of locks.
 *
 * mm_take_all_locks() can fail if it's interrupted by signals.
 *
 * Returns 0 with all locks and the mm_all_locks_mutex held, or -EINTR
 * after everything taken so far has been dropped again.
 */
int mm_take_all_locks(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	struct anon_vma_chain *avc;

	/* A successful read trylock would mean mmap_sem is not write-held. */
	BUG_ON(down_read_trylock(&mm->mmap_sem));

	mutex_lock(&mm_all_locks_mutex);

	/* First pass: file-backed vmas (i_mmap_rwsem). */
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (signal_pending(current))
			goto out_unlock;
		if (vma->vm_file && vma->vm_file->f_mapping)
			vm_lock_mapping(mm, vma->vm_file->f_mapping);
	}

	/* Second pass: every anon_vma on each vma's chain. */
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (signal_pending(current))
			goto out_unlock;
		if (vma->anon_vma)
			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				vm_lock_anon_vma(mm, avc->anon_vma);
	}

	return 0;

out_unlock:
	/* Also releases the mm_all_locks_mutex. */
	mm_drop_all_locks(mm);
	return -EINTR;
}

/*
 * Undo vm_lock_anon_vma(): a no-op unless the marker bit shows the root
 * was locked by mm_take_all_locks().
 */
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
	if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
		/*
		 * The LSB of rb_root.rb_node can't change to 0 from under
		 * us because we hold the mm_all_locks_mutex.
		 *
		 * We must however clear the bitflag before unlocking
		 * the vma so the users using the anon_vma->rb_root will
		 * never see our bitflag.
		 *
		 * No need of atomic instructions here, rb_root.rb_node
		 * can't change from under us until we release the
		 * anon_vma->root->rwsem.
		 */
		if (!__test_and_clear_bit(0, (unsigned long *)
					  &anon_vma->root->rb_root.rb_node))
			BUG();
		anon_vma_unlock_write(anon_vma);
	}
}

/*
 * Undo vm_lock_mapping(): a no-op unless AS_MM_ALL_LOCKS is set.  The
 * rwsem is released before the flag is cleared.
 */
static void vm_unlock_mapping(struct address_space *mapping)
{
	if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
		/*
		 * AS_MM_ALL_LOCKS can't change to 0 from under us
		 * because we hold the mm_all_locks_mutex.
		 */
		i_mmap_unlock_write(mapping);
		if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
					&mapping->flags))
			BUG();
	}
}

/*
 * The mmap_sem cannot be released by the caller until
 * mm_drop_all_locks() returns.
+ */ +void mm_drop_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct anon_vma_chain *avc; + + BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->anon_vma) + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_unlock_anon_vma(avc->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } + + mutex_unlock(&mm_all_locks_mutex); +} + +/* + * initialise the VMA slab + */ +void __init mmap_init(void) +{ + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); + VM_BUG_ON(ret); +} + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. + */ +static int init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +subsys_initcall(init_user_reserve); + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. 
+ */ +static int init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +subsys_initcall(init_admin_reserve); + +/* + * Reinititalise user and admin reserves if memory is added or removed. + * + * The default user reserve max is 128MB, and the default max for the + * admin reserve is 8MB. These are usually, but not always, enough to + * enable recovery from a memory hogging process using login/sshd, a shell, + * and tools like top. It may make sense to increase or even disable the + * reserve depending on the existence of swap or variations in the recovery + * tools. So, the admin may have changed them. + * + * If memory is added and the reserves have been eliminated or increased above + * the default max, then we'll trust the admin. + * + * If memory is removed and there isn't enough free memory, then we + * need to reset the reserves. + * + * Otherwise keep the reserve set by the admin. + */ +static int reserve_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + unsigned long tmp, free_kbytes; + + switch (action) { + case MEM_ONLINE: + /* Default max is 128MB. Leave alone if modified by operator. */ + tmp = sysctl_user_reserve_kbytes; + if (0 < tmp && tmp < (1UL << 17)) + init_user_reserve(); + + /* Default max is 8MB. Leave alone if modified by operator. 
*/ + tmp = sysctl_admin_reserve_kbytes; + if (0 < tmp && tmp < (1UL << 13)) + init_admin_reserve(); + + break; + case MEM_OFFLINE: + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + if (sysctl_user_reserve_kbytes > free_kbytes) { + init_user_reserve(); + pr_info("vm.user_reserve_kbytes reset to %lu\n", + sysctl_user_reserve_kbytes); + } + + if (sysctl_admin_reserve_kbytes > free_kbytes) { + init_admin_reserve(); + pr_info("vm.admin_reserve_kbytes reset to %lu\n", + sysctl_admin_reserve_kbytes); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block reserve_mem_nb = { + .notifier_call = reserve_mem_notifier, +}; + +static int __meminit init_reserve_notifier(void) +{ + if (register_hotmemory_notifier(&reserve_mem_nb)) + pr_err("Failed registering memory add/remove notifier for admin reserve\n"); + + return 0; +} +subsys_initcall(init_reserve_notifier); diff --git a/kernel/mm/mmu_context.c b/kernel/mm/mmu_context.c new file mode 100644 index 000000000..b1b6f238e --- /dev/null +++ b/kernel/mm/mmu_context.c @@ -0,0 +1,64 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * + * See ../COPYING for licensing terms. + */ + +#include +#include +#include +#include + +#include + +/* + * use_mm + * Makes the calling kernel thread take on the specified + * mm context. + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void use_mm(struct mm_struct *mm) +{ + struct mm_struct *active_mm; + struct task_struct *tsk = current; + + task_lock(tsk); + preempt_disable_rt(); + active_mm = tsk->active_mm; + if (active_mm != mm) { + atomic_inc(&mm->mm_count); + tsk->active_mm = mm; + } + tsk->mm = mm; + switch_mm(active_mm, mm, tsk); + preempt_enable_rt(); + task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif + + if (active_mm != mm) + mmdrop(active_mm); +} +EXPORT_SYMBOL_GPL(use_mm); + +/* + * unuse_mm + * Reverses the effect of use_mm, i.e. 
releases the + * specified mm context which was earlier taken on + * by the calling kernel thread + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void unuse_mm(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + sync_mm_rss(mm); + tsk->mm = NULL; + /* active_mm is still 'mm' */ + enter_lazy_tlb(mm, tsk); + task_unlock(tsk); +} +EXPORT_SYMBOL_GPL(unuse_mm); diff --git a/kernel/mm/mmu_notifier.c b/kernel/mm/mmu_notifier.c new file mode 100644 index 000000000..3b9b3d074 --- /dev/null +++ b/kernel/mm/mmu_notifier.c @@ -0,0 +1,396 @@ +/* + * linux/mm/mmu_notifier.c + * + * Copyright (C) 2008 Qumranet, Inc. + * Copyright (C) 2008 SGI + * Christoph Lameter + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* global SRCU for all MMs */ +static struct srcu_struct srcu; + +/* + * This function allows mmu_notifier::release callback to delay a call to + * a function that will free appropriate resources. The function must be + * quick and must not block. + */ +void mmu_notifier_call_srcu(struct rcu_head *rcu, + void (*func)(struct rcu_head *rcu)) +{ + call_srcu(&srcu, rcu, func); +} +EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); + +void mmu_notifier_synchronize(void) +{ + /* Wait for any running method to finish. */ + srcu_barrier(&srcu); +} +EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); + +/* + * This function can't run concurrently against mmu_notifier_register + * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap + * runs with mm_users == 0. Other tasks may still invoke mmu notifiers + * in parallel despite there being no task using this mm any more, + * through the vmas outside of the exit_mmap context, such as with + * vmtruncate. 
 * This serializes against mmu_notifier_unregister with
 * the mmu_notifier_mm->lock in addition to SRCU and it serializes
 * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
 * can't go away from under us as exit_mmap holds an mm_count pin
 * itself.
 *
 * Sequence: call every registered ->release under the SRCU read lock,
 * unhash all notifiers under the spinlock, leave the read section, then
 * wait for an SRCU grace period.
 */
void __mmu_notifier_release(struct mm_struct *mm)
{
	struct mmu_notifier *mn;
	int id;

	/*
	 * SRCU here will block mmu_notifier_unregister until
	 * ->release returns.
	 */
	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
		/*
		 * If ->release runs before mmu_notifier_unregister it must be
		 * handled, as it's the only way for the driver to flush all
		 * existing sptes and stop the driver from establishing any more
		 * sptes before all the pages in the mm are freed.
		 */
		if (mn->ops->release)
			mn->ops->release(mn, mm);

	/* Empty the list one head entry at a time under the lock. */
	spin_lock(&mm->mmu_notifier_mm->lock);
	while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
		mn = hlist_entry(mm->mmu_notifier_mm->list.first,
				 struct mmu_notifier,
				 hlist);
		/*
		 * We arrived before mmu_notifier_unregister so
		 * mmu_notifier_unregister will do nothing other than to wait
		 * for ->release to finish and for mmu_notifier_unregister to
		 * return.
		 */
		hlist_del_init_rcu(&mn->hlist);
	}
	spin_unlock(&mm->mmu_notifier_mm->lock);
	srcu_read_unlock(&srcu, id);

	/*
	 * synchronize_srcu here prevents mmu_notifier_release from returning to
	 * exit_mmap (which would proceed with freeing all pages in the mm)
	 * until the ->release method returns, if it was invoked by
	 * mmu_notifier_unregister.
	 *
	 * The mmu_notifier_mm can't go away from under us because one mm_count
	 * is held by exit_mmap.
	 */
	synchronize_srcu(&srcu);
}

/*
 * If no young bitflag is supported by the hardware, ->clear_flush_young can
 * unmap the address and return 1 or 0 depending if the mapping previously
 * existed or not.
 */
/*
 * Call ->clear_flush_young on every notifier that implements it and OR
 * the results together; all notifiers are always visited.
 */
int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	struct mmu_notifier *mn;
	int young = 0, id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->clear_flush_young)
			young |= mn->ops->clear_flush_young(mn, mm, start, end);
	}
	srcu_read_unlock(&srcu, id);

	return young;
}

/*
 * Ask the notifiers whether @address is young.  Unlike the clear variant
 * above, the walk stops at the first notifier returning non-zero.
 */
int __mmu_notifier_test_young(struct mm_struct *mm,
			      unsigned long address)
{
	struct mmu_notifier *mn;
	int young = 0, id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->test_young) {
			young = mn->ops->test_young(mn, mm, address);
			if (young)
				break;
		}
	}
	srcu_read_unlock(&srcu, id);

	return young;
}

/* Forward a pte change to every notifier implementing ->change_pte. */
void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
			       pte_t pte)
{
	struct mmu_notifier *mn;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->change_pte)
			mn->ops->change_pte(mn, mm, address, pte);
	}
	srcu_read_unlock(&srcu, id);
}

/* Forward a single-address invalidation to ->invalidate_page. */
void __mmu_notifier_invalidate_page(struct mm_struct *mm,
					  unsigned long address)
{
	struct mmu_notifier *mn;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->invalidate_page)
			mn->ops->invalidate_page(mn, mm, address);
	}
	srcu_read_unlock(&srcu, id);
}

/* Announce the start of an invalidation of [start, end). */
void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
				  unsigned long start, unsigned long end)
{
	struct mmu_notifier *mn;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->invalidate_range_start)
			mn->ops->invalidate_range_start(mn, mm, start, end);
	}
	srcu_read_unlock(&srcu, id);
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);

/*
 * Announce the end of an invalidation.  For each notifier
 * ->invalidate_range is called first, then ->invalidate_range_end.
 */
void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
				  unsigned long start, unsigned long end)
{
	struct mmu_notifier *mn;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
		/*
		 * Call invalidate_range here too to avoid the need for the
		 * subsystem of having to register an invalidate_range_end
		 * call-back when there is invalidate_range already. Usually a
		 * subsystem registers either invalidate_range_start()/end() or
		 * invalidate_range(), so this will be no additional overhead
		 * (besides the pointer check).
		 */
		if (mn->ops->invalidate_range)
			mn->ops->invalidate_range(mn, mm, start, end);
		if (mn->ops->invalidate_range_end)
			mn->ops->invalidate_range_end(mn, mm, start, end);
	}
	srcu_read_unlock(&srcu, id);
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);

/* Forward a range invalidation to ->invalidate_range only. */
void __mmu_notifier_invalidate_range(struct mm_struct *mm,
				  unsigned long start, unsigned long end)
{
	struct mmu_notifier *mn;
	int id;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
		if (mn->ops->invalidate_range)
			mn->ops->invalidate_range(mn, mm, start, end);
	}
	srcu_read_unlock(&srcu, id);
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);

/*
 * Common registration path.
 *
 * @take_mmap_sem: non-zero when this function must take and release
 *                 mm->mmap_sem for writing itself; zero when the caller
 *                 already holds it.
 *
 * Returns 0 on success, -ENOMEM when the mmu_notifier_mm allocation fails,
 * or the error returned by mm_take_all_locks().  On success one mm_count
 * reference has been taken.
 */
static int do_mmu_notifier_register(struct mmu_notifier *mn,
				    struct mm_struct *mm,
				    int take_mmap_sem)
{
	struct mmu_notifier_mm *mmu_notifier_mm;
	int ret;

	BUG_ON(atomic_read(&mm->mm_users) <= 0);

	/*
	 * Verify that mmu_notifier_init() already run and the global srcu is
	 * initialized.
	 */
	BUG_ON(!srcu.per_cpu_ref);

	/* Allocated up front, before any lock is taken. */
	ret = -ENOMEM;
	mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
	if (unlikely(!mmu_notifier_mm))
		goto out;

	if (take_mmap_sem)
		down_write(&mm->mmap_sem);
	ret = mm_take_all_locks(mm);
	if (unlikely(ret))
		goto out_clean;

	if (!mm_has_notifiers(mm)) {
		INIT_HLIST_HEAD(&mmu_notifier_mm->list);
		spin_lock_init(&mmu_notifier_mm->lock);

		mm->mmu_notifier_mm = mmu_notifier_mm;
		/* Ownership moved to the mm; keep the kfree() below a no-op. */
		mmu_notifier_mm = NULL;
	}
	atomic_inc(&mm->mm_count);

	/*
	 * Serialize the update against mmu_notifier_unregister. A
	 * side note: mmu_notifier_release can't run concurrently with
	 * us because we hold the mm_users pin (either implicitly as
	 * current->mm or explicitly with get_task_mm() or similar).
	 * We can't race against any other mmu notifier method either
	 * thanks to mm_take_all_locks().
	 */
	spin_lock(&mm->mmu_notifier_mm->lock);
	hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
	spin_unlock(&mm->mmu_notifier_mm->lock);

	mm_drop_all_locks(mm);
out_clean:
	if (take_mmap_sem)
		up_write(&mm->mmap_sem);
	/* Frees the unused allocation; NULL when it was installed above. */
	kfree(mmu_notifier_mm);
out:
	BUG_ON(atomic_read(&mm->mm_users) <= 0);
	return ret;
}

/*
 * Must not hold mmap_sem nor any other VM related lock when calling
 * this registration function. Must also ensure mm_users can't go down
 * to zero while this runs to avoid races with mmu_notifier_release,
 * so mm has to be current->mm or the mm should be pinned safely such
 * as with get_task_mm(). If the mm is not current->mm, the mm_users
 * pin should be released by calling mmput after mmu_notifier_register
 * returns. mmu_notifier_unregister must be always called to
 * unregister the notifier. mm_count is automatically pinned to allow
 * mmu_notifier_unregister to safely run at any time later, before or
 * after exit_mmap. ->release will always be called before exit_mmap
 * frees the pages.
+ */ +int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + return do_mmu_notifier_register(mn, mm, 1); +} +EXPORT_SYMBOL_GPL(mmu_notifier_register); + +/* + * Same as mmu_notifier_register but here the caller must hold the + * mmap_sem in write mode. + */ +int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + return do_mmu_notifier_register(mn, mm, 0); +} +EXPORT_SYMBOL_GPL(__mmu_notifier_register); + +/* this is called after the last mmu_notifier_unregister() returned */ +void __mmu_notifier_mm_destroy(struct mm_struct *mm) +{ + BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); + kfree(mm->mmu_notifier_mm); + mm->mmu_notifier_mm = LIST_POISON1; /* debug */ +} + +/* + * This releases the mm_count pin automatically and frees the mm + * structure if it was the last user of it. It serializes against + * running mmu notifiers with SRCU and against mmu_notifier_unregister + * with the unregister lock + SRCU. All sptes must be dropped before + * calling mmu_notifier_unregister. ->release or any other notifier + * method may be invoked concurrently with mmu_notifier_unregister, + * and only after mmu_notifier_unregister returned we're guaranteed + * that ->release or any other method can't run anymore. + */ +void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) +{ + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + if (!hlist_unhashed(&mn->hlist)) { + /* + * SRCU here will force exit_mmap to wait for ->release to + * finish before freeing the pages. + */ + int id; + + id = srcu_read_lock(&srcu); + /* + * exit_mmap will block in mmu_notifier_release to guarantee + * that ->release is called before freeing the pages. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + srcu_read_unlock(&srcu, id); + + spin_lock(&mm->mmu_notifier_mm->lock); + /* + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. 
+ */ + hlist_del_init_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); + } + + /* + * Wait for any running method to finish, of course including + * ->release if it was run by mmu_notifier_release instead of us. + */ + synchronize_srcu(&srcu); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister); + +/* + * Same as mmu_notifier_unregister but no callback and no srcu synchronization. + */ +void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + spin_lock(&mm->mmu_notifier_mm->lock); + /* + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. + */ + hlist_del_init_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + +static int __init mmu_notifier_init(void) +{ + return init_srcu_struct(&srcu); +} +subsys_initcall(mmu_notifier_init); diff --git a/kernel/mm/mmzone.c b/kernel/mm/mmzone.c new file mode 100644 index 000000000..7d87ebb0d --- /dev/null +++ b/kernel/mm/mmzone.c @@ -0,0 +1,114 @@ +/* + * linux/mm/mmzone.c + * + * management codes for pgdats, zones and page flags + */ + + +#include +#include +#include + +struct pglist_data *first_online_pgdat(void) +{ + return NODE_DATA(first_online_node); +} + +struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) +{ + int nid = next_online_node(pgdat->node_id); + + if (nid == MAX_NUMNODES) + return NULL; + return NODE_DATA(nid); +} + +/* + * next_zone - helper magic for for_each_zone() + */ +struct zone *next_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else { + pgdat = next_online_pgdat(pgdat); + if (pgdat) + zone = pgdat->node_zones; + else + zone = NULL; + } + return zone; +} + +static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) 
+{ +#ifdef CONFIG_NUMA + return node_isset(zonelist_node_idx(zref), *nodes); +#else + return 1; +#endif /* CONFIG_NUMA */ +} + +/* Returns the next zone at or below highest_zoneidx in a zonelist */ +struct zoneref *next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes) +{ + /* + * Find the next suitable zone to use for the allocation. + * Only filter based on nodemask if it's set + */ + if (likely(nodes == NULL)) + while (zonelist_zone_idx(z) > highest_zoneidx) + z++; + else + while (zonelist_zone_idx(z) > highest_zoneidx || + (z->zone && !zref_in_nodemask(z, nodes))) + z++; + + return z; +} + +#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL +int memmap_valid_within(unsigned long pfn, + struct page *page, struct zone *zone) +{ + if (page_to_pfn(page) != pfn) + return 0; + + if (page_zone(page) != zone) + return 0; + + return 1; +} +#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ + +void lruvec_init(struct lruvec *lruvec) +{ + enum lru_list lru; + + memset(lruvec, 0, sizeof(struct lruvec)); + + for_each_lru(lru) + INIT_LIST_HEAD(&lruvec->lists[lru]); +} + +#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) +int page_cpupid_xchg_last(struct page *page, int cpupid) +{ + unsigned long old_flags, flags; + int last_cpupid; + + do { + old_flags = flags = page->flags; + last_cpupid = page_cpupid_last(page); + + flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); + flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); + + return last_cpupid; +} +#endif diff --git a/kernel/mm/mprotect.c b/kernel/mm/mprotect.c new file mode 100644 index 000000000..88584838e --- /dev/null +++ b/kernel/mm/mprotect.c @@ -0,0 +1,433 @@ +/* + * mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + * (C) Copyright 2002 Christoph Hellwig + * + * Address space accounting code + * (C) Copyright 2002 Red Hat Inc, All Rights Reserved + */ + +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * For a prot_numa update we only hold mmap_sem for read so there is a + * potential race with faulting where a pmd was temporarily none. This + * function checks for a transhuge pmd under the appropriate lock. It + * returns a pte if it was successfully locked or NULL if it raced with + * a transhuge insertion. + */ +static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, int prot_numa, spinlock_t **ptl) +{ + pte_t *pte; + spinlock_t *pmdl; + + /* !prot_numa is protected by mmap_sem held for write */ + if (!prot_numa) + return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + + pmdl = pmd_lock(vma->vm_mm, pmd); + if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { + spin_unlock(pmdl); + return NULL; + } + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + spin_unlock(pmdl); + return pte; +} + +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable, int prot_numa) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte, oldpte; + spinlock_t *ptl; + unsigned long pages = 0; + + pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); + if (!pte) + return 0; + + arch_enter_lazy_mmu_mode(); + do { + oldpte = *pte; + if (pte_present(oldpte)) { + pte_t ptent; + bool preserve_write = prot_numa && pte_write(oldpte); + + /* + * Avoid trapping faults against the zero or KSM + * pages. See similar comment in change_huge_pmd. 
+ */ + if (prot_numa) { + struct page *page; + + page = vm_normal_page(vma, addr, oldpte); + if (!page || PageKsm(page)) + continue; + + /* Avoid TLB flush if possible */ + if (pte_protnone(oldpte)) + continue; + } + + ptent = ptep_modify_prot_start(mm, addr, pte); + ptent = pte_modify(ptent, newprot); + if (preserve_write) + ptent = pte_mkwrite(ptent); + + /* Avoid taking write faults for known dirty pages */ + if (dirty_accountable && pte_dirty(ptent) && + (pte_soft_dirty(ptent) || + !(vma->vm_flags & VM_SOFTDIRTY))) { + ptent = pte_mkwrite(ptent); + } + ptep_modify_prot_commit(mm, addr, pte, ptent); + pages++; + } else if (IS_ENABLED(CONFIG_MIGRATION)) { + swp_entry_t entry = pte_to_swp_entry(oldpte); + + if (is_write_migration_entry(entry)) { + pte_t newpte; + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + set_pte_at(mm, addr, pte, newpte); + + pages++; + } + } + } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); + + return pages; +} + +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, + pud_t *pud, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) +{ + pmd_t *pmd; + struct mm_struct *mm = vma->vm_mm; + unsigned long next; + unsigned long pages = 0; + unsigned long nr_huge_updates = 0; + unsigned long mni_start = 0; + + pmd = pmd_offset(pud, addr); + do { + unsigned long this_pages; + + next = pmd_addr_end(addr, end); + if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) + continue; + + /* invoke the mmu notifier if the pmd is populated */ + if (!mni_start) { + mni_start = addr; + mmu_notifier_invalidate_range_start(mm, mni_start, end); + } + + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma, addr, 
pmd); + else { + int nr_ptes = change_huge_pmd(vma, pmd, addr, + newprot, prot_numa); + + if (nr_ptes) { + if (nr_ptes == HPAGE_PMD_NR) { + pages += HPAGE_PMD_NR; + nr_huge_updates++; + } + + /* huge pmd was handled */ + continue; + } + } + /* fall through, the trans huge pmd just split */ + } + this_pages = change_pte_range(vma, pmd, addr, next, newprot, + dirty_accountable, prot_numa); + pages += this_pages; + } while (pmd++, addr = next, addr != end); + + if (mni_start) + mmu_notifier_invalidate_range_end(mm, mni_start, end); + + if (nr_huge_updates) + count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); + return pages; +} + +static inline unsigned long change_pud_range(struct vm_area_struct *vma, + pgd_t *pgd, unsigned long addr, unsigned long end, + pgprot_t newprot, int dirty_accountable, int prot_numa) +{ + pud_t *pud; + unsigned long next; + unsigned long pages = 0; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + pages += change_pmd_range(vma, pud, addr, next, newprot, + dirty_accountable, prot_numa); + } while (pud++, addr = next, addr != end); + + return pages; +} + +static unsigned long change_protection_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable, int prot_numa) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + unsigned long next; + unsigned long start = addr; + unsigned long pages = 0; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + set_tlb_flush_pending(mm); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + pages += change_pud_range(vma, pgd, addr, next, newprot, + dirty_accountable, prot_numa); + } while (pgd++, addr = next, addr != end); + + /* Only flush the TLB if we actually modified any entries: */ + if (pages) + flush_tlb_range(vma, start, end); + clear_tlb_flush_pending(mm); + + return pages; +} + 
+unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot, + int dirty_accountable, int prot_numa) +{ + unsigned long pages; + + if (is_vm_hugetlb_page(vma)) + pages = hugetlb_change_protection(vma, start, end, newprot); + else + pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); + + return pages; +} + +int +mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long oldflags = vma->vm_flags; + long nrpages = (end - start) >> PAGE_SHIFT; + unsigned long charged = 0; + pgoff_t pgoff; + int error; + int dirty_accountable = 0; + + if (newflags == oldflags) { + *pprev = vma; + return 0; + } + + /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we + * make it unwritable again. hugetlb mapping were accounted for + * even if read-only so there is no need to account for them here + */ + if (newflags & VM_WRITE) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| + VM_SHARED|VM_NORESERVE))) { + charged = nrpages; + if (security_vm_enough_memory_mm(mm, charged)) + return -ENOMEM; + newflags |= VM_ACCOUNT; + } + } + + /* + * First try to merge with previous and/or next vma. + */ + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *pprev = vma_merge(mm, *pprev, start, end, newflags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + if (*pprev) { + vma = *pprev; + goto success; + } + + *pprev = vma; + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto fail; + } + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto fail; + } + +success: + /* + * vm_flags and vm_page_prot are protected by the mmap_sem + * held in write mode. 
+ */ + vma->vm_flags = newflags; + dirty_accountable = vma_wants_writenotify(vma); + vma_set_page_prot(vma); + + change_protection(vma, start, end, vma->vm_page_prot, + dirty_accountable, 0); + + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); + vm_stat_account(mm, newflags, vma->vm_file, nrpages); + perf_event_mmap(vma); + return 0; + +fail: + vm_unacct_memory(charged); + return error; +} + +SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, + unsigned long, prot) +{ + unsigned long vm_flags, nstart, end, tmp, reqprot; + struct vm_area_struct *vma, *prev; + int error = -EINVAL; + const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); + if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ + return -EINVAL; + + if (start & ~PAGE_MASK) + return -EINVAL; + if (!len) + return 0; + len = PAGE_ALIGN(len); + end = start + len; + if (end <= start) + return -ENOMEM; + if (!arch_validate_prot(prot)) + return -EINVAL; + + reqprot = prot; + /* + * Does the application expect PROT_READ to imply PROT_EXEC: + */ + if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) + prot |= PROT_EXEC; + + vm_flags = calc_vm_prot_bits(prot); + + down_write(¤t->mm->mmap_sem); + + vma = find_vma(current->mm, start); + error = -ENOMEM; + if (!vma) + goto out; + prev = vma->vm_prev; + if (unlikely(grows & PROT_GROWSDOWN)) { + if (vma->vm_start >= end) + goto out; + start = vma->vm_start; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; + } else { + if (vma->vm_start > start) + goto out; + if (unlikely(grows & PROT_GROWSUP)) { + end = vma->vm_end; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSUP)) + goto out; + } + } + if (start > vma->vm_start) + prev = vma; + + for (nstart = start ; ; ) { + unsigned long newflags; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
*/ + + newflags = vm_flags; + newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + + /* newflags >> 4 shift VM_MAY% in place of VM_% */ + if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { + error = -EACCES; + goto out; + } + + error = security_file_mprotect(vma, reqprot, prot); + if (error) + goto out; + + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); + if (error) + goto out; + nstart = tmp; + + if (nstart < prev->vm_end) + nstart = prev->vm_end; + if (nstart >= end) + goto out; + + vma = prev->vm_next; + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + goto out; + } + } +out: + up_write(¤t->mm->mmap_sem); + return error; +} diff --git a/kernel/mm/mremap.c b/kernel/mm/mremap.c new file mode 100644 index 000000000..034e2d360 --- /dev/null +++ b/kernel/mm/mremap.c @@ -0,0 +1,584 @@ +/* + * mm/mremap.c + * + * (C) Copyright 1996 Linus Torvalds + * + * Address space accounting code + * (C) Copyright 2002 Red Hat Inc, All Rights Reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "internal.h" + +static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (pgd_none_or_clear_bad(pgd)) + return NULL; + + pud = pud_offset(pgd, addr); + if (pud_none_or_clear_bad(pud)) + return NULL; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return NULL; + + return pmd; +} + +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return NULL; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return NULL; + + VM_BUG_ON(pmd_trans_huge(*pmd)); + + return pmd; +} + +static 
pte_t move_soft_dirty_pte(pte_t pte) +{ + /* + * Set soft dirty bit so we can notice + * in userspace the ptes were moved. + */ +#ifdef CONFIG_MEM_SOFT_DIRTY + if (pte_present(pte)) + pte = pte_mksoft_dirty(pte); + else if (is_swap_pte(pte)) + pte = pte_swp_mksoft_dirty(pte); +#endif + return pte; +} + +static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, + unsigned long old_addr, unsigned long old_end, + struct vm_area_struct *new_vma, pmd_t *new_pmd, + unsigned long new_addr, bool need_rmap_locks) +{ + struct address_space *mapping = NULL; + struct anon_vma *anon_vma = NULL; + struct mm_struct *mm = vma->vm_mm; + pte_t *old_pte, *new_pte, pte; + spinlock_t *old_ptl, *new_ptl; + + /* + * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma + * locks to ensure that rmap will always observe either the old or the + * new ptes. This is the easiest way to avoid races with + * truncate_pagecache(), page migration, etc... + * + * When need_rmap_locks is false, we use other ways to avoid + * such races: + * + * - During exec() shift_arg_pages(), we use a specially tagged vma + * which rmap call sites look for using is_vma_temporary_stack(). + * + * - During mremap(), new_vma is often known to be placed after vma + * in rmap traversal order. This ensures rmap will always observe + * either the old pte, or the new pte, or both (the page table locks + * serialize access to individual ptes, but only rmap traversal + * order guarantees that we won't miss both the old and new ptes). + */ + if (need_rmap_locks) { + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + i_mmap_lock_write(mapping); + } + if (vma->anon_vma) { + anon_vma = vma->anon_vma; + anon_vma_lock_write(anon_vma); + } + } + + /* + * We don't have to worry about the ordering of src and dst + * pte locks because exclusive mmap_sem prevents deadlock. 
+ */ + old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); + new_pte = pte_offset_map(new_pmd, new_addr); + new_ptl = pte_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); + + for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, + new_pte++, new_addr += PAGE_SIZE) { + if (pte_none(*old_pte)) + continue; + pte = ptep_get_and_clear(mm, old_addr, old_pte); + pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); + pte = move_soft_dirty_pte(pte); + set_pte_at(mm, new_addr, new_pte, pte); + } + + arch_leave_lazy_mmu_mode(); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + pte_unmap(new_pte - 1); + pte_unmap_unlock(old_pte - 1, old_ptl); + if (anon_vma) + anon_vma_unlock_write(anon_vma); + if (mapping) + i_mmap_unlock_write(mapping); +} + +#define LATENCY_LIMIT (64 * PAGE_SIZE) + +unsigned long move_page_tables(struct vm_area_struct *vma, + unsigned long old_addr, struct vm_area_struct *new_vma, + unsigned long new_addr, unsigned long len, + bool need_rmap_locks) +{ + unsigned long extent, next, old_end; + pmd_t *old_pmd, *new_pmd; + bool need_flush = false; + unsigned long mmun_start; /* For mmu_notifiers */ + unsigned long mmun_end; /* For mmu_notifiers */ + + old_end = old_addr + len; + flush_cache_range(vma, old_addr, old_end); + + mmun_start = old_addr; + mmun_end = old_end; + mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); + + for (; old_addr < old_end; old_addr += extent, new_addr += extent) { + cond_resched(); + next = (old_addr + PMD_SIZE) & PMD_MASK; + /* even if next overflowed, extent below will be ok */ + extent = next - old_addr; + if (extent > old_end - old_addr) + extent = old_end - old_addr; + old_pmd = get_old_pmd(vma->vm_mm, old_addr); + if (!old_pmd) + continue; + new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); + if (!new_pmd) + break; + if (pmd_trans_huge(*old_pmd)) { + int err = 0; + if (extent == HPAGE_PMD_SIZE) 
{ + VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, + vma); + /* See comment in move_ptes() */ + if (need_rmap_locks) + anon_vma_lock_write(vma->anon_vma); + err = move_huge_pmd(vma, new_vma, old_addr, + new_addr, old_end, + old_pmd, new_pmd); + if (need_rmap_locks) + anon_vma_unlock_write(vma->anon_vma); + } + if (err > 0) { + need_flush = true; + continue; + } else if (!err) { + split_huge_page_pmd(vma, old_addr, old_pmd); + } + VM_BUG_ON(pmd_trans_huge(*old_pmd)); + } + if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, + new_pmd, new_addr)) + break; + next = (new_addr + PMD_SIZE) & PMD_MASK; + if (extent > next - new_addr) + extent = next - new_addr; + if (extent > LATENCY_LIMIT) + extent = LATENCY_LIMIT; + move_ptes(vma, old_pmd, old_addr, old_addr + extent, + new_vma, new_pmd, new_addr, need_rmap_locks); + need_flush = true; + } + if (likely(need_flush)) + flush_tlb_range(vma, old_end-len, old_addr); + + mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + + return len + old_addr - old_end; /* how much done */ +} + +static unsigned long move_vma(struct vm_area_struct *vma, + unsigned long old_addr, unsigned long old_len, + unsigned long new_len, unsigned long new_addr, bool *locked) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *new_vma; + unsigned long vm_flags = vma->vm_flags; + unsigned long new_pgoff; + unsigned long moved_len; + unsigned long excess = 0; + unsigned long hiwater_vm; + int split = 0; + int err; + bool need_rmap_locks; + + /* + * We'd prefer to avoid failure later on in do_munmap: + * which may split one vma into three before unmapping. + */ + if (mm->map_count >= sysctl_max_map_count - 3) + return -ENOMEM; + + /* + * Advise KSM to break any KSM pages in the area to be moved: + * it would be confusing if they were to turn up at the new + * location, where they happen to coincide with different KSM + * pages recently unmapped. 
But leave vma->vm_flags as it was, + * so KSM can come around to merge on vma and new_vma afterwards. + */ + err = ksm_madvise(vma, old_addr, old_addr + old_len, + MADV_UNMERGEABLE, &vm_flags); + if (err) + return err; + + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, + &need_rmap_locks); + if (!new_vma) + return -ENOMEM; + + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, + need_rmap_locks); + if (moved_len < old_len) { + /* + * On error, move entries back from new area to old, + * which will succeed since page tables still there, + * and then proceed to unmap new area instead of old. + */ + move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, + true); + vma = new_vma; + old_len = new_len; + old_addr = new_addr; + new_addr = -ENOMEM; + } else if (vma->vm_file && vma->vm_file->f_op->mremap) { + err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + if (err < 0) { + move_page_tables(new_vma, new_addr, vma, old_addr, + moved_len, true); + return err; + } + } + + /* Conceal VM_ACCOUNT so old reservation is not undone */ + if (vm_flags & VM_ACCOUNT) { + vma->vm_flags &= ~VM_ACCOUNT; + excess = vma->vm_end - vma->vm_start - old_len; + if (old_addr > vma->vm_start && + old_addr + old_len < vma->vm_end) + split = 1; + } + + /* + * If we failed to move page tables we still do total_vm increment + * since do_munmap() will decrement it by old_len == new_len. + * + * Since total_vm is about to be raised artificially high for a + * moment, we need to restore high watermark afterwards: if stats + * are taken meanwhile, total_vm and hiwater_vm appear too high. + * If this were a serious issue, we'd add a flag to do_munmap(). 
+ */ + hiwater_vm = mm->hiwater_vm; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + + if (do_munmap(mm, old_addr, old_len) < 0) { + /* OOM: unable to split vma, just get accounts right */ + vm_unacct_memory(excess >> PAGE_SHIFT); + excess = 0; + } + mm->hiwater_vm = hiwater_vm; + + /* Restore VM_ACCOUNT if one or two pieces of vma left */ + if (excess) { + vma->vm_flags |= VM_ACCOUNT; + if (split) + vma->vm_next->vm_flags |= VM_ACCOUNT; + } + + if (vm_flags & VM_LOCKED) { + mm->locked_vm += new_len >> PAGE_SHIFT; + *locked = true; + } + + return new_addr; +} + +static struct vm_area_struct *vma_to_resize(unsigned long addr, + unsigned long old_len, unsigned long new_len, unsigned long *p) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = find_vma(mm, addr); + + if (!vma || vma->vm_start > addr) + return ERR_PTR(-EFAULT); + + if (is_vm_hugetlb_page(vma)) + return ERR_PTR(-EINVAL); + + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + return ERR_PTR(-EFAULT); + + /* Need to be careful about a growing mapping */ + if (new_len > old_len) { + unsigned long pgoff; + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) + return ERR_PTR(-EFAULT); + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + return ERR_PTR(-EINVAL); + } + + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked, lock_limit; + locked = mm->locked_vm << PAGE_SHIFT; + lock_limit = rlimit(RLIMIT_MEMLOCK); + locked += new_len - old_len; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return ERR_PTR(-EAGAIN); + } + + if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) + return ERR_PTR(-ENOMEM); + + if (vma->vm_flags & VM_ACCOUNT) { + unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; + if (security_vm_enough_memory_mm(mm, charged)) + return ERR_PTR(-ENOMEM); + *p = charged; + } + + return vma; +} + +static unsigned long 
mremap_to(unsigned long addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len, bool *locked) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + unsigned long charged = 0; + unsigned long map_flags; + + if (new_addr & ~PAGE_MASK) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. + */ + if ((new_addr <= addr) && (new_addr+new_len) > addr) + goto out; + + if ((addr <= new_addr) && (addr+old_len) > new_addr) + goto out; + + ret = do_munmap(mm, new_addr, new_len); + if (ret) + goto out; + + if (old_len >= new_len) { + ret = do_munmap(mm, addr+new_len, old_len - new_len); + if (ret && old_len != new_len) + goto out; + old_len = new_len; + } + + vma = vma_to_resize(addr, old_len, new_len, &charged); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } + + map_flags = MAP_FIXED; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + + ((addr - vma->vm_start) >> PAGE_SHIFT), + map_flags); + if (ret & ~PAGE_MASK) + goto out1; + + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); + if (!(ret & ~PAGE_MASK)) + goto out; +out1: + vm_unacct_memory(charged); + +out: + return ret; +} + +static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) +{ + unsigned long end = vma->vm_end + delta; + if (end < vma->vm_end) /* overflow */ + return 0; + if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ + return 0; + if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, + 0, MAP_FIXED) & ~PAGE_MASK) + return 0; + return 1; +} + +/* + * Expand (or shrink) an existing mapping, potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * MREMAP_FIXED option added 5-Dec-1999 by 
Benjamin LaHaise + * This option implies MREMAP_MAYMOVE. + */ +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + unsigned long charged = 0; + bool locked = false; + + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) + return ret; + + if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) + return ret; + + if (addr & ~PAGE_MASK) + return ret; + + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + /* + * We allow a zero old-len as a special case + * for DOS-emu "duplicate shm area" thing. But + * a zero new-len is nonsensical. + */ + if (!new_len) + return ret; + + down_write(¤t->mm->mmap_sem); + + if (flags & MREMAP_FIXED) { + ret = mremap_to(addr, old_len, new_addr, new_len, + &locked); + goto out; + } + + /* + * Always allow a shrinking remap: that just unmaps + * the unnecessary pages.. + * do_munmap does all the needed commit accounting + */ + if (old_len >= new_len) { + ret = do_munmap(mm, addr+new_len, old_len - new_len); + if (ret && old_len != new_len) + goto out; + ret = addr; + goto out; + } + + /* + * Ok, we need to grow.. + */ + vma = vma_to_resize(addr, old_len, new_len, &charged); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } + + /* old_len exactly to the end of the area.. + */ + if (old_len == vma->vm_end - addr) { + /* can we just expand the current mapping? 
*/ + if (vma_expandable(vma, new_len - old_len)) { + int pages = (new_len - old_len) >> PAGE_SHIFT; + + if (vma_adjust(vma, vma->vm_start, addr + new_len, + vma->vm_pgoff, NULL)) { + ret = -ENOMEM; + goto out; + } + + vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); + if (vma->vm_flags & VM_LOCKED) { + mm->locked_vm += pages; + locked = true; + new_addr = addr; + } + ret = addr; + goto out; + } + } + + /* + * We weren't able to just expand or shrink the area, + * we need to create a new one and move it.. + */ + ret = -ENOMEM; + if (flags & MREMAP_MAYMOVE) { + unsigned long map_flags = 0; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, + vma->vm_pgoff + + ((addr - vma->vm_start) >> PAGE_SHIFT), + map_flags); + if (new_addr & ~PAGE_MASK) { + ret = new_addr; + goto out; + } + + ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); + } +out: + if (ret & ~PAGE_MASK) + vm_unacct_memory(charged); + up_write(¤t->mm->mmap_sem); + if (locked && new_len > old_len) + mm_populate(new_addr + old_len, new_len - old_len); + return ret; +} diff --git a/kernel/mm/msync.c b/kernel/mm/msync.c new file mode 100644 index 000000000..bb04d53ae --- /dev/null +++ b/kernel/mm/msync.c @@ -0,0 +1,107 @@ +/* + * linux/mm/msync.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * The msync() system call. + */ +#include +#include +#include +#include +#include +#include + +/* + * MS_SYNC syncs the entire file - including mappings. + * + * MS_ASYNC does not start I/O (it used to, up to 2.5.67). + * Nor does it marks the relevant pages dirty (it used to up to 2.6.17). + * Now it doesn't do anything, since dirty pages are properly tracked. + * + * The application may now run fsync() to + * write out the dirty pages and wait on the writeout and check the result. + * Or the application may run fadvise(FADV_DONTNEED) against the fd to start + * async writeout immediately. 
+ * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to + * applications. + */ +SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) +{ + unsigned long end; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int unmapped_error = 0; + int error = -EINVAL; + + if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) + goto out; + if (start & ~PAGE_MASK) + goto out; + if ((flags & MS_ASYNC) && (flags & MS_SYNC)) + goto out; + error = -ENOMEM; + len = (len + ~PAGE_MASK) & PAGE_MASK; + end = start + len; + if (end < start) + goto out; + error = 0; + if (end == start) + goto out; + /* + * If the interval [start,end) covers some unmapped address ranges, + * just ignore them, but return -ENOMEM at the end. + */ + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + for (;;) { + struct file *file; + loff_t fstart, fend; + + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out_unlock; + /* Here start < vma->vm_end. */ + if (start < vma->vm_start) { + start = vma->vm_start; + if (start >= end) + goto out_unlock; + unmapped_error = -ENOMEM; + } + /* Here vma->vm_start <= start < vma->vm_end. */ + if ((flags & MS_INVALIDATE) && + (vma->vm_flags & VM_LOCKED)) { + error = -EBUSY; + goto out_unlock; + } + file = vma->vm_file; + fstart = (start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + fend = fstart + (min(end, vma->vm_end) - start) - 1; + start = vma->vm_end; + if ((flags & MS_SYNC) && file && + (vma->vm_flags & VM_SHARED)) { + get_file(file); + up_read(&mm->mmap_sem); + error = vfs_fsync_range(file, fstart, fend, 1); + fput(file); + if (error || start >= end) + goto out; + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + } else { + if (start >= end) { + error = 0; + goto out_unlock; + } + vma = vma->vm_next; + } + } +out_unlock: + up_read(&mm->mmap_sem); +out: + return error ? 
: unmapped_error; +} diff --git a/kernel/mm/nobootmem.c b/kernel/mm/nobootmem.c new file mode 100644 index 000000000..90b504683 --- /dev/null +++ b/kernel/mm/nobootmem.c @@ -0,0 +1,438 @@ +/* + * bootmem - A boot-time physical memory allocator and configurator + * + * Copyright (C) 1999 Ingo Molnar + * 1999 Kanoj Sarcar, SGI + * 2008 Johannes Weiner + * + * Access to this subsystem has to be serialized externally (which is true + * for the boot process anyway). + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" + +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data; +EXPORT_SYMBOL(contig_page_data); +#endif + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; + +static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + void *ptr; + u64 addr; + + if (limit > memblock.current_limit) + limit = memblock.current_limit; + + addr = memblock_find_in_range_node(size, align, goal, limit, nid); + if (!addr) + return NULL; + + if (memblock_reserve(addr, size)) + return NULL; + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); + return ptr; +} + +/* + * free_bootmem_late - free bootmem pages directly to page allocator + * @addr: starting address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. 
+ */ +void __init free_bootmem_late(unsigned long addr, unsigned long size) +{ + unsigned long cursor, end; + + kmemleak_free_part(__va(addr), size); + + cursor = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int order; + + while (start < end) { + order = min(MAX_ORDER - 1UL, __ffs(start)); + + while (start + (1UL << order) > end) + order--; + + __free_pages_bootmem(pfn_to_page(start), order); + + start += (1UL << order); + } +} + +static unsigned long __init __free_memory_core(phys_addr_t start, + phys_addr_t end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + + if (start_pfn > end_pfn) + return 0; + + __free_pages_memory(start_pfn, end_pfn); + + return end_pfn - start_pfn; +} + +static unsigned long __init free_low_memory_core_early(void) +{ + unsigned long count = 0; + phys_addr_t start, end; + u64 i; + + memblock_clear_hotplug(0, -1); + + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) + count += __free_memory_core(start, end); + +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + { + phys_addr_t size; + + /* Free memblock.reserved array if it was allocated */ + size = get_allocated_memblock_reserved_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); + + /* Free memblock.memory array if it was allocated */ + size = get_allocated_memblock_memory_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); + } +#endif + + return count; +} + +static int reset_managed_pages_done __initdata; + +void reset_node_managed_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + z->managed_pages = 0; +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data 
*pgdat; + + if (reset_managed_pages_done) + return; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + + reset_managed_pages_done = 1; +} + +/** + * free_all_bootmem - release free pages to the buddy allocator + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem(void) +{ + unsigned long pages; + + reset_all_zones_managed_pages(); + + /* + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesn't have RAM installed + * low ram will be on Node1 + */ + pages = free_low_memory_core_early(); + totalram_pages += pages; + + return pages; +} + +/** + * free_bootmem_node - mark a page range as usable + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must reside completely on the specified node. + */ +void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) +{ + memblock_free(physaddr, size); +} + +/** + * free_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must be contiguous but may span node boundaries. 
+ */ +void __init free_bootmem(unsigned long addr, unsigned long size) +{ + memblock_free(addr, size); +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +} + +/** + * __alloc_bootmem_nopanic - allocate boot memory without panicking + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * Returns NULL on failure. + */ +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem_nopanic(size, align, goal, limit); +} + +static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); + + if (mem) + return mem; + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +/** + * __alloc_bootmem - allocate boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. 
+ */ +void * __init __alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem(size, align, goal, limit); +} + +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + +again: + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, limit); + if (ptr) + return ptr; + + ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, + goal, limit); + if (ptr) + return ptr; + + if (goal) { + goal = 0; + goto again; + } + + return NULL; +} + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); +} + +static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal, + unsigned long limit) +{ + void *ptr; + + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit); + if (ptr) + return ptr; + + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +/** + * __alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. 
+ */ +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node(pgdat, size, align, goal, 0); +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + return __alloc_bootmem_node(pgdat, size, align, goal); +} + +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif + +/** + * __alloc_bootmem_low - allocate low boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +} + +void * __init __alloc_bootmem_low_nopanic(unsigned long size, + unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem_nopanic(size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); +} + +/** + * __alloc_bootmem_low_node - allocate low boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. 
+ */ +void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + return ___alloc_bootmem_node(pgdat, size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); +} diff --git a/kernel/mm/nommu.c b/kernel/mm/nommu.c new file mode 100644 index 000000000..e544508e2 --- /dev/null +++ b/kernel/mm/nommu.c @@ -0,0 +1,2181 @@ +/* + * linux/mm/nommu.c + * + * Replacement code for mm functions to support CPU's that don't + * have any form of memory management unit (thus no virtual memory). + * + * See Documentation/nommu-mmap.txt + * + * Copyright (c) 2004-2008 David Howells + * Copyright (c) 2000-2003 David McCullough + * Copyright (c) 2000-2001 D Jeff Dionne + * Copyright (c) 2002 Greg Ungerer + * Copyright (c) 2007-2010 Paul Mundt + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "internal.h" + +#if 0 +#define kenter(FMT, ...) \ + printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) \ + printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) \ + printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) +#else +#define kenter(FMT, ...) \ + no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) +#define kleave(FMT, ...) \ + no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) +#define kdebug(FMT, ...) 
\ + no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) +#endif + +void *high_memory; +EXPORT_SYMBOL(high_memory); +struct page *mem_map; +unsigned long max_mapnr; +EXPORT_SYMBOL(max_mapnr); +unsigned long highest_memmap_pfn; +struct percpu_counter vm_committed_as; +int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ +int sysctl_overcommit_ratio = 50; /* default is 50% */ +unsigned long sysctl_overcommit_kbytes __read_mostly; +int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; +int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ +int heap_stack_gap = 0; + +atomic_long_t mmap_pages_allocated; + +/* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. + */ +unsigned long vm_memory_committed(void) +{ + return percpu_counter_read_positive(&vm_committed_as); +} + +EXPORT_SYMBOL_GPL(vm_memory_committed); + +EXPORT_SYMBOL(mem_map); + +/* list of mapped, potentially shareable regions */ +static struct kmem_cache *vm_region_jar; +struct rb_root nommu_region_tree = RB_ROOT; +DECLARE_RWSEM(nommu_region_sem); + +const struct vm_operations_struct generic_file_vm_ops = { +}; + +/* + * Return the total memory allocated for this pointer, not + * just what the caller asked for. + * + * Doesn't have to be accurate, i.e. may have races. 
+ */ +unsigned int kobjsize(const void *objp) +{ + struct page *page; + + /* + * If the object we have should not have ksize performed on it, + * return size of 0 + */ + if (!objp || !virt_addr_valid(objp)) + return 0; + + page = virt_to_head_page(objp); + + /* + * If the allocator sets PageSlab, we know the pointer came from + * kmalloc(). + */ + if (PageSlab(page)) + return ksize(objp); + + /* + * If it's not a compound page, see if we have a matching VMA + * region. This test is intentionally done in reverse order, + * so if there's no VMA, we still fall through and hand back + * PAGE_SIZE for 0-order pages. + */ + if (!PageCompound(page)) { + struct vm_area_struct *vma; + + vma = find_vma(current->mm, (unsigned long)objp); + if (vma) + return vma->vm_end - vma->vm_start; + } + + /* + * The ksize() function is only guaranteed to work for pointers + * returned by kmalloc(). So handle arbitrary pointers here. + */ + return PAGE_SIZE << compound_order(page); +} + +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int foll_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) +{ + struct vm_area_struct *vma; + unsigned long vm_flags; + int i; + + /* calculate required read or write permissions. + * If FOLL_FORCE is set, we only require the "MAY" flags. + */ + vm_flags = (foll_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (foll_flags & FOLL_FORCE) ? 
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + + for (i = 0; i < nr_pages; i++) { + vma = find_vma(mm, start); + if (!vma) + goto finish_or_fault; + + /* protect what we can, including chardevs */ + if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) + goto finish_or_fault; + + if (pages) { + pages[i] = virt_to_page(start); + if (pages[i]) + page_cache_get(pages[i]); + } + if (vmas) + vmas[i] = vma; + start = (start + PAGE_SIZE) & PAGE_MASK; + } + + return i; + +finish_or_fault: + return i ? : -EFAULT; +} + +/* + * get a list of pages in an address range belonging to the specified process + * and indicate the VMA that covers each page + * - this is potentially dodgy as we may end incrementing the page count of a + * slab page or a secondary page from a compound page + * - don't permit access to VMAs that don't support it, such as I/O mappings + */ +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); +} +EXPORT_SYMBOL(get_user_pages); + +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return get_user_pages(tsk, mm, start, nr_pages, write, force, + pages, NULL); +} +EXPORT_SYMBOL(get_user_pages_locked); + +long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags) +{ + long ret; + down_read(&mm->mmap_sem); + ret = get_user_pages(tsk, mm, start, nr_pages, write, force, + pages, NULL); + up_read(&mm->mmap_sem); + return ret; +} 
+EXPORT_SYMBOL(__get_user_pages_unlocked); + +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages, 0); +} +EXPORT_SYMBOL(get_user_pages_unlocked); + +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + *pfn = address >> PAGE_SHIFT; + return 0; +} +EXPORT_SYMBOL(follow_pfn); + +LIST_HEAD(vmap_area_list); + +void vfree(const void *addr) +{ + kfree(addr); +} +EXPORT_SYMBOL(vfree); + +void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +{ + /* + * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() + * returns only a logical address. 
+ */ + return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); +} +EXPORT_SYMBOL(__vmalloc); + +void *vmalloc_user(unsigned long size) +{ + void *ret; + + ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); + if (ret) { + struct vm_area_struct *vma; + + down_write(¤t->mm->mmap_sem); + vma = find_vma(current->mm, (unsigned long)ret); + if (vma) + vma->vm_flags |= VM_USERMAP; + up_write(¤t->mm->mmap_sem); + } + + return ret; +} +EXPORT_SYMBOL(vmalloc_user); + +struct page *vmalloc_to_page(const void *addr) +{ + return virt_to_page(addr); +} +EXPORT_SYMBOL(vmalloc_to_page); + +unsigned long vmalloc_to_pfn(const void *addr) +{ + return page_to_pfn(virt_to_page(addr)); +} +EXPORT_SYMBOL(vmalloc_to_pfn); + +long vread(char *buf, char *addr, unsigned long count) +{ + /* Don't allow overflow */ + if ((unsigned long) buf + count < count) + count = -(unsigned long) buf; + + memcpy(buf, addr, count); + return count; +} + +long vwrite(char *buf, char *addr, unsigned long count) +{ + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + + memcpy(addr, buf, count); + return count; +} + +/* + * vmalloc - allocate virtually continguos memory + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into continguos kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vmalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); +} +EXPORT_SYMBOL(vmalloc); + +/* + * vzalloc - allocate virtually continguos memory with zero fill + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into continguos kernel virtual space. + * The memory allocated is set to zero. 
+ * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); +} +EXPORT_SYMBOL(vzalloc); + +/** + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vmalloc_node(unsigned long size, int node) +{ + return vmalloc(size); +} +EXPORT_SYMBOL(vmalloc_node); + +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return vzalloc(size); +} +EXPORT_SYMBOL(vzalloc_node); + +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + +/** + * vmalloc_exec - allocate virtually contiguous, executable memory + * @size: allocation size + * + * Kernel-internal function to allocate enough pages to cover @size + * the page level allocator and map them into contiguous and + * executable kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. 
+ */ + +void *vmalloc_exec(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); +} + +/** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size + * + * Allocate enough 32bit PA addressable pages to cover @size from the + * page level allocator and map them into continguos kernel virtual space. + */ +void *vmalloc_32(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); +} +EXPORT_SYMBOL(vmalloc_32); + +/** + * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory + * @size: allocation size + * + * The resulting memory area is 32bit addressable and zeroed so it can be + * mapped to userspace without leaking data. + * + * VM_USERMAP is set on the corresponding VMA so that subsequent calls to + * remap_vmalloc_range() are permissible. + */ +void *vmalloc_32_user(unsigned long size) +{ + /* + * We'll have to sort out the ZONE_DMA bits for 64-bit, + * but for now this can simply use vmalloc_user() directly. + */ + return vmalloc_user(size); +} +EXPORT_SYMBOL(vmalloc_32_user); + +void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL(vmap); + +void vunmap(const void *addr) +{ + BUG(); +} +EXPORT_SYMBOL(vunmap); + +void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL(vm_map_ram); + +void vm_unmap_ram(const void *mem, unsigned int count) +{ + BUG(); +} +EXPORT_SYMBOL(vm_unmap_ram); + +void vm_unmap_aliases(void) +{ +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + +/* + * Implement a stub for vmalloc_sync_all() if the architecture chose not to + * have one. 
+ */ +void __weak vmalloc_sync_all(void) +{ +} + +/** + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * + * Returns: NULL on failure, vm_struct on success + * + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. If the kernel address space is not shared + * between processes, it syncs the pagetable across all + * processes. + */ +struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL_GPL(alloc_vm_area); + +void free_vm_area(struct vm_struct *area) +{ + BUG(); +} +EXPORT_SYMBOL_GPL(free_vm_area); + +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_page); + +/* + * sys_brk() for the most part doesn't need the global kernel + * lock, except when an application is doing something nasty + * like trying to un-brk an area that has already been mapped + * to a regular file. in this case, the unmapping will need + * to invoke file system routines that need the global lock. + */ +SYSCALL_DEFINE1(brk, unsigned long, brk) +{ + struct mm_struct *mm = current->mm; + + if (brk < mm->start_brk || brk > mm->context.end_brk) + return mm->brk; + + if (mm->brk == brk) + return mm->brk; + + /* + * Always allow shrinking brk + */ + if (brk <= mm->brk) { + mm->brk = brk; + return brk; + } + + /* + * Ok, looks good - let it rip. 
+ */ + flush_icache_range(mm->brk, brk); + return mm->brk = brk; +} + +/* + * initialise the VMA and region record slabs + */ +void __init mmap_init(void) +{ + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); + VM_BUG_ON(ret); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); +} + +/* + * validate the region tree + * - the caller must hold the region lock + */ +#ifdef CONFIG_DEBUG_NOMMU_REGIONS +static noinline void validate_nommu_regions(void) +{ + struct vm_region *region, *last; + struct rb_node *p, *lastp; + + lastp = rb_first(&nommu_region_tree); + if (!lastp) + return; + + last = rb_entry(lastp, struct vm_region, vm_rb); + BUG_ON(unlikely(last->vm_end <= last->vm_start)); + BUG_ON(unlikely(last->vm_top < last->vm_end)); + + while ((p = rb_next(lastp))) { + region = rb_entry(p, struct vm_region, vm_rb); + last = rb_entry(lastp, struct vm_region, vm_rb); + + BUG_ON(unlikely(region->vm_end <= region->vm_start)); + BUG_ON(unlikely(region->vm_top < region->vm_end)); + BUG_ON(unlikely(region->vm_start < last->vm_top)); + + lastp = p; + } +} +#else +static void validate_nommu_regions(void) +{ +} +#endif + +/* + * add a region into the global tree + */ +static void add_nommu_region(struct vm_region *region) +{ + struct vm_region *pregion; + struct rb_node **p, *parent; + + validate_nommu_regions(); + + parent = NULL; + p = &nommu_region_tree.rb_node; + while (*p) { + parent = *p; + pregion = rb_entry(parent, struct vm_region, vm_rb); + if (region->vm_start < pregion->vm_start) + p = &(*p)->rb_left; + else if (region->vm_start > pregion->vm_start) + p = &(*p)->rb_right; + else if (pregion == region) + return; + else + BUG(); + } + + rb_link_node(®ion->vm_rb, parent, p); + rb_insert_color(®ion->vm_rb, &nommu_region_tree); + + validate_nommu_regions(); +} + +/* + * delete a region from the global tree + */ +static void delete_nommu_region(struct vm_region *region) +{ + BUG_ON(!nommu_region_tree.rb_node); + + validate_nommu_regions(); + 
rb_erase(®ion->vm_rb, &nommu_region_tree); + validate_nommu_regions(); +} + +/* + * free a contiguous series of pages + */ +static void free_page_series(unsigned long from, unsigned long to) +{ + for (; from < to; from += PAGE_SIZE) { + struct page *page = virt_to_page(from); + + kdebug("- free %lx", from); + atomic_long_dec(&mmap_pages_allocated); + if (page_count(page) != 1) + kdebug("free page %p: refcount not one: %d", + page, page_count(page)); + put_page(page); + } +} + +/* + * release a reference to a region + * - the caller must hold the region semaphore for writing, which this releases + * - the region may not have been added to the tree yet, in which case vm_top + * will equal vm_start + */ +static void __put_nommu_region(struct vm_region *region) + __releases(nommu_region_sem) +{ + kenter("%p{%d}", region, region->vm_usage); + + BUG_ON(!nommu_region_tree.rb_node); + + if (--region->vm_usage == 0) { + if (region->vm_top > region->vm_start) + delete_nommu_region(region); + up_write(&nommu_region_sem); + + if (region->vm_file) + fput(region->vm_file); + + /* IO memory and memory shared directly out of the pagecache + * from ramfs/tmpfs mustn't be released here */ + if (region->vm_flags & VM_MAPPED_COPY) { + kdebug("free series"); + free_page_series(region->vm_start, region->vm_top); + } + kmem_cache_free(vm_region_jar, region); + } else { + up_write(&nommu_region_sem); + } +} + +/* + * release a reference to a region + */ +static void put_nommu_region(struct vm_region *region) +{ + down_write(&nommu_region_sem); + __put_nommu_region(region); +} + +/* + * update protection on a vma + */ +static void protect_vma(struct vm_area_struct *vma, unsigned long flags) +{ +#ifdef CONFIG_MPU + struct mm_struct *mm = vma->vm_mm; + long start = vma->vm_start & PAGE_MASK; + while (start < vma->vm_end) { + protect_page(mm, start, flags); + start += PAGE_SIZE; + } + update_protections(mm); +#endif +} + +/* + * add a VMA into a process's mm_struct in the appropriate place in 
the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_sem held writelocked + */ +static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) +{ + struct vm_area_struct *pvma, *prev; + struct address_space *mapping; + struct rb_node **p, *parent, *rb_prev; + + kenter(",%p", vma); + + BUG_ON(!vma->vm_region); + + mm->map_count++; + vma->vm_mm = mm; + + protect_vma(vma, vma->vm_flags); + + /* add the VMA to the mapping */ + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + + i_mmap_lock_write(mapping); + flush_dcache_mmap_lock(mapping); + vma_interval_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + /* add the VMA to the tree */ + parent = rb_prev = NULL; + p = &mm->mm_rb.rb_node; + while (*p) { + parent = *p; + pvma = rb_entry(parent, struct vm_area_struct, vm_rb); + + /* sort by: start addr, end addr, VMA struct addr in that order + * (the latter is necessary as we may get identical VMAs) */ + if (vma->vm_start < pvma->vm_start) + p = &(*p)->rb_left; + else if (vma->vm_start > pvma->vm_start) { + rb_prev = parent; + p = &(*p)->rb_right; + } else if (vma->vm_end < pvma->vm_end) + p = &(*p)->rb_left; + else if (vma->vm_end > pvma->vm_end) { + rb_prev = parent; + p = &(*p)->rb_right; + } else if (vma < pvma) + p = &(*p)->rb_left; + else if (vma > pvma) { + rb_prev = parent; + p = &(*p)->rb_right; + } else + BUG(); + } + + rb_link_node(&vma->vm_rb, parent, p); + rb_insert_color(&vma->vm_rb, &mm->mm_rb); + + /* add VMA to the VMA list also */ + prev = NULL; + if (rb_prev) + prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); + + __vma_link_list(mm, vma, prev, parent); +} + +/* + * delete a VMA from its owning mm_struct and address space + */ +static void delete_vma_from_mm(struct vm_area_struct *vma) +{ + int i; + struct address_space *mapping; + struct mm_struct *mm = vma->vm_mm; + struct task_struct 
*curr = current; + + kenter("%p", vma); + + protect_vma(vma, 0); + + mm->map_count--; + for (i = 0; i < VMACACHE_SIZE; i++) { + /* if the vma is cached, invalidate the entire cache */ + if (curr->vmacache[i] == vma) { + vmacache_invalidate(mm); + break; + } + } + + /* remove the VMA from the mapping */ + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + + i_mmap_lock_write(mapping); + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + /* remove from the MM's tree and list */ + rb_erase(&vma->vm_rb, &mm->mm_rb); + + if (vma->vm_prev) + vma->vm_prev->vm_next = vma->vm_next; + else + mm->mmap = vma->vm_next; + + if (vma->vm_next) + vma->vm_next->vm_prev = vma->vm_prev; +} + +/* + * destroy a VMA record + */ +static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + kenter("%p", vma); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) + fput(vma->vm_file); + put_nommu_region(vma->vm_region); + kmem_cache_free(vm_area_cachep, vma); +} + +/* + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_sem at least held readlocked + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + + /* check the cache first */ + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; + + /* trawl the list (there may be multiple mappings in which addr + * resides) */ + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end > addr) { + vmacache_update(addr, vma); + return vma; + } + } + + return NULL; +} +EXPORT_SYMBOL(find_vma); + +/* + * find a VMA + * - we don't extend stack VMAs under NOMMU conditions + */ +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + return find_vma(mm, addr); +} + +/* + * expand a stack to a 
given address + * - not supported under NOMMU conditions + */ +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return -ENOMEM; +} + +/* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_sem at least held readlocked + */ +static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + unsigned long end = addr + len; + + /* check the cache first */ + vma = vmacache_find_exact(mm, addr, end); + if (vma) + return vma; + + /* trawl the list (there may be multiple mappings in which addr + * resides) */ + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_start < addr) + continue; + if (vma->vm_start > addr) + return NULL; + if (vma->vm_end == end) { + vmacache_update(addr, vma); + return vma; + } + } + + return NULL; +} + +/* + * determine whether a mapping should be permitted and, if so, what sort of + * mapping we're capable of supporting + */ +static int validate_mmap_request(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long prot, + unsigned long flags, + unsigned long pgoff, + unsigned long *_capabilities) +{ + unsigned long capabilities, rlen; + int ret; + + /* do the simple checks first */ + if (flags & MAP_FIXED) { + printk(KERN_DEBUG + "%d: Can't do fixed-address/overlay mmap of RAM\n", + current->pid); + return -EINVAL; + } + + if ((flags & MAP_TYPE) != MAP_PRIVATE && + (flags & MAP_TYPE) != MAP_SHARED) + return -EINVAL; + + if (!len) + return -EINVAL; + + /* Careful about overflows.. */ + rlen = PAGE_ALIGN(len); + if (!rlen || rlen > TASK_SIZE) + return -ENOMEM; + + /* offset overflow? 
*/ + if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) + return -EOVERFLOW; + + if (file) { + /* files must support mmap */ + if (!file->f_op->mmap) + return -ENODEV; + + /* work out if what we've got could possibly be shared + * - we support chardevs that provide their own "memory" + * - we support files/blockdevs that are memory backed + */ + if (file->f_op->mmap_capabilities) { + capabilities = file->f_op->mmap_capabilities(file); + } else { + /* no explicit capabilities set, so assume some + * defaults */ + switch (file_inode(file)->i_mode & S_IFMT) { + case S_IFREG: + case S_IFBLK: + capabilities = NOMMU_MAP_COPY; + break; + + case S_IFCHR: + capabilities = + NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | + NOMMU_MAP_WRITE; + break; + + default: + return -EINVAL; + } + } + + /* eliminate any capabilities that we can't support on this + * device */ + if (!file->f_op->get_unmapped_area) + capabilities &= ~NOMMU_MAP_DIRECT; + if (!(file->f_mode & FMODE_CAN_READ)) + capabilities &= ~NOMMU_MAP_COPY; + + /* The file shall have been opened with read permission. 
*/ + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + + if (flags & MAP_SHARED) { + /* do checks for writing, appending and locking */ + if ((prot & PROT_WRITE) && + !(file->f_mode & FMODE_WRITE)) + return -EACCES; + + if (IS_APPEND(file_inode(file)) && + (file->f_mode & FMODE_WRITE)) + return -EACCES; + + if (locks_verify_locked(file)) + return -EAGAIN; + + if (!(capabilities & NOMMU_MAP_DIRECT)) + return -ENODEV; + + /* we mustn't privatise shared mappings */ + capabilities &= ~NOMMU_MAP_COPY; + } else { + /* we're going to read the file into private memory we + * allocate */ + if (!(capabilities & NOMMU_MAP_COPY)) + return -ENODEV; + + /* we don't permit a private writable mapping to be + * shared with the backing device */ + if (prot & PROT_WRITE) + capabilities &= ~NOMMU_MAP_DIRECT; + } + + if (capabilities & NOMMU_MAP_DIRECT) { + if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) || + ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) || + ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC)) + ) { + capabilities &= ~NOMMU_MAP_DIRECT; + if (flags & MAP_SHARED) { + printk(KERN_WARNING + "MAP_SHARED not completely supported on !MMU\n"); + return -EINVAL; + } + } + } + + /* handle executable mappings and implied executable + * mappings */ + if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { + if (prot & PROT_EXEC) + return -EPERM; + } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { + /* handle implication of PROT_EXEC by PROT_READ */ + if (current->personality & READ_IMPLIES_EXEC) { + if (capabilities & NOMMU_MAP_EXEC) + prot |= PROT_EXEC; + } + } else if ((prot & PROT_READ) && + (prot & PROT_EXEC) && + !(capabilities & NOMMU_MAP_EXEC) + ) { + /* backing file is not executable, try to copy */ + capabilities &= ~NOMMU_MAP_DIRECT; + } + } else { + /* anonymous mappings are always memory backed and can be + * privately mapped + */ + capabilities = NOMMU_MAP_COPY; + + /* handle PROT_EXEC implication by PROT_READ */ + if ((prot & 
PROT_READ) && + (current->personality & READ_IMPLIES_EXEC)) + prot |= PROT_EXEC; + } + + /* allow the security API to have its say */ + ret = security_mmap_addr(addr); + if (ret < 0) + return ret; + + /* looks okay */ + *_capabilities = capabilities; + return 0; +} + +/* + * we've determined that we can make the mapping, now translate what we + * now know into VMA flags + */ +static unsigned long determine_vm_flags(struct file *file, + unsigned long prot, + unsigned long flags, + unsigned long capabilities) +{ + unsigned long vm_flags; + + vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); + /* vm_flags |= mm->def_flags; */ + + if (!(capabilities & NOMMU_MAP_DIRECT)) { + /* attempt to share read-only copies of mapped file chunks */ + vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + if (file && !(prot & PROT_WRITE)) + vm_flags |= VM_MAYSHARE; + } else { + /* overlay a shareable mapping on the backing device or inode + * if possible - used for chardevs, ramfs/tmpfs/shmfs and + * romfs/cramfs */ + vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); + if (flags & MAP_SHARED) + vm_flags |= VM_SHARED; + } + + /* refuse to let anyone share private mappings with this process if + * it's being traced - otherwise breakpoints set in it may interfere + * with another untraced process + */ + if ((flags & MAP_PRIVATE) && current->ptrace) + vm_flags &= ~VM_MAYSHARE; + + return vm_flags; +} + +/* + * set up a shared mapping on a file (the driver or filesystem provides and + * pins the storage) + */ +static int do_mmap_shared_file(struct vm_area_struct *vma) +{ + int ret; + + ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + if (ret == 0) { + vma->vm_region->vm_top = vma->vm_region->vm_end; + return 0; + } + if (ret != -ENOSYS) + return ret; + + /* getting -ENOSYS indicates that direct mmap isn't possible (as + * opposed to tried but failed) so we can only give a suitable error as + * it's not possible to make a private copy if MAP_SHARED was given */ + 
return -ENODEV; +} + +/* + * set up a private mapping or an anonymous shared mapping + */ +static int do_mmap_private(struct vm_area_struct *vma, + struct vm_region *region, + unsigned long len, + unsigned long capabilities) +{ + unsigned long total, point; + void *base; + int ret, order; + + /* invoke the file's mapping function so that it can keep track of + * shared mappings on devices or memory + * - VM_MAYSHARE will be set if it may attempt to share + */ + if (capabilities & NOMMU_MAP_DIRECT) { + ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); + if (ret == 0) { + /* shouldn't return success if we're not sharing */ + BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); + vma->vm_region->vm_top = vma->vm_region->vm_end; + return 0; + } + if (ret != -ENOSYS) + return ret; + + /* getting an ENOSYS error indicates that direct mmap isn't + * possible (as opposed to tried but failed) so we'll try to + * make a private copy of the data and map that instead */ + } + + + /* allocate some memory to hold the mapping + * - note that this may not return a page-aligned address if the object + * we're allocating is smaller than a page + */ + order = get_order(len); + kdebug("alloc order %d for %lx", order, len); + + total = 1 << order; + point = len >> PAGE_SHIFT; + + /* we don't want to allocate a power-of-2 sized page set */ + if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { + total = point; + kdebug("try to alloc exact %lu pages", total); + } + + base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); + if (!base) + goto enomem; + + atomic_long_add(total, &mmap_pages_allocated); + + region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + region->vm_start = (unsigned long) base; + region->vm_end = region->vm_start + len; + region->vm_top = region->vm_start + (total << PAGE_SHIFT); + + vma->vm_start = region->vm_start; + vma->vm_end = region->vm_start + len; + + if (vma->vm_file) { + /* read the contents of a file into the copy */ + mm_segment_t old_fs; + loff_t 
fpos; + + fpos = vma->vm_pgoff; + fpos <<= PAGE_SHIFT; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = __vfs_read(vma->vm_file, base, len, &fpos); + set_fs(old_fs); + + if (ret < 0) + goto error_free; + + /* clear the last little bit */ + if (ret < len) + memset(base + ret, 0, len - ret); + + } + + return 0; + +error_free: + free_page_series(region->vm_start, region->vm_top); + region->vm_start = vma->vm_start = 0; + region->vm_end = vma->vm_end = 0; + region->vm_top = 0; + return ret; + +enomem: + pr_err("Allocation of length %lu from process %d (%s) failed\n", + len, current->pid, current->comm); + show_free_areas(0); + return -ENOMEM; +} + +/* + * handle mapping creation for uClinux + */ +unsigned long do_mmap_pgoff(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long prot, + unsigned long flags, + unsigned long pgoff, + unsigned long *populate) +{ + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *rb; + unsigned long capabilities, vm_flags, result; + int ret; + + kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); + + *populate = 0; + + /* decide whether we should attempt the mapping, and if so what sort of + * mapping */ + ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, + &capabilities); + if (ret < 0) { + kleave(" = %d [val]", ret); + return ret; + } + + /* we ignore the address hint */ + addr = 0; + len = PAGE_ALIGN(len); + + /* we've determined that we can make the mapping, now translate what we + * now know into VMA flags */ + vm_flags = determine_vm_flags(file, prot, flags, capabilities); + + /* we're going to need to record the mapping */ + region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); + if (!region) + goto error_getting_region; + + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!vma) + goto error_getting_vma; + + region->vm_usage = 1; + region->vm_flags = vm_flags; + region->vm_pgoff = pgoff; + + INIT_LIST_HEAD(&vma->anon_vma_chain); + vma->vm_flags = 
vm_flags; + vma->vm_pgoff = pgoff; + + if (file) { + region->vm_file = get_file(file); + vma->vm_file = get_file(file); + } + + down_write(&nommu_region_sem); + + /* if we want to share, we need to check for regions created by other + * mmap() calls that overlap with our proposed mapping + * - we can only share with a superset match on most regular files + * - shared mappings on character devices and memory backed files are + * permitted to overlap inexactly as far as we are concerned for in + * these cases, sharing is handled in the driver or filesystem rather + * than here + */ + if (vm_flags & VM_MAYSHARE) { + struct vm_region *pregion; + unsigned long pglen, rpglen, pgend, rpgend, start; + + pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgend = pgoff + pglen; + + for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { + pregion = rb_entry(rb, struct vm_region, vm_rb); + + if (!(pregion->vm_flags & VM_MAYSHARE)) + continue; + + /* search for overlapping mappings on the same file */ + if (file_inode(pregion->vm_file) != + file_inode(file)) + continue; + + if (pregion->vm_pgoff >= pgend) + continue; + + rpglen = pregion->vm_end - pregion->vm_start; + rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; + rpgend = pregion->vm_pgoff + rpglen; + if (pgoff >= rpgend) + continue; + + /* handle inexactly overlapping matches between + * mappings */ + if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && + !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { + /* new mapping is not a subset of the region */ + if (!(capabilities & NOMMU_MAP_DIRECT)) + goto sharing_violation; + continue; + } + + /* we've found a region we can share */ + pregion->vm_usage++; + vma->vm_region = pregion; + start = pregion->vm_start; + start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; + vma->vm_start = start; + vma->vm_end = start + len; + + if (pregion->vm_flags & VM_MAPPED_COPY) { + kdebug("share copy"); + vma->vm_flags |= VM_MAPPED_COPY; + } else { + kdebug("share mmap"); + ret = 
do_mmap_shared_file(vma); + if (ret < 0) { + vma->vm_region = NULL; + vma->vm_start = 0; + vma->vm_end = 0; + pregion->vm_usage--; + pregion = NULL; + goto error_just_free; + } + } + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + region = pregion; + result = start; + goto share; + } + + /* obtain the address at which to make a shared mapping + * - this is the hook for quasi-memory character devices to + * tell us the location of a shared mapping + */ + if (capabilities & NOMMU_MAP_DIRECT) { + addr = file->f_op->get_unmapped_area(file, addr, len, + pgoff, flags); + if (IS_ERR_VALUE(addr)) { + ret = addr; + if (ret != -ENOSYS) + goto error_just_free; + + /* the driver refused to tell us where to site + * the mapping so we'll have to attempt to copy + * it */ + ret = -ENODEV; + if (!(capabilities & NOMMU_MAP_COPY)) + goto error_just_free; + + capabilities &= ~NOMMU_MAP_DIRECT; + } else { + vma->vm_start = region->vm_start = addr; + vma->vm_end = region->vm_end = addr + len; + } + } + } + + vma->vm_region = region; + + /* set up the mapping + * - the region is filled in if NOMMU_MAP_DIRECT is still set + */ + if (file && vma->vm_flags & VM_SHARED) + ret = do_mmap_shared_file(vma); + else + ret = do_mmap_private(vma, region, len, capabilities); + if (ret < 0) + goto error_just_free; + add_nommu_region(region); + + /* clear anonymous mappings that don't ask for uninitialized data */ + if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) + memset((void *)region->vm_start, 0, + region->vm_end - region->vm_start); + + /* okay... 
we have a mapping; now we have to register it */ + result = vma->vm_start; + + current->mm->total_vm += len >> PAGE_SHIFT; + +share: + add_vma_to_mm(current->mm, vma); + + /* we flush the region from the icache only when the first executable + * mapping of it is made */ + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { + flush_icache_range(region->vm_start, region->vm_end); + region->vm_icache_flushed = true; + } + + up_write(&nommu_region_sem); + + kleave(" = %lx", result); + return result; + +error_just_free: + up_write(&nommu_region_sem); +error: + if (region->vm_file) + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + if (vma->vm_file) + fput(vma->vm_file); + kmem_cache_free(vm_area_cachep, vma); + kleave(" = %d", ret); + return ret; + +sharing_violation: + up_write(&nommu_region_sem); + printk(KERN_WARNING "Attempt to share mismatched mappings\n"); + ret = -EINVAL; + goto error; + +error_getting_vma: + kmem_cache_free(vm_region_jar, region); + printk(KERN_WARNING "Allocation of vma for %lu byte allocation" + " from process %d failed\n", + len, current->pid); + show_free_areas(0); + return -ENOMEM; + +error_getting_region: + printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" + " from process %d failed\n", + len, current->pid); + show_free_areas(0); + return -ENOMEM; +} + +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + audit_mmap_fd(fd, flags); + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + goto out; + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); + + if (file) + fput(file); +out: + return retval; +} + +#ifdef __ARCH_WANT_SYS_OLD_MMAP +struct mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + 
unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) +{ + struct mmap_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); +} +#endif /* __ARCH_WANT_SYS_OLD_MMAP */ + +/* + * split a vma into two pieces at address 'addr', a new vma is allocated either + * for the first part or the tail. + */ +int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) +{ + struct vm_area_struct *new; + struct vm_region *region; + unsigned long npages; + + kenter(""); + + /* we're only permitted to split anonymous regions (these should have + * only a single usage on the region) */ + if (vma->vm_file) + return -ENOMEM; + + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); + if (!region) + return -ENOMEM; + + new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!new) { + kmem_cache_free(vm_region_jar, region); + return -ENOMEM; + } + + /* most fields are the same, copy all, and then fixup */ + *new = *vma; + *region = *vma->vm_region; + new->vm_region = region; + + npages = (addr - vma->vm_start) >> PAGE_SHIFT; + + if (new_below) { + region->vm_top = region->vm_end = new->vm_end = addr; + } else { + region->vm_start = new->vm_start = addr; + region->vm_pgoff = new->vm_pgoff += npages; + } + + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + delete_vma_from_mm(vma); + down_write(&nommu_region_sem); + delete_nommu_region(vma->vm_region); + if (new_below) { + vma->vm_region->vm_start = vma->vm_start = addr; + vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; + } else { + vma->vm_region->vm_end = vma->vm_end = addr; + vma->vm_region->vm_top = addr; + } + add_nommu_region(vma->vm_region); + add_nommu_region(new->vm_region); + 
up_write(&nommu_region_sem); + add_vma_to_mm(mm, vma); + add_vma_to_mm(mm, new); + return 0; +} + +/* + * shrink a VMA by removing the specified chunk from either the beginning or + * the end + */ +static int shrink_vma(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long from, unsigned long to) +{ + struct vm_region *region; + + kenter(""); + + /* adjust the VMA's pointers, which may reposition it in the MM's tree + * and list */ + delete_vma_from_mm(vma); + if (from > vma->vm_start) + vma->vm_end = from; + else + vma->vm_start = to; + add_vma_to_mm(mm, vma); + + /* cut the backing region down to size */ + region = vma->vm_region; + BUG_ON(region->vm_usage != 1); + + down_write(&nommu_region_sem); + delete_nommu_region(region); + if (from > region->vm_start) { + to = region->vm_top; + region->vm_top = region->vm_end = from; + } else { + region->vm_start = to; + } + add_nommu_region(region); + up_write(&nommu_region_sem); + + free_page_series(from, to); + return 0; +} + +/* + * release a mapping + * - under NOMMU conditions the chunk to be unmapped must be backed by a single + * VMA, though it need not cover the whole VMA + */ +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + struct vm_area_struct *vma; + unsigned long end; + int ret; + + kenter(",%lx,%zx", start, len); + + len = PAGE_ALIGN(len); + if (len == 0) + return -EINVAL; + + end = start + len; + + /* find the first potentially overlapping VMA */ + vma = find_vma(mm, start); + if (!vma) { + static int limit; + if (limit < 5) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d" + " (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); + limit++; + } + return -EINVAL; + } + + /* we're allowed to split an anonymous VMA but not a file-backed one */ + if (vma->vm_file) { + do { + if (start > vma->vm_start) { + kleave(" = -EINVAL [miss]"); + return -EINVAL; + } + if (end == vma->vm_end) + goto erase_whole_vma; + vma = 
vma->vm_next; + } while (vma); + kleave(" = -EINVAL [split file]"); + return -EINVAL; + } else { + /* the chunk must be a subset of the VMA found */ + if (start == vma->vm_start && end == vma->vm_end) + goto erase_whole_vma; + if (start < vma->vm_start || end > vma->vm_end) { + kleave(" = -EINVAL [superset]"); + return -EINVAL; + } + if (start & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned start]"); + return -EINVAL; + } + if (end != vma->vm_end && end & ~PAGE_MASK) { + kleave(" = -EINVAL [unaligned split]"); + return -EINVAL; + } + if (start != vma->vm_start && end != vma->vm_end) { + ret = split_vma(mm, vma, start, 1); + if (ret < 0) { + kleave(" = %d [split]", ret); + return ret; + } + } + return shrink_vma(mm, vma, start, end); + } + +erase_whole_vma: + delete_vma_from_mm(vma); + delete_vma(mm, vma); + kleave(" = 0"); + return 0; +} +EXPORT_SYMBOL(do_munmap); + +int vm_munmap(unsigned long addr, size_t len) +{ + struct mm_struct *mm = current->mm; + int ret; + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(vm_munmap); + +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) +{ + return vm_munmap(addr, len); +} + +/* + * release all the mappings made in a process's VM space + */ +void exit_mmap(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + if (!mm) + return; + + kenter(""); + + mm->total_vm = 0; + + while ((vma = mm->mmap)) { + mm->mmap = vma->vm_next; + delete_vma_from_mm(vma); + delete_vma(mm, vma); + cond_resched(); + } + + kleave(""); +} + +unsigned long vm_brk(unsigned long addr, unsigned long len) +{ + return -ENOMEM; +} + +/* + * expand (or shrink) an existing mapping, potentially moving it at the same + * time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * under NOMMU conditions, we only permit changing a mapping's size, and only + * as long as it stays within the region allocated by do_mmap_private() and the + * block is not shareable + * 
+ * MREMAP_FIXED is not supported under NOMMU conditions + */ +static unsigned long do_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + struct vm_area_struct *vma; + + /* insanity checks first */ + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + if (old_len == 0 || new_len == 0) + return (unsigned long) -EINVAL; + + if (addr & ~PAGE_MASK) + return -EINVAL; + + if (flags & MREMAP_FIXED && new_addr != addr) + return (unsigned long) -EINVAL; + + vma = find_vma_exact(current->mm, addr, old_len); + if (!vma) + return (unsigned long) -EINVAL; + + if (vma->vm_end != vma->vm_start + old_len) + return (unsigned long) -EFAULT; + + if (vma->vm_flags & VM_MAYSHARE) + return (unsigned long) -EPERM; + + if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) + return (unsigned long) -ENOMEM; + + /* all checks complete - do it */ + vma->vm_end = vma->vm_start + new_len; + return vma->vm_start; +} + +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) +{ + unsigned long ret; + + down_write(¤t->mm->mmap_sem); + ret = do_mremap(addr, old_len, new_len, flags, new_addr); + up_write(¤t->mm->mmap_sem); + return ret; +} + +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + *page_mask = 0; + return NULL; +} + +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + if (addr != (pfn << PAGE_SHIFT)) + return -EINVAL; + + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + return 0; +} +EXPORT_SYMBOL(remap_pfn_range); + +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long pfn = start >> PAGE_SHIFT; + unsigned long vm_len = vma->vm_end - vma->vm_start; + + pfn += vma->vm_pgoff; 
+ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + unsigned int size = vma->vm_end - vma->vm_start; + + if (!(vma->vm_flags & VM_USERMAP)) + return -EINVAL; + + vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT)); + vma->vm_end = vma->vm_start + size; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_range); + +unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + return -ENOMEM; +} + +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, + int even_cows) +{ +} +EXPORT_SYMBOL(unmap_mapping_range); + +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. + * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. 
+ */ +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +{ + long free, allowed, reserve; + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); + + free += get_nr_swap_pages(); + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += global_page_state(NR_SLAB_RECLAIMABLE); + + /* + * Leave reserved pages. The pages are not for anonymous pages. 
+ */ + if (free <= totalreserve_pages) + goto error; + else + free -= totalreserve_pages; + + /* + * Reserve some for root + */ + if (!cap_sys_admin) + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + if (free > pages) + return 0; + + goto error; + } + + allowed = vm_commit_limit(); + /* + * Reserve some 3% for root + */ + if (!cap_sys_admin) + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min_t(long, mm->total_vm / 32, reserve); + } + + if (percpu_counter_read_positive(&vm_committed_as) < allowed) + return 0; + +error: + vm_unacct_memory(pages); + + return -ENOMEM; +} + +int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + BUG(); + return 0; +} +EXPORT_SYMBOL(filemap_fault); + +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + BUG(); +} +EXPORT_SYMBOL(filemap_map_pages); + +static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, int write) +{ + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + + /* the access must start within one of the target process's mappings */ + vma = find_vma(mm, addr); + if (vma) { + /* don't overrun this mapping */ + if (addr + len >= vma->vm_end) + len = vma->vm_end - addr; + + /* only read or write mappings where it is permitted */ + if (write && vma->vm_flags & VM_MAYWRITE) + copy_to_user_page(vma, NULL, addr, + (void *) addr, buf, len); + else if (!write && vma->vm_flags & VM_MAYREAD) + copy_from_user_page(vma, NULL, addr, + buf, (void *) addr, len); + else + len = 0; + } else { + len = 0; + } + + up_read(&mm->mmap_sem); + + return len; +} + +/** + * @access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer 
/*
 * Read from or write to another task's address space.
 * - @buf is the kernel-space source (write) or destination (read)
 * - returns the number of bytes transferred; 0 if @addr + @len wraps
 *   around or the task has no mm
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
{
	struct mm_struct *target;
	int copied = 0;

	/* only proceed if the end of the range does not wrap */
	if (addr + len >= addr) {
		target = get_task_mm(tsk);
		if (target) {
			copied = __access_remote_vm(tsk, target, addr, buf,
						    len, write);
			/* balance the reference from get_task_mm() */
			mmput(target);
		}
	}

	return copied;
}
+ */ +int nommu_shrink_inode_mappings(struct inode *inode, size_t size, + size_t newsize) +{ + struct vm_area_struct *vma; + struct vm_region *region; + pgoff_t low, high; + size_t r_size, r_top; + + low = newsize >> PAGE_SHIFT; + high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + + down_write(&nommu_region_sem); + i_mmap_lock_read(inode->i_mapping); + + /* search for VMAs that fall within the dead zone */ + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { + /* found one - only interested if it's shared out of the page + * cache */ + if (vma->vm_flags & VM_SHARED) { + i_mmap_unlock_read(inode->i_mapping); + up_write(&nommu_region_sem); + return -ETXTBSY; /* not quite true, but near enough */ + } + } + + /* reduce any regions that overlap the dead zone - if in existence, + * these will be pointed to by VMAs that don't overlap the dead zone + * + * we don't check for any regions that start beyond the EOF as there + * shouldn't be any + */ + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) { + if (!(vma->vm_flags & VM_SHARED)) + continue; + + region = vma->vm_region; + r_size = region->vm_top - region->vm_start; + r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; + + if (r_top > newsize) { + region->vm_top -= r_top - newsize; + if (region->vm_end > region->vm_top) + region->vm_end = region->vm_top; + } + } + + i_mmap_unlock_read(inode->i_mapping); + up_write(&nommu_region_sem); + return 0; +} + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. 
+ */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) diff --git a/kernel/mm/oom_kill.c b/kernel/mm/oom_kill.c new file mode 100644 index 000000000..2b665da1b --- /dev/null +++ b/kernel/mm/oom_kill.c @@ -0,0 +1,832 @@ +/* + * linux/mm/oom_kill.c + * + * Copyright (C) 1998,2000 Rik van Riel + * Thanks go out to Claus Fischer for some serious inspiration and + * for goading me into coding this file... + * Copyright (C) 2010 Google, Inc. + * Rewritten by David Rientjes + * + * The routines in this file are used to kill a process when + * we're seriously out of memory. This gets called from __alloc_pages() + * in mm/page_alloc.c when we really run out of memory. + * + * Since we won't call these routines often (on a well-configured + * machine) this file will double as a 'coding guide' and a signpost + * for newbie kernel hackers. It features several pointers to major + * kernel subsystems and hints as to where to find out what things do. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +int sysctl_panic_on_oom; +int sysctl_oom_kill_allocating_task; +int sysctl_oom_dump_tasks = 1; +static DEFINE_SPINLOCK(zone_scan_lock); + +#ifdef CONFIG_NUMA +/** + * has_intersects_mems_allowed() - check task eligiblity for kill + * @start: task struct of which task to consider + * @mask: nodemask passed to page allocator for mempolicy ooms + * + * Task eligibility is determined by whether or not a candidate task, @tsk, + * shares the same mempolicy nodes as current if it is bound by such a policy + * and whether or not it has the same set of allowed cpuset nodes. + */ +static bool has_intersects_mems_allowed(struct task_struct *start, + const nodemask_t *mask) +{ + struct task_struct *tsk; + bool ret = false; + + rcu_read_lock(); + for_each_thread(start, tsk) { + if (mask) { + /* + * If this is a mempolicy constrained oom, tsk's + * cpuset is irrelevant. Only return true if its + * mempolicy intersects current, otherwise it may be + * needlessly killed. + */ + ret = mempolicy_nodemask_intersects(tsk, mask); + } else { + /* + * This is not a mempolicy constrained oom, so only + * check the mems of tsk's cpuset. + */ + ret = cpuset_mems_allowed_intersects(current, tsk); + } + if (ret) + break; + } + rcu_read_unlock(); + + return ret; +} +#else +static bool has_intersects_mems_allowed(struct task_struct *tsk, + const nodemask_t *mask) +{ + return true; +} +#endif /* CONFIG_NUMA */ + +/* + * The process p may have detached its own ->mm while exiting or through + * use_mm(), but one or more of its subthreads may still have a valid + * pointer. Return p, or any of its subthreads with a valid ->mm, with + * task_lock() held. 
+ *
+ * Returns NULL, with no task lock held, when no thread of p has an mm.
+ * On success the caller has to task_unlock() the returned task.
+ */
+struct task_struct *find_lock_task_mm(struct task_struct *p)
+{
+	struct task_struct *t;
+
+	rcu_read_lock();
+
+	for_each_thread(p, t) {
+		task_lock(t);
+		if (likely(t->mm))
+			goto found;	/* leave with task_lock(t) still held */
+		task_unlock(t);
+	}
+	t = NULL;
+found:
+	rcu_read_unlock();
+
+	return t;
+}
+
+/*
+ * return true if the task is not adequate as candidate victim task.
+ *
+ * A task is rejected when it is the global init, when it is a kernel thread
+ * (PF_KTHREAD), when @memcg is given and the task is not a member of it, or
+ * when has_intersects_mems_allowed() finds no overlap for @nodemask.
+ */
+static bool oom_unkillable_task(struct task_struct *p,
+		struct mem_cgroup *memcg, const nodemask_t *nodemask)
+{
+	if (is_global_init(p))
+		return true;
+	if (p->flags & PF_KTHREAD)
+		return true;
+
+	/* When mem_cgroup_out_of_memory() and p is not member of the group */
+	if (memcg && !task_in_mem_cgroup(p, memcg))
+		return true;
+
+	/* p may not have freeable memory in nodemask */
+	if (!has_intersects_mems_allowed(p, nodemask))
+		return true;
+
+	return false;
+}
+
+/**
+ * oom_badness - heuristic function to determine which candidate task to kill
+ * @p: task struct of which task we should calculate
+ * @memcg: if non-NULL, tasks that are not members of it score 0
+ * @nodemask: passed on to oom_unkillable_task(), may be NULL
+ * @totalpages: total present RAM allowed for page allocation
+ *
+ * The heuristic for determining which task to kill is made to be as simple and
+ * predictable as possible.  The goal is to return the highest value for the
+ * task consuming the most memory to avoid subsequent oom failures.
+ *
+ * Return: 0 for a task that must not be killed (unkillable, no thread with an
+ * mm left, or oom_score_adj == OOM_SCORE_ADJ_MIN), otherwise a score of at
+ * least 1.
+ */
+unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
+			  const nodemask_t *nodemask, unsigned long totalpages)
+{
+	long points;
+	long adj;
+
+	if (oom_unkillable_task(p, memcg, nodemask))
+		return 0;
+
+	p = find_lock_task_mm(p);	/* on success task_lock(p) is held */
+	if (!p)
+		return 0;
+
+	adj = (long)p->signal->oom_score_adj;
+	if (adj == OOM_SCORE_ADJ_MIN) {
+		task_unlock(p);
+		return 0;
+	}
+
+	/*
+	 * The baseline for the badness score is the proportion of RAM that each
+	 * task's rss, pagetable and swap space use.
+	 */
+	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
+		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
+	task_unlock(p);
+
+	/*
+	 * Root processes get 3% bonus, just like the __vm_enough_memory()
+	 * implementation used by LSMs.
+	 */
+	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
+		points -= (points * 3) / 100;
+
+	/* Normalize to oom_score_adj units */
+	adj *= totalpages / 1000;
+	points += adj;
+
+	/*
+	 * Never return 0 for an eligible task regardless of the root bonus and
+	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
+	 */
+	return points > 0 ? points : 1;
+}
+
+/*
+ * Determine the type of allocation constraint.
+ *
+ * *totalpages is always written: RAM plus swap by default, or swap plus the
+ * spanned pages of the permitted nodes when a mempolicy or cpuset constraint
+ * is reported.
+ */
+#ifdef CONFIG_NUMA
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+				gfp_t gfp_mask, nodemask_t *nodemask,
+				unsigned long *totalpages)
+{
+	struct zone *zone;
+	struct zoneref *z;
+	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	bool cpuset_limited = false;
+	int nid;
+
+	/* Default to all available memory */
+	*totalpages = totalram_pages + total_swap_pages;
+
+	if (!zonelist)
+		return CONSTRAINT_NONE;
+	/*
+	 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
+	 * killing current. We have to kill a random task in this case.
+	 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
+	 */
+	if (gfp_mask & __GFP_THISNODE)
+		return CONSTRAINT_NONE;
+
+	/*
+	 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
+	 * the page allocator means a mempolicy is in effect.  Cpuset policy
+	 * is enforced in get_page_from_freelist().
+	 */
+	if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
+		*totalpages = total_swap_pages;
+		for_each_node_mask(nid, *nodemask)
+			*totalpages += node_spanned_pages(nid);
+		return CONSTRAINT_MEMORY_POLICY;
+	}
+
+	/* Check this allocation failure is caused by cpuset's wall function */
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+			high_zoneidx, nodemask)
+		if (!cpuset_zone_allowed(zone, gfp_mask))
+			cpuset_limited = true;
+
+	if (cpuset_limited) {
+		*totalpages = total_swap_pages;
+		for_each_node_mask(nid, cpuset_current_mems_allowed)
+			*totalpages += node_spanned_pages(nid);
+		return CONSTRAINT_CPUSET;
+	}
+	return CONSTRAINT_NONE;
+}
+#else
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+				gfp_t gfp_mask, nodemask_t *nodemask,
+				unsigned long *totalpages)
+{
+	*totalpages = totalram_pages + total_swap_pages;
+	return CONSTRAINT_NONE;
+}
+#endif
+/*
+ * Classify @task for a selection loop.  OOM_SCAN_CONTINUE: not a candidate
+ * (unkillable, or it has no mm).  OOM_SCAN_ABORT: @task is already an OOM
+ * victim (TIF_MEMDIE) or is about to free its memory, unless @force_kill is
+ * set.  OOM_SCAN_SELECT: @task is an oom_task_origin() task.  OOM_SCAN_OK:
+ * none of the above applies.
+ */
+enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
+		unsigned long totalpages, const nodemask_t *nodemask,
+		bool force_kill)
+{
+	if (oom_unkillable_task(task, NULL, nodemask))
+		return OOM_SCAN_CONTINUE;
+
+	/*
+	 * This task already has access to memory reserves and is being killed.
+	 * Don't allow any other task to have access to the reserves.
+	 */
+	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
+		if (!force_kill)
+			return OOM_SCAN_ABORT;
+	}
+	if (!task->mm)
+		return OOM_SCAN_CONTINUE;
+
+	/*
+	 * If task is allocating a lot of memory and has been marked to be
+	 * killed first if it triggers an oom, then select it.
+	 */
+	if (oom_task_origin(task))
+		return OOM_SCAN_SELECT;
+
+	if (task_will_free_mem(task) && !force_kill)
+		return OOM_SCAN_ABORT;
+
+	return OOM_SCAN_OK;
+}
+
+/*
+ * Simple selection loop. We chose the process with the highest
+ * number of 'points'.  Returns -1 on scan abort.
+ * + * (not docbooked, we don't want this one cluttering up the manual) + */ +static struct task_struct *select_bad_process(unsigned int *ppoints, + unsigned long totalpages, const nodemask_t *nodemask, + bool force_kill) +{ + struct task_struct *g, *p; + struct task_struct *chosen = NULL; + unsigned long chosen_points = 0; + + rcu_read_lock(); + for_each_process_thread(g, p) { + unsigned int points; + + switch (oom_scan_process_thread(p, totalpages, nodemask, + force_kill)) { + case OOM_SCAN_SELECT: + chosen = p; + chosen_points = ULONG_MAX; + /* fall through */ + case OOM_SCAN_CONTINUE: + continue; + case OOM_SCAN_ABORT: + rcu_read_unlock(); + return (struct task_struct *)(-1UL); + case OOM_SCAN_OK: + break; + }; + points = oom_badness(p, NULL, nodemask, totalpages); + if (!points || points < chosen_points) + continue; + /* Prefer thread group leaders for display purposes */ + if (points == chosen_points && thread_group_leader(chosen)) + continue; + + chosen = p; + chosen_points = points; + } + if (chosen) + get_task_struct(chosen); + rcu_read_unlock(); + + *ppoints = chosen_points * 1000 / totalpages; + return chosen; +} + +/** + * dump_tasks - dump current memory state of all system tasks + * @memcg: current's memory controller, if constrained + * @nodemask: nodemask passed to page allocator for mempolicy ooms + * + * Dumps the current memory state of all eligible tasks. Tasks not in the same + * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes + * are not shown. + * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, + * swapents, oom_score_adj value, and name. 
+ *
+ * One line is printed per process, for the thread returned by
+ * find_lock_task_mm(); processes without such a thread are skipped.
+ */
+static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
+{
+	struct task_struct *p;
+	struct task_struct *task;
+
+	pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
+	rcu_read_lock();
+	for_each_process(p) {
+		if (oom_unkillable_task(p, memcg, nodemask))
+			continue;
+
+		task = find_lock_task_mm(p);
+		if (!task) {
+			/*
+			 * This is a kthread or all of p's threads have already
+			 * detached their mm's.  There's no need to report
+			 * them; they can't be oom killed anyway.
+			 */
+			continue;
+		}
+
+		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
+			task->pid, from_kuid(&init_user_ns, task_uid(task)),
+			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+			atomic_long_read(&task->mm->nr_ptes),
+			mm_nr_pmds(task->mm),
+			get_mm_counter(task->mm, MM_SWAPENTS),
+			task->signal->oom_score_adj, task->comm);
+		task_unlock(task);	/* taken by find_lock_task_mm() */
+	}
+	rcu_read_unlock();
+}
+/*
+ * Print the OOM report: the task that invoked the OOM killer (read under
+ * task_lock(current)), a stack dump, then either the memcg OOM info for
+ * @memcg or show_mem(), and, if sysctl_oom_dump_tasks is set, the task list
+ * from dump_tasks().  @p is only handed to mem_cgroup_print_oom_info().
+ */
+static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
+			struct mem_cgroup *memcg, const nodemask_t *nodemask)
+{
+	task_lock(current);
+	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
+		"oom_score_adj=%hd\n",
+		current->comm, gfp_mask, order,
+		current->signal->oom_score_adj);
+	cpuset_print_task_mems_allowed(current);
+	task_unlock(current);
+	dump_stack();
+	if (memcg)
+		mem_cgroup_print_oom_info(memcg, p);
+	else
+		show_mem(SHOW_MEM_FILTER_NODES);
+	if (sysctl_oom_dump_tasks)
+		dump_tasks(memcg, nodemask);
+}
+
+/*
+ * Number of OOM victims in flight
+ *
+ * Incremented by mark_tsk_oom_victim() and decremented by
+ * unmark_oom_victim(); oom_killer_disable() sleeps on oom_victims_wait until
+ * the count reaches zero.
+ */
+static atomic_t oom_victims = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
+
+bool oom_killer_disabled __read_mostly;	/* written under oom_sem held for write */
+static DECLARE_RWSEM(oom_sem);
+
+/**
+ * mark_tsk_oom_victim - marks the given task as OOM victim.
+ * @tsk: task to mark
+ *
+ * Has to be called with oom_sem taken for read and never after
+ * oom has been disabled already.
+ *
+ * Sets TIF_MEMDIE on @tsk.  If the flag was already set nothing else is
+ * done, so oom_victims is incremented only once per victim.
+ */
+void mark_tsk_oom_victim(struct task_struct *tsk)
+{
+	WARN_ON(oom_killer_disabled);
+	/* OOM killer might race with memcg OOM */
+	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
+		return;
+	/*
+	 * Make sure that the task is woken up from uninterruptible sleep
+	 * if it is frozen because OOM killer wouldn't be able to free
+	 * any memory and livelock. freezing_slow_path will tell the freezer
+	 * that TIF_MEMDIE tasks should be ignored.
+	 */
+	__thaw_task(tsk);
+	atomic_inc(&oom_victims);
+}
+
+/**
+ * unmark_oom_victim - unmarks the current task as OOM victim.
+ *
+ * Wakes up all waiters in oom_killer_disable()
+ *
+ * Does nothing if current does not have TIF_MEMDIE set.
+ */
+void unmark_oom_victim(void)
+{
+	if (!test_and_clear_thread_flag(TIF_MEMDIE))
+		return;
+
+	down_read(&oom_sem);
+	/*
+	 * There is no need to signal the last oom_victim if there
+	 * is nobody who cares.
+	 */
+	if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+		wake_up_all(&oom_victims_wait);
+	up_read(&oom_sem);
+}
+
+/**
+ * oom_killer_disable - disable OOM killer
+ *
+ * Forces all page allocations to fail rather than trigger OOM killer.
+ * Will block and wait until all OOM victims are killed.
+ *
+ * The function cannot be called when there are runnable user tasks because
+ * the userspace would see unexpected allocation failures as a result. Any
+ * new usage of this function should be consulted with MM people.
+ *
+ * Returns true if successful and false if the OOM killer cannot be
+ * disabled.
+ */
+bool oom_killer_disable(void)
+{
+	/*
+	 * Make sure to not race with an ongoing OOM killer
+	 * and that the current is not the victim.
+	 */
+	down_write(&oom_sem);
+	if (test_thread_flag(TIF_MEMDIE)) {
+		up_write(&oom_sem);
+		return false;
+	}
+
+	oom_killer_disabled = true;
+	up_write(&oom_sem);
+
+	/* Woken by unmark_oom_victim() when the last victim is gone */
+	wait_event(oom_victims_wait, !atomic_read(&oom_victims));
+
+	return true;
+}
+
+/**
+ * oom_killer_enable - enable OOM killer
+ */
+void oom_killer_enable(void)
+{
+	down_write(&oom_sem);
+	oom_killer_disabled = false;
+	up_write(&oom_sem);
+}
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+/*
+ * Must be called while holding a reference to p, which will be released upon
+ * returning.
+ *
+ * K() above converts a page count to kB for the messages printed here.
+ * @points is only reported in the first log line and @message prefixes that
+ * line.  The task that is finally killed may be a child of p that has a
+ * different mm, picked by the highest oom_badness() score.
+ */
+void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+		      unsigned int points, unsigned long totalpages,
+		      struct mem_cgroup *memcg, nodemask_t *nodemask,
+		      const char *message)
+{
+	struct task_struct *victim = p;
+	struct task_struct *child;
+	struct task_struct *t;
+	struct mm_struct *mm;
+	unsigned int victim_points = 0;
+	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+					      DEFAULT_RATELIMIT_BURST);
+
+	/*
+	 * If the task is already exiting, don't alarm the sysadmin or kill
+	 * its children or threads, just set TIF_MEMDIE so it can die quickly
+	 */
+	task_lock(p);
+	if (p->mm && task_will_free_mem(p)) {
+		mark_tsk_oom_victim(p);
+		task_unlock(p);
+		put_task_struct(p);
+		return;
+	}
+	task_unlock(p);
+
+	if (__ratelimit(&oom_rs))
+		dump_header(p, gfp_mask, order, memcg, nodemask);
+
+	task_lock(p);
+	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
+		message, task_pid_nr(p), p->comm, points);
+	task_unlock(p);
+
+	/*
+	 * If any of p's children has a different mm and is eligible for kill,
+	 * the one with the highest oom_badness() score is sacrificed for its
+	 * parent.  This attempts to lose the minimal amount of work done while
+	 * still freeing memory.
+	 */
+	read_lock(&tasklist_lock);
+	for_each_thread(p, t) {
+		list_for_each_entry(child, &t->children, sibling) {
+			unsigned int child_points;	/* NOTE(review): oom_badness() returns unsigned long */
+
+			if (child->mm == p->mm)
+				continue;
+			/*
+			 * oom_badness() returns 0 if the thread is unkillable
+			 */
+			child_points = oom_badness(child, memcg, nodemask,
+								totalpages);
+			if (child_points > victim_points) {
+				/* move the held reference to the new victim */
+				put_task_struct(victim);
+				victim = child;
+				victim_points = child_points;
+				get_task_struct(victim);
+			}
+		}
+	}
+	read_unlock(&tasklist_lock);
+
+	p = find_lock_task_mm(victim);
+	if (!p) {
+		put_task_struct(victim);
+		return;
+	} else if (victim != p) {
+		get_task_struct(p);
+		put_task_struct(victim);
+		victim = p;
+	}
+
+	/* mm cannot safely be dereferenced after task_unlock(victim) */
+	mm = victim->mm;
+	mark_tsk_oom_victim(victim);
+	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
+		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
+		K(get_mm_counter(victim->mm, MM_FILEPAGES)));
+	task_unlock(victim);
+
+	/*
+	 * Kill all user processes sharing victim->mm in other thread groups, if
+	 * any.  They don't get access to memory reserves, though, to avoid
+	 * depletion of all memory.  This prevents mm->mmap_sem livelock when an
+	 * oom killed thread cannot exit because it requires the semaphore and
+	 * it's contended by another thread trying to allocate memory itself.
+	 * That thread will now get access to memory reserves since it has a
+	 * pending fatal signal.
+	 */
+	rcu_read_lock();
+	for_each_process(p)
+		if (p->mm == mm && !same_thread_group(p, victim) &&
+		    !(p->flags & PF_KTHREAD)) {
+			if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+				continue;
+
+			task_lock(p);	/* Protect ->comm from prctl() */
+			pr_err("Kill process %d (%s) sharing same memory\n",
+				task_pid_nr(p), p->comm);
+			task_unlock(p);
+			do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+		}
+	rcu_read_unlock();
+
+	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+	put_task_struct(victim);
+}
+#undef K
+
+/*
+ * Determines whether the kernel must panic because of the panic_on_oom sysctl.
+ *
+ * Returns normally when sysctl_panic_on_oom is 0, or when it is not 2 and
+ * @constraint is anything but CONSTRAINT_NONE.  Otherwise the OOM header is
+ * dumped and panic() is called.
+ */
+void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
+			int order, const nodemask_t *nodemask,
+			struct mem_cgroup *memcg)
+{
+	if (likely(!sysctl_panic_on_oom))
+		return;
+	if (sysctl_panic_on_oom != 2) {
+		/*
+		 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
+		 * does not panic for cpuset, mempolicy, or memcg allocation
+		 * failures.
+		 */
+		if (constraint != CONSTRAINT_NONE)
+			return;
+	}
+	dump_header(NULL, gfp_mask, order, memcg, nodemask);
+	panic("Out of memory: %s panic_on_oom is enabled\n",
+		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
+}
+
+static BLOCKING_NOTIFIER_HEAD(oom_notify_list);	/* run by __out_of_memory() before any kill */
+
+int register_oom_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(register_oom_notifier);
+
+int unregister_oom_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_oom_notifier);
+
+/*
+ * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
+ * if a parallel OOM killing is already taking place that includes a zone in
+ * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
+ *
+ * In terms of the bool return type: false means that some zone already had
+ * ZONE_OOM_LOCKED set and nothing was changed; true means that every zone
+ * selected by gfp_zone(gfp_mask) has now been marked.  Both passes run under
+ * zone_scan_lock.
+ */
+bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+	struct zoneref *z;
+	struct zone *zone;
+	bool ret = true;
+
+	spin_lock(&zone_scan_lock);
+	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
+		if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
+			ret = false;
+			goto out;
+		}
+
+	/*
+	 * Lock each zone in the zonelist under zone_scan_lock so a parallel
+	 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
+	 */
+	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
+		set_bit(ZONE_OOM_LOCKED, &zone->flags);
+
+out:
+	spin_unlock(&zone_scan_lock);
+	return ret;
+}
+
+/*
+ * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
+ * allocation attempts with zonelists containing them may now recall the OOM
+ * killer, if necessary.
+ *
+ * The walk uses gfp_zone(gfp_mask) just like oom_zonelist_trylock(), so the
+ * same gfp_mask has to be passed to reach the same set of zones.
+ */
+void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+	struct zoneref *z;
+	struct zone *zone;
+
+	spin_lock(&zone_scan_lock);
+	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
+		clear_bit(ZONE_OOM_LOCKED, &zone->flags);
+	spin_unlock(&zone_scan_lock);
+}
+
+/**
+ * __out_of_memory - kill the "best" process when we run out of memory
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
+ *
+ * If we run out of memory, we have the choice between either
+ * killing a random task (bad), letting the system crash (worse)
+ * OR try to be smart about which process to kill. Note that we
+ * don't have to be perfect here, we just have to be good.
+ *
+ * Both callers in this file hold oom_sem for read.  Nothing is killed when
+ * an OOM notifier reports freed memory, or when current still has an mm and
+ * is already dying; in the latter case current is marked as OOM victim.
+ */
+static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *nodemask, bool force_kill)
+{
+	const nodemask_t *mpol_mask;
+	struct task_struct *p;
+	unsigned long totalpages;
+	unsigned long freed = 0;
+	unsigned int uninitialized_var(points);
+	enum oom_constraint constraint = CONSTRAINT_NONE;
+	int killed = 0;
+
+	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+	if (freed > 0)
+		/* Got some memory back in the last second. */
+		return;
+
+	/*
+	 * If current has a pending SIGKILL or is exiting, then automatically
+	 * select it.  The goal is to allow it to allocate so that it may
+	 * quickly exit and free its memory.
+	 *
+	 * But don't select if current has already released its mm and cleared
+	 * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
+	 */
+	if (current->mm &&
+	    (fatal_signal_pending(current) || task_will_free_mem(current))) {
+		mark_tsk_oom_victim(current);
+		return;
+	}
+
+	/*
+	 * Check if there were limitations on the allocation (only relevant for
+	 * NUMA) that may require different handling.
+	 */
+	constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
+						&totalpages);
+	mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
+	check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
+
+	if (sysctl_oom_kill_allocating_task && current->mm &&
+	    !oom_unkillable_task(current, NULL, nodemask) &&
+	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+		get_task_struct(current);	/* dropped by oom_kill_process() */
+		oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
+				 nodemask,
+				 "Out of memory (oom_kill_allocating_task)");
+		goto out;	/* killed is still 0 here, so the sleep below is skipped */
+	}
+
+	p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+	/* Found nothing?!?! Either we hang forever, or we panic. */
+	if (!p) {
+		dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
+		panic("Out of memory and no killable processes...\n");
+	}
+	/* (void *)-1UL is the scan-abort marker from select_bad_process() */
+	if (p != (void *)-1UL) {
+		oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+				 nodemask, "Out of memory");
+		killed = 1;
+	}
+out:
+	/*
+	 * Give the killed threads a good chance of exiting before trying to
+	 * allocate memory again.
+	 */
+	if (killed)
+		schedule_timeout_killable(1);
+}
+
+/**
+ * out_of_memory - tries to invoke OOM killer.
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
+ *
+ * Invokes __out_of_memory(), with oom_sem held for read, unless the OOM
+ * killer has been disabled by oom_killer_disable().
+ *
+ * Returns true if __out_of_memory() was invoked and false if the OOM killer
+ * is disabled.
+ */
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *nodemask, bool force_kill)
+{
+	bool ret = false;
+
+	down_read(&oom_sem);
+	if (!oom_killer_disabled) {
+		__out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
+		ret = true;
+	}
+	up_read(&oom_sem);
+
+	return ret;
+}
+
+/*
+ * The pagefault handler calls here because it is out of memory, so kill a
+ * memory-hogging task.  If any populated zone has ZONE_OOM_LOCKED set, a
+ * parallel oom killing is already in progress so do nothing.
+ *
+ * Nothing is done here either when mem_cgroup_oom_synchronize(true) returns
+ * non-zero.
+ */
+void pagefault_out_of_memory(void)
+{
+	struct zonelist *zonelist;
+
+	down_read(&oom_sem);
+	if (mem_cgroup_oom_synchronize(true))
+		goto unlock;
+
+	zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
+	if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
+		if (!oom_killer_disabled)
+			__out_of_memory(NULL, 0, 0, NULL, false);
+		else
+			/*
+			 * There shouldn't be any user tasks runnable while the
+			 * OOM killer is disabled so the current task has to
+			 * be a racing OOM victim for which oom_killer_disable()
+			 * is waiting for.
+			 *
+			 * NOTE(review): the WARN_ON() below fires when current
+			 * *is* a TIF_MEMDIE victim, which reads as the opposite
+			 * of the sentence above - confirm the intended polarity.
+			 */
+			WARN_ON(test_thread_flag(TIF_MEMDIE));
+
+		oom_zonelist_unlock(zonelist, GFP_KERNEL);
+	}
+unlock:
+	up_read(&oom_sem);
+}
diff --git a/kernel/mm/page-writeback.c b/kernel/mm/page-writeback.c
new file mode 100644
index 000000000..eb59f7eea
--- /dev/null
+++ b/kernel/mm/page-writeback.c
@@ -0,0 +1,2436 @@
+/*
+ * mm/page-writeback.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * Contains functions related to writing back dirty pages at the
+ * address_space level.
+ *
+ * 10Apr2002	Andrew Morton
+ *		Initial version
+ */
+/*
+ * NOTE(review): the #include directives below carry no header name in this
+ * copy (only "internal.h" survived); the <...> operands appear to have been
+ * stripped when the patch was extracted and must be restored before building.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include /* __set_page_dirty_buffers */
+#include
+#include
+#include
+#include
+#include
+
+#include "internal.h"
+
+/*
+ * Sleep at most 200ms at a time in balance_dirty_pages().
+ * (HZ/5 jiffies, but never less than one jiffy.)
+ */
+#define MAX_PAUSE max(HZ/5, 1)
+
+/*
+ * Try to keep balance_dirty_pages() call intervals higher than this many pages
+ * by raising pause time to max_pause when falls below it.
+ */
+#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
+
+/*
+ * Estimate write bandwidth at 200ms intervals.
+ */
+#define BANDWIDTH_INTERVAL max(HZ/5, 1)
+
+#define RATELIMIT_CALC_SHIFT 10
+
+/*
+ * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
+ * will look to see if it needs to force writeback or throttling.
+ */
+static long ratelimit_pages = 32;
+
+/* The following parameters are exported via /proc/sys/vm */
+
+/*
+ * Start background writeback (via writeback threads) at this percentage
+ */
+int dirty_background_ratio = 10;
+
+/*
+ * dirty_background_bytes starts at 0 (disabled) so that it is a function of
+ * dirty_background_ratio * the amount of dirtyable memory
+ */
+unsigned long dirty_background_bytes;
+
+/*
+ * free highmem will not be subtracted from the total free memory
+ * for calculating free ratios if vm_highmem_is_dirtyable is true
+ */
+int vm_highmem_is_dirtyable;
+
+/*
+ * The generator of dirty data starts writeback at this percentage
+ */
+int vm_dirty_ratio = 20;
+
+/*
+ * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
+ * vm_dirty_ratio * the amount of dirtyable memory
+ */
+unsigned long vm_dirty_bytes;
+
+/*
+ * The interval between `kupdate'-style writebacks
+ * (the default of 5 * 100 centiseconds is 5 seconds)
+ */
+unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
+
+EXPORT_SYMBOL_GPL(dirty_writeback_interval);
+
+/*
+ * The longest time for which data is allowed to remain dirty
+ * (the default of 30 * 100 centiseconds is 30 seconds)
+ */
+unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
+
+/*
+ * Flag that makes the machine dump writes/reads and block dirtyings.
+ */
+int block_dump;
+
+/*
+ * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
+ * a full sync is triggered after this time elapses without any disk activity.
+ */
+int laptop_mode;
+
+EXPORT_SYMBOL(laptop_mode);
+
+/* End of sysctl-exported parameters */
+
+unsigned long global_dirty_limit;	/* lower bound applied by hard_dirty_limit() */
+
+/*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ * NOTE(review): VM_COMPLETIONS_PERIOD_LEN below is defined as (3*HZ), i.e. in
+ * units of HZ rather than completions, so the last sentence above looks
+ * stale - confirm.
+ *
+ */
+static struct fprop_global writeout_completions;
+
+static void writeout_period(unsigned long t);
+/* Timer for aging of writeout_completions */
+static struct timer_list writeout_period_timer =
+		TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
+static unsigned long writeout_period_time = 0;	/* 0: period switching is off */
+
+/*
+ * Length of period for aging writeout fractions of bdis. This is an
+ * arbitrarily chosen number. The longer the period, the slower fractions will
+ * reflect changes in current writeout rate.
+ */
+#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
+
+/*
+ * In a memory zone, there is a certain amount of pages we consider
+ * available for the page cache, which is essentially the number of
+ * free and reclaimable pages, minus some zone reserves to protect
+ * lowmem and the ability to uphold the zone's watermarks without
+ * requiring writeback.
+ *
+ * This number of dirtyable pages is the base value of which the
+ * user-configurable dirty ratio is the effective number of pages that
+ * are allowed to be actually dirtied.  Per individual zone, or
+ * globally by using the sum of dirtyable pages over all zones.
+ *
+ * Because the user is allowed to specify the dirty limit globally as
+ * absolute number of bytes, calculating the per-zone dirty limit can
+ * require translating the configured limit into a percentage of
+ * global dirtyable memory first.
+ */
+
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache.  This is the base value for the per-zone dirty limits.
+ */ +static unsigned long zone_dirtyable_memory(struct zone *zone) +{ + unsigned long nr_pages; + + nr_pages = zone_page_state(zone, NR_FREE_PAGES); + nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + + nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); + nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); + + return nr_pages; +} + +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_node_state(node, N_HIGH_MEMORY) { + struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_dirtyable_memory(z); + } + /* + * Unreclaimable memory (kernel memory or anonymous memory + * without swap) can bring down the dirtyable pages below + * the zone's dirty balance reserve and the above calculation + * will underflow. However we still want to add in nodes + * which are below threshold (negative values) to get a more + * accurate calculation but make sure that the total never + * underflows. + */ + if ((long)x < 0) + x = 0; + + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +/** + * global_dirtyable_memory - number of globally dirtyable pages + * + * Returns the global number of pages potentially available for dirty + * page cache. This is the base value for the global dirty limits. 
+ */
+static unsigned long global_dirtyable_memory(void)
+{
+	unsigned long pages = global_page_state(NR_FREE_PAGES);
+
+	/* Keep the dirty balance reserve out of the count, clamping at 0. */
+	pages -= min(pages, dirty_balance_reserve);
+
+	/* Inactive and active file pages are dirtyable as well. */
+	pages += global_page_state(NR_INACTIVE_FILE);
+	pages += global_page_state(NR_ACTIVE_FILE);
+
+	if (!vm_highmem_is_dirtyable)
+		pages -= highmem_dirtyable_memory(pages);
+
+	/* The extra page ensures that we never return 0. */
+	return pages + 1;
+}
+
+/*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Both thresholds are derived from the sysctl parameters
+ * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
+ * - vm.dirty_ratio             or  vm.dirty_bytes
+ * where a non-zero *_bytes value takes precedence over the ratio.  The
+ * background threshold is forced to half the dirty threshold if it would
+ * otherwise not be below it.  For PF_LESS_THROTTLE (ie. nfsd) and real-time
+ * tasks both limits are lifted by 1/4.
+ *
+ * The results, in pages, are stored in *pbackground and *pdirty.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+	const unsigned long avail = global_dirtyable_memory();
+	struct task_struct *me = current;
+	unsigned long dirty_thresh;
+	unsigned long bg_thresh;
+
+	dirty_thresh = vm_dirty_bytes ?
+			DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) :
+			(vm_dirty_ratio * avail) / 100;
+
+	bg_thresh = dirty_background_bytes ?
+			DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) :
+			(dirty_background_ratio * avail) / 100;
+
+	/* Background writeback has to kick in before throttling does. */
+	if (bg_thresh >= dirty_thresh)
+		bg_thresh = dirty_thresh / 2;
+
+	if ((me->flags & PF_LESS_THROTTLE) || rt_task(me)) {
+		bg_thresh += bg_thresh / 4;
+		dirty_thresh += dirty_thresh / 4;
+	}
+
+	*pbackground = bg_thresh;
+	*pdirty = dirty_thresh;
+	trace_global_dirty_state(bg_thresh, dirty_thresh);
+}
+
+/**
+ * zone_dirty_limit - maximum number of dirty pages allowed in a zone
+ * @zone: the zone
+ *
+ * Returns the maximum number of dirty pages allowed in a zone, based
+ * on the zone's dirtyable memory.
+ */ +static unsigned long zone_dirty_limit(struct zone *zone) +{ + unsigned long zone_memory = zone_dirtyable_memory(zone); + struct task_struct *tsk = current; + unsigned long dirty; + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * + zone_memory / global_dirtyable_memory(); + else + dirty = vm_dirty_ratio * zone_memory / 100; + + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + dirty += dirty / 4; + + return dirty; +} + +/** + * zone_dirty_ok - tells whether a zone is within its dirty limits + * @zone: the zone to check + * + * Returns %true when the dirty pages in @zone are within the zone's + * dirty limit, %false if the limit is exceeded. + */ +bool zone_dirty_ok(struct zone *zone) +{ + unsigned long limit = zone_dirty_limit(zone); + + return zone_page_state(zone, NR_FILE_DIRTY) + + zone_page_state(zone, NR_UNSTABLE_NFS) + + zone_page_state(zone, NR_WRITEBACK) <= limit; +} + +int dirty_background_ratio_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_bytes = 0; + return ret; +} + +int dirty_background_bytes_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_ratio = 0; + return ret; +} + +int dirty_ratio_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_ratio = vm_dirty_ratio; + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_ratio != old_ratio) { + writeback_set_ratelimit(); + vm_dirty_bytes = 0; + } + return ret; +} + +int dirty_bytes_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + unsigned long old_bytes = 
vm_dirty_bytes; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + writeback_set_ratelimit(); + vm_dirty_ratio = 0; + } + return ret; +} + +static unsigned long wp_next_time(unsigned long cur_time) +{ + cur_time += VM_COMPLETIONS_PERIOD_LEN; + /* 0 has a special meaning... */ + if (!cur_time) + return 1; + return cur_time; +} + +/* + * Increment the BDI's writeout completion count and the global writeout + * completion count. Called from test_clear_page_writeback(). + */ +static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) +{ + __inc_bdi_stat(bdi, BDI_WRITTEN); + __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, + bdi->max_prop_frac); + /* First event after period switching was turned off? */ + if (!unlikely(writeout_period_time)) { + /* + * We can race with other __bdi_writeout_inc calls here but + * it does not cause any harm since the resulting time when + * timer will fire and what is in writeout_period_time will be + * roughly the same. + */ + writeout_period_time = wp_next_time(jiffies); + mod_timer(&writeout_period_timer, writeout_period_time); + } +} + +void bdi_writeout_inc(struct backing_dev_info *bdi) +{ + unsigned long flags; + + local_irq_save(flags); + __bdi_writeout_inc(bdi); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(bdi_writeout_inc); + +/* + * Obtain an accurate fraction of the BDI's portion. + */ +static void bdi_writeout_fraction(struct backing_dev_info *bdi, + long *numerator, long *denominator) +{ + fprop_fraction_percpu(&writeout_completions, &bdi->completions, + numerator, denominator); +} + +/* + * On idle system, we can be called long after we scheduled because we use + * deferred timers so count with missed periods. 
+ */ +static void writeout_period(unsigned long t) +{ + int miss_periods = (jiffies - writeout_period_time) / + VM_COMPLETIONS_PERIOD_LEN; + + if (fprop_new_period(&writeout_completions, miss_periods + 1)) { + writeout_period_time = wp_next_time(writeout_period_time + + miss_periods * VM_COMPLETIONS_PERIOD_LEN); + mod_timer(&writeout_period_timer, writeout_period_time); + } else { + /* + * Aging has zeroed all fractions. Stop wasting CPU on period + * updates. + */ + writeout_period_time = 0; + } +} + +/* + * bdi_min_ratio keeps the sum of the minimum dirty shares of all + * registered backing devices, which, for obvious reasons, can not + * exceed 100%. + */ +static unsigned int bdi_min_ratio; + +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + int ret = 0; + + spin_lock_bh(&bdi_lock); + if (min_ratio > bdi->max_ratio) { + ret = -EINVAL; + } else { + min_ratio -= bdi->min_ratio; + if (bdi_min_ratio + min_ratio < 100) { + bdi_min_ratio += min_ratio; + bdi->min_ratio += min_ratio; + } else { + ret = -EINVAL; + } + } + spin_unlock_bh(&bdi_lock); + + return ret; +} + +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) +{ + int ret = 0; + + if (max_ratio > 100) + return -EINVAL; + + spin_lock_bh(&bdi_lock); + if (bdi->min_ratio > max_ratio) { + ret = -EINVAL; + } else { + bdi->max_ratio = max_ratio; + bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; + } + spin_unlock_bh(&bdi_lock); + + return ret; +} +EXPORT_SYMBOL(bdi_set_max_ratio); + +static unsigned long dirty_freerun_ceiling(unsigned long thresh, + unsigned long bg_thresh) +{ + return (thresh + bg_thresh) / 2; +} + +static unsigned long hard_dirty_limit(unsigned long thresh) +{ + return max(thresh, global_dirty_limit); +} + +/** + * bdi_dirty_limit - @bdi's share of dirty throttling threshold + * @bdi: the backing_dev_info to query + * @dirty: global dirty limit in pages + * + * Returns @bdi's dirty limit in pages. 
The term "dirty" in the context of + * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * + * Note that balance_dirty_pages() will only seriously take it as a hard limit + * when sleeping max_pause per page is not enough to keep the dirty pages under + * control. For example, when the device is completely stalled due to some error + * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. + * In the other normal situations, it acts more gently by throttling the tasks + * more (rather than completely block them) when the bdi dirty pages go high. + * + * It allocates high/low dirty limits to fast/slow devices, in order to prevent + * - starving fast devices + * - piling up dirty pages (that will take long time to sync) on slow devices + * + * The bdi's share of dirty limit will be adapting to its throughput and + * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + */ +unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) +{ + u64 bdi_dirty; + long numerator, denominator; + + /* + * Calculate this BDI's share of the dirty ratio. 
+ */ + bdi_writeout_fraction(bdi, &numerator, &denominator); + + bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; + bdi_dirty *= numerator; + do_div(bdi_dirty, denominator); + + bdi_dirty += (dirty * bdi->min_ratio) / 100; + if (bdi_dirty > (dirty * bdi->max_ratio) / 100) + bdi_dirty = dirty * bdi->max_ratio / 100; + + return bdi_dirty; +} + +/* + * setpoint - dirty 3 + * f(dirty) := 1.0 + (----------------) + * limit - setpoint + * + * it's a 3rd order polynomial that subjects to + * + * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast + * (2) f(setpoint) = 1.0 => the balance point + * (3) f(limit) = 0 => the hard limit + * (4) df/dx <= 0 => negative feedback control + * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) + * => fast response on large errors; small oscillation near setpoint + */ +static long long pos_ratio_polynom(unsigned long setpoint, + unsigned long dirty, + unsigned long limit) +{ + long long pos_ratio; + long x; + + x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + (limit - setpoint) | 1); + pos_ratio = x; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + + return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); +} + +/* + * Dirty position control. + * + * (o) global/bdi setpoints + * + * We want the dirty pages be balanced around the global/bdi setpoints. + * When the number of dirty pages is higher/lower than the setpoint, the + * dirty position control ratio (and hence task dirty ratelimit) will be + * decreased/increased to bring the dirty pages back to the setpoint. 
+ * + * pos_ratio = 1 << RATELIMIT_CALC_SHIFT + * + * if (dirty < setpoint) scale up pos_ratio + * if (dirty > setpoint) scale down pos_ratio + * + * if (bdi_dirty < bdi_setpoint) scale up pos_ratio + * if (bdi_dirty > bdi_setpoint) scale down pos_ratio + * + * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT + * + * (o) global control line + * + * ^ pos_ratio + * | + * | |<===== global dirty control scope ======>| + * 2.0 .............* + * | .* + * | . * + * | . * + * | . * + * | . * + * | . * + * 1.0 ................................* + * | . . * + * | . . * + * | . . * + * | . . * + * | . . * + * 0 +------------.------------------.----------------------*-------------> + * freerun^ setpoint^ limit^ dirty pages + * + * (o) bdi control line + * + * ^ pos_ratio + * | + * | * + * | * + * | * + * | * + * | * |<=========== span ============>| + * 1.0 .......................* + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * 1/4 ...............................................* * * * * * * * * * * * + * | . . + * | . . + * | . . + * 0 +----------------------.-------------------------------.-------------> + * bdi_setpoint^ x_intercept^ + * + * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can + * be smoothly throttled down to normal if it starts high in situations like + * - start writing to a slow SD card and a fast disk at the same time. The SD + * card's bdi_dirty may rush to many times higher than bdi_setpoint. 
+ * - the bdi dirty thresh drops quickly due to change of JBOD workload + */ +static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty) +{ + unsigned long write_bw = bdi->avg_write_bandwidth; + unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); + unsigned long limit = hard_dirty_limit(thresh); + unsigned long x_intercept; + unsigned long setpoint; /* dirty pages' target balance point */ + unsigned long bdi_setpoint; + unsigned long span; + long long pos_ratio; /* for scaling up/down the rate limit */ + long x; + + if (unlikely(dirty >= limit)) + return 0; + + /* + * global setpoint + * + * See comment for pos_ratio_polynom(). + */ + setpoint = (freerun + limit) / 2; + pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); + + /* + * The strictlimit feature is a tool preventing mistrusted filesystems + * from growing a large number of dirty pages before throttling. For + * such filesystems balance_dirty_pages always checks bdi counters + * against bdi limits. Even if global "nr_dirty" is under "freerun". + * This is especially important for fuse which sets bdi->max_ratio to + * 1% by default. Without strictlimit feature, fuse writeback may + * consume arbitrary amount of RAM because it is accounted in + * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". + * + * Here, in bdi_position_ratio(), we calculate pos_ratio based on + * two values: bdi_dirty and bdi_thresh. Let's consider an example: + * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global + * limits are set by default to 10% and 20% (background and throttle). + * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. + * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is + * about ~6K pages (as the average of background and throttle bdi + * limits). 
The 3rd order polynomial will provide positive feedback if + * bdi_dirty is under bdi_setpoint and vice versa. + * + * Note, that we cannot use global counters in these calculations + * because we want to throttle process writing to a strictlimit BDI + * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB + * in the example above). + */ + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + long long bdi_pos_ratio; + unsigned long bdi_bg_thresh; + + if (bdi_dirty < 8) + return min_t(long long, pos_ratio * 2, + 2 << RATELIMIT_CALC_SHIFT); + + if (bdi_dirty >= bdi_thresh) + return 0; + + bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); + bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, + bdi_bg_thresh); + + if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) + return 0; + + bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, + bdi_thresh); + + /* + * Typically, for strictlimit case, bdi_setpoint << setpoint + * and pos_ratio >> bdi_pos_ratio. In the other words global + * state ("dirty") is not limiting factor and we have to + * make decision based on bdi counters. But there is an + * important case when global pos_ratio should get precedence: + * global limits are exceeded (e.g. due to activities on other + * BDIs) while given strictlimit BDI is below limit. + * + * "pos_ratio * bdi_pos_ratio" would work for the case above, + * but it would look too non-natural for the case of all + * activity in the system coming from a single strictlimit BDI + * with bdi->max_ratio == 100%. + * + * Note that min() below somewhat changes the dynamics of the + * control system. Normally, pos_ratio value can be well over 3 + * (when globally we are at freerun and bdi is well below bdi + * setpoint). Now the maximum pos_ratio in the same situation + * is 2. We might want to tweak this if we observe the control + * system is too slow to adapt. 
+ */ + return min(pos_ratio, bdi_pos_ratio); + } + + /* + * We have computed basic pos_ratio above based on global situation. If + * the bdi is over/under its share of dirty pages, we want to scale + * pos_ratio further down/up. That is done by the following mechanism. + */ + + /* + * bdi setpoint + * + * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint) + * + * x_intercept - bdi_dirty + * := -------------------------- + * x_intercept - bdi_setpoint + * + * The main bdi control line is a linear function that subjects to + * + * (1) f(bdi_setpoint) = 1.0 + * (2) k = - 1 / (8 * write_bw) (in single bdi case) + * or equally: x_intercept = bdi_setpoint + 8 * write_bw + * + * For single bdi case, the dirty pages are observed to fluctuate + * regularly within range + * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2] + * for various filesystems, where (2) can yield in a reasonable 12.5% + * fluctuation range for pos_ratio. + * + * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its + * own size, so move the slope over accordingly and choose a slope that + * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh. + */ + if (unlikely(bdi_thresh > thresh)) + bdi_thresh = thresh; + /* + * It's very possible that bdi_thresh is close to 0 not because the + * device is slow, but that it has remained inactive for long time. + * Honour such devices a reasonable good (hopefully IO efficient) + * threshold, so that the occasional writes won't be blocked and active + * writes can rampup the threshold quickly. + */ + bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); + /* + * scale global setpoint to bdi's: + * bdi_setpoint = setpoint * bdi_thresh / thresh + */ + x = div_u64((u64)bdi_thresh << 16, thresh | 1); + bdi_setpoint = setpoint * (u64)x >> 16; + /* + * Use span=(8*write_bw) in single bdi case as indicated by + * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case. 
+ * + * bdi_thresh thresh - bdi_thresh + * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh + * thresh thresh + */ + span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16; + x_intercept = bdi_setpoint + span; + + if (bdi_dirty < x_intercept - span / 4) { + pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), + (x_intercept - bdi_setpoint) | 1); + } else + pos_ratio /= 4; + + /* + * bdi reserve area, safeguard against dirty pool underrun and disk idle + * It may push the desired control point of global dirty pages higher + * than setpoint. + */ + x_intercept = bdi_thresh / 2; + if (bdi_dirty < x_intercept) { + if (bdi_dirty > x_intercept / 8) + pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty); + else + pos_ratio *= 8; + } + + return pos_ratio; +} + +static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, + unsigned long elapsed, + unsigned long written) +{ + const unsigned long period = roundup_pow_of_two(3 * HZ); + unsigned long avg = bdi->avg_write_bandwidth; + unsigned long old = bdi->write_bandwidth; + u64 bw; + + /* + * bw = written * HZ / elapsed + * + * bw * elapsed + write_bandwidth * (period - elapsed) + * write_bandwidth = --------------------------------------------------- + * period + * + * @written may have decreased due to account_page_redirty(). + * Avoid underflowing @bw calculation. 
+ */ + bw = written - min(written, bdi->written_stamp); + bw *= HZ; + if (unlikely(elapsed > period)) { + do_div(bw, elapsed); + avg = bw; + goto out; + } + bw += (u64)bdi->write_bandwidth * (period - elapsed); + bw >>= ilog2(period); + + /* + * one more level of smoothing, for filtering out sudden spikes + */ + if (avg > old && old >= (unsigned long)bw) + avg -= (avg - old) >> 3; + + if (avg < old && old <= (unsigned long)bw) + avg += (old - avg) >> 3; + +out: + bdi->write_bandwidth = bw; + bdi->avg_write_bandwidth = avg; +} + +/* + * The global dirtyable memory and dirty threshold could be suddenly knocked + * down by a large amount (eg. on the startup of KVM in a swapless system). + * This may throw the system into deep dirty exceeded state and throttle + * heavy/light dirtiers alike. To retain good responsiveness, maintain + * global_dirty_limit for tracking slowly down to the knocked down dirty + * threshold. + */ +static void update_dirty_limit(unsigned long thresh, unsigned long dirty) +{ + unsigned long limit = global_dirty_limit; + + /* + * Follow up in one step. + */ + if (limit < thresh) { + limit = thresh; + goto update; + } + + /* + * Follow down slowly. Use the higher one as the target, because thresh + * may drop below dirty. This is exactly the reason to introduce + * global_dirty_limit which is guaranteed to lie above the dirty pages. 
+ */ + thresh = max(thresh, dirty); + if (limit > thresh) { + limit -= (limit - thresh) >> 5; + goto update; + } + return; +update: + global_dirty_limit = limit; +} + +static void global_update_bandwidth(unsigned long thresh, + unsigned long dirty, + unsigned long now) +{ + static DEFINE_SPINLOCK(dirty_lock); + static unsigned long update_time = INITIAL_JIFFIES; + + /* + * check locklessly first to optimize away locking for the most time + */ + if (time_before(now, update_time + BANDWIDTH_INTERVAL)) + return; + + spin_lock(&dirty_lock); + if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { + update_dirty_limit(thresh, dirty); + update_time = now; + } + spin_unlock(&dirty_lock); +} + +/* + * Maintain bdi->dirty_ratelimit, the base dirty throttle rate. + * + * Normal bdi tasks will be curbed at or below it in long term. + * Obviously it should be around (write_bw / N) when there are N dd tasks. + */ +static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long dirtied, + unsigned long elapsed) +{ + unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); + unsigned long limit = hard_dirty_limit(thresh); + unsigned long setpoint = (freerun + limit) / 2; + unsigned long write_bw = bdi->avg_write_bandwidth; + unsigned long dirty_ratelimit = bdi->dirty_ratelimit; + unsigned long dirty_rate; + unsigned long task_ratelimit; + unsigned long balanced_dirty_ratelimit; + unsigned long pos_ratio; + unsigned long step; + unsigned long x; + + /* + * The dirty rate will match the writeout rate in long term, except + * when dirty pages are truncated by userspace or re-dirtied by FS. + */ + dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; + + pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty, + bdi_thresh, bdi_dirty); + /* + * task_ratelimit reflects each dd's dirty rate for the past 200ms. 
+ */ + task_ratelimit = (u64)dirty_ratelimit * + pos_ratio >> RATELIMIT_CALC_SHIFT; + task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ + + /* + * A linear estimation of the "balanced" throttle rate. The theory is, + * if there are N dd tasks, each throttled at task_ratelimit, the bdi's + * dirty_rate will be measured to be (N * task_ratelimit). So the below + * formula will yield the balanced rate limit (write_bw / N). + * + * Note that the expanded form is not a pure rate feedback: + * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) + * but also takes pos_ratio into account: + * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) + * + * (1) is not realistic because pos_ratio also takes part in balancing + * the dirty rate. Consider the state + * pos_ratio = 0.5 (3) + * rate = 2 * (write_bw / N) (4) + * If (1) is used, it will stuck in that state! Because each dd will + * be throttled at + * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) + * yielding + * dirty_rate = N * task_ratelimit = write_bw (6) + * put (6) into (1) we get + * rate_(i+1) = rate_(i) (7) + * + * So we end up using (2) to always keep + * rate_(i+1) ~= (write_bw / N) (8) + * regardless of the value of pos_ratio. As long as (8) is satisfied, + * pos_ratio is able to drive itself to 1.0, which is not only where + * the dirty count meet the setpoint, but also where the slope of + * pos_ratio is most flat and hence task_ratelimit is least fluctuated. 
+ */ + balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, + dirty_rate | 1); + /* + * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw + */ + if (unlikely(balanced_dirty_ratelimit > write_bw)) + balanced_dirty_ratelimit = write_bw; + + /* + * We could safely do this and return immediately: + * + * bdi->dirty_ratelimit = balanced_dirty_ratelimit; + * + * However to get a more stable dirty_ratelimit, the below elaborated + * code makes use of task_ratelimit to filter out singular points and + * limit the step size. + * + * The below code essentially only uses the relative value of + * + * task_ratelimit - dirty_ratelimit + * = (pos_ratio - 1) * dirty_ratelimit + * + * which reflects the direction and size of dirty position error. + */ + + /* + * dirty_ratelimit will follow balanced_dirty_ratelimit iff + * task_ratelimit is on the same side of dirty_ratelimit, too. + * For example, when + * - dirty_ratelimit > balanced_dirty_ratelimit + * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) + * lowering dirty_ratelimit will help meet both the position and rate + * control targets. Otherwise, don't update dirty_ratelimit if it will + * only help meet the rate target. After all, what the users ultimately + * feel and care are stable dirty rate and small position error. + * + * |task_ratelimit - dirty_ratelimit| is used to limit the step size + * and filter out the singular points of balanced_dirty_ratelimit. Which + * keeps jumping around randomly and can even leap far away at times + * due to the small 200ms estimation period of dirty_rate (we want to + * keep that period small to reduce time lags). + */ + step = 0; + + /* + * For strictlimit case, calculations above were based on bdi counters + * and limits (starting from pos_ratio = bdi_position_ratio() and up to + * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). 
+ * Hence, to calculate "step" properly, we have to use bdi_dirty as + * "dirty" and bdi_setpoint as "setpoint". + * + * We rampup dirty_ratelimit forcibly if bdi_dirty is low because + * it's possible that bdi_thresh is close to zero due to inactivity + * of backing device (see the implementation of bdi_dirty_limit()). + */ + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + dirty = bdi_dirty; + if (bdi_dirty < 8) + setpoint = bdi_dirty + 1; + else + setpoint = (bdi_thresh + + bdi_dirty_limit(bdi, bg_thresh)) / 2; + } + + if (dirty < setpoint) { + x = min3(bdi->balanced_dirty_ratelimit, + balanced_dirty_ratelimit, task_ratelimit); + if (dirty_ratelimit < x) + step = x - dirty_ratelimit; + } else { + x = max3(bdi->balanced_dirty_ratelimit, + balanced_dirty_ratelimit, task_ratelimit); + if (dirty_ratelimit > x) + step = dirty_ratelimit - x; + } + + /* + * Don't pursue 100% rate matching. It's impossible since the balanced + * rate itself is constantly fluctuating. So decrease the track speed + * when it gets close to the target. Helps eliminate pointless tremors. + */ + step >>= dirty_ratelimit / (2 * step + 1); + /* + * Limit the tracking speed to avoid overshooting. + */ + step = (step + 7) / 8; + + if (dirty_ratelimit < balanced_dirty_ratelimit) + dirty_ratelimit += step; + else + dirty_ratelimit -= step; + + bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL); + bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit; + + trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit); +} + +void __bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long start_time) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - bdi->bw_time_stamp; + unsigned long dirtied; + unsigned long written; + + /* + * rate-limit, only update once every 200ms. 
+ */ + if (elapsed < BANDWIDTH_INTERVAL) + return; + + dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); + written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + + /* + * Skip quiet periods when disk bandwidth is under-utilized. + * (at least 1s idle time between two flusher runs) + */ + if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) + goto snapshot; + + if (thresh) { + global_update_bandwidth(thresh, dirty, now); + bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty, + bdi_thresh, bdi_dirty, + dirtied, elapsed); + } + bdi_update_write_bandwidth(bdi, elapsed, written); + +snapshot: + bdi->dirtied_stamp = dirtied; + bdi->written_stamp = written; + bdi->bw_time_stamp = now; +} + +static void bdi_update_bandwidth(struct backing_dev_info *bdi, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long start_time) +{ + if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) + return; + spin_lock(&bdi->wb.list_lock); + __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty, + bdi_thresh, bdi_dirty, start_time); + spin_unlock(&bdi->wb.list_lock); +} + +/* + * After a task dirtied this many pages, balance_dirty_pages_ratelimited() + * will look to see if it needs to start dirty throttling. + * + * If dirty_poll_interval is too low, big NUMA machines will call the expensive + * global_page_state() too often. So scale it near-sqrt to the safety margin + * (the number of pages we may dirty without exceeding the dirty limits). + */ +static unsigned long dirty_poll_interval(unsigned long dirty, + unsigned long thresh) +{ + if (thresh > dirty) + return 1UL << (ilog2(thresh - dirty) >> 1); + + return 1; +} + +static unsigned long bdi_max_pause(struct backing_dev_info *bdi, + unsigned long bdi_dirty) +{ + unsigned long bw = bdi->avg_write_bandwidth; + unsigned long t; + + /* + * Limit pause time for small memory systems. 
If sleeping for too long + * time, a small pool of dirty/writeback pages may go empty and disk go + * idle. + * + * 8 serves as the safety ratio. + */ + t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); + t++; + + return min_t(unsigned long, t, MAX_PAUSE); +} + +static long bdi_min_pause(struct backing_dev_info *bdi, + long max_pause, + unsigned long task_ratelimit, + unsigned long dirty_ratelimit, + int *nr_dirtied_pause) +{ + long hi = ilog2(bdi->avg_write_bandwidth); + long lo = ilog2(bdi->dirty_ratelimit); + long t; /* target pause */ + long pause; /* estimated next pause */ + int pages; /* target nr_dirtied_pause */ + + /* target for 10ms pause on 1-dd case */ + t = max(1, HZ / 100); + + /* + * Scale up pause time for concurrent dirtiers in order to reduce CPU + * overheads. + * + * (N * 10ms) on 2^N concurrent tasks. + */ + if (hi > lo) + t += (hi - lo) * (10 * HZ) / 1024; + + /* + * This is a bit convoluted. We try to base the next nr_dirtied_pause + * on the much more stable dirty_ratelimit. However the next pause time + * will be computed based on task_ratelimit and the two rate limits may + * depart considerably at some time. Especially if task_ratelimit goes + * below dirty_ratelimit/2 and the target pause is max_pause, the next + * pause time will be max_pause*2 _trimmed down_ to max_pause. As a + * result task_ratelimit won't be executed faithfully, which could + * eventually bring down dirty_ratelimit. + * + * We apply two rules to fix it up: + * 1) try to estimate the next pause time and if necessary, use a lower + * nr_dirtied_pause so as not to exceed max_pause. When this happens, + * nr_dirtied_pause will be "dancing" with task_ratelimit. + * 2) limit the target pause time to max_pause/2, so that the normal + * small fluctuations of task_ratelimit won't trigger rule (1) and + * nr_dirtied_pause will remain as stable as dirty_ratelimit. 
+ */ + t = min(t, 1 + max_pause / 2); + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); + + /* + * Tiny nr_dirtied_pause is found to hurt I/O performance in the test + * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. + * When the 16 consecutive reads are often interrupted by some dirty + * throttling pause during the async writes, cfq will go into idles + * (deadline is fine). So push nr_dirtied_pause as high as possible + * until reaches DIRTY_POLL_THRESH=32 pages. + */ + if (pages < DIRTY_POLL_THRESH) { + t = max_pause; + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); + if (pages > DIRTY_POLL_THRESH) { + pages = DIRTY_POLL_THRESH; + t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; + } + } + + pause = HZ * pages / (task_ratelimit + 1); + if (pause > max_pause) { + t = max_pause; + pages = task_ratelimit * t / roundup_pow_of_two(HZ); + } + + *nr_dirtied_pause = pages; + /* + * The minimal pause time will normally be half the target pause time. + */ + return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; +} + +static inline void bdi_dirty_limits(struct backing_dev_info *bdi, + unsigned long dirty_thresh, + unsigned long background_thresh, + unsigned long *bdi_dirty, + unsigned long *bdi_thresh, + unsigned long *bdi_bg_thresh) +{ + unsigned long bdi_reclaimable; + + /* + * bdi_thresh is not treated as some limiting factor as + * dirty_thresh, due to reasons + * - in JBOD setup, bdi_thresh can fluctuate a lot + * - in a system with HDD and USB key, the USB key may somehow + * go into state (bdi_dirty >> bdi_thresh) either because + * bdi_dirty starts high, or because bdi_thresh drops low. + * In this case we don't want to hard throttle the USB key + * dirtiers for 100 seconds until bdi_dirty drops under + * bdi_thresh. Instead the auxiliary bdi control line in + * bdi_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down bdi_dirty. 
+ */ + *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + + if (bdi_bg_thresh) + *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh * + background_thresh, + dirty_thresh) : 0; + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. + */ + if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { + bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat_sum(bdi, BDI_WRITEBACK); + } else { + bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat(bdi, BDI_WRITEBACK); + } +} + +/* + * balance_dirty_pages() must be called by processes which are generating dirty + * data. It looks at the number of dirty pages in the machine and will force + * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. 
 */
static void balance_dirty_pages(struct address_space *mapping,
				unsigned long pages_dirtied)
{
	unsigned long nr_reclaimable;	/* = file_dirty + unstable_nfs */
	unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	long period;
	long pause;
	long max_pause;
	long min_pause;
	int nr_dirtied_pause;
	bool dirty_exceeded = false;
	unsigned long task_ratelimit;
	unsigned long dirty_ratelimit;
	unsigned long pos_ratio;
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
	unsigned long start_time = jiffies;

	/*
	 * Each iteration re-samples the dirty state, then either leaves the
	 * loop (below freerun, pause shorter than min_pause, ratelimit is
	 * non-zero after sleeping, bdi nearly clean, fatal signal) or sleeps
	 * for the computed pause and tries again.
	 */
	for (;;) {
		unsigned long now = jiffies;
		unsigned long uninitialized_var(bdi_thresh);
		unsigned long thresh;
		unsigned long uninitialized_var(bdi_dirty);
		unsigned long dirty;
		unsigned long bg_thresh;

		/*
		 * Unstable writes are a feature of certain networked
		 * filesystems (i.e. NFS) in which data may have been
		 * written to the server's write cache, but has not yet
		 * been flushed to permanent storage.
		 */
		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
					global_page_state(NR_UNSTABLE_NFS);
		nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);

		global_dirty_limits(&background_thresh, &dirty_thresh);

		/*
		 * Pick the counters the freerun check below is made against:
		 * per-bdi ones for strictlimit, global ones otherwise.
		 */
		if (unlikely(strictlimit)) {
			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
					 &bdi_dirty, &bdi_thresh, &bg_thresh);

			dirty = bdi_dirty;
			thresh = bdi_thresh;
		} else {
			dirty = nr_dirty;
			thresh = dirty_thresh;
			bg_thresh = background_thresh;
		}

		/*
		 * Throttle it only when the background writeback cannot
		 * catch-up. This avoids (excessively) small writeouts
		 * when the bdi limits are ramping up in case of !strictlimit.
		 *
		 * In strictlimit case make decision based on the bdi counters
		 * and limits. Small writeouts when the bdi limits are ramping
		 * up are the price we consciously pay for strictlimit-ing.
		 */
		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
			current->dirty_paused_when = now;
			current->nr_dirtied = 0;
			current->nr_dirtied_pause =
				dirty_poll_interval(dirty, thresh);
			break;
		}

		if (unlikely(!writeback_in_progress(bdi)))
			bdi_start_background_writeback(bdi);

		/* strictlimit already computed the bdi values above */
		if (!strictlimit)
			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
					 &bdi_dirty, &bdi_thresh, NULL);

		dirty_exceeded = (bdi_dirty > bdi_thresh) &&
				 ((nr_dirty > dirty_thresh) || strictlimit);
		if (dirty_exceeded && !bdi->dirty_exceeded)
			bdi->dirty_exceeded = 1;

		bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
				     nr_dirty, bdi_thresh, bdi_dirty,
				     start_time);

		dirty_ratelimit = bdi->dirty_ratelimit;
		pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
					       background_thresh, nr_dirty,
					       bdi_thresh, bdi_dirty);
		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
							RATELIMIT_CALC_SHIFT;
		max_pause = bdi_max_pause(bdi, bdi_dirty);
		min_pause = bdi_min_pause(bdi, max_pause,
					  task_ratelimit, dirty_ratelimit,
					  &nr_dirtied_pause);

		/*
		 * A zero ratelimit cannot be used as a divisor below; sleep
		 * for the maximum pause instead.
		 */
		if (unlikely(task_ratelimit == 0)) {
			period = max_pause;
			pause = max_pause;
			goto pause;
		}
		period = HZ * pages_dirtied / task_ratelimit;
		pause = period;
		/* credit the time that passed since the last recorded pause */
		if (current->dirty_paused_when)
			pause -= now - current->dirty_paused_when;
		/*
		 * For less than 1s think time (ext3/4 may block the dirtier
		 * for up to 800ms from time to time on 1-HDD; so does xfs,
		 * however at much less frequency), try to compensate it in
		 * future periods by updating the virtual time; otherwise just
		 * do a reset, as it may be a light dirtier.
		 */
		if (pause < min_pause) {
			trace_balance_dirty_pages(bdi,
						  dirty_thresh,
						  background_thresh,
						  nr_dirty,
						  bdi_thresh,
						  bdi_dirty,
						  dirty_ratelimit,
						  task_ratelimit,
						  pages_dirtied,
						  period,
						  min(pause, 0L),
						  start_time);
			if (pause < -HZ) {
				current->dirty_paused_when = now;
				current->nr_dirtied = 0;
			} else if (period) {
				current->dirty_paused_when += period;
				current->nr_dirtied = 0;
			} else if (current->nr_dirtied_pause <= pages_dirtied)
				current->nr_dirtied_pause += pages_dirtied;
			break;
		}
		if (unlikely(pause > max_pause)) {
			/* for occasional dropped task_ratelimit */
			now += min(pause - max_pause, max_pause);
			pause = max_pause;
		}

pause:
		trace_balance_dirty_pages(bdi,
					  dirty_thresh,
					  background_thresh,
					  nr_dirty,
					  bdi_thresh,
					  bdi_dirty,
					  dirty_ratelimit,
					  task_ratelimit,
					  pages_dirtied,
					  period,
					  pause,
					  start_time);
		/* sleep for the pause; TASK_KILLABLE lets fatal signals wake us */
		__set_current_state(TASK_KILLABLE);
		io_schedule_timeout(pause);

		current->dirty_paused_when = now + pause;
		current->nr_dirtied = 0;
		current->nr_dirtied_pause = nr_dirtied_pause;

		/*
		 * This is typically equal to (nr_dirty < dirty_thresh) and can
		 * also keep "1000+ dd on a slow USB stick" under control.
		 */
		if (task_ratelimit)
			break;

		/*
		 * In the case of an unresponsive NFS server and the NFS dirty
		 * pages exceeds dirty_thresh, give the other good bdi's a pipe
		 * to go through, so that tasks on them still remain responsive.
		 *
		 * In theory 1 page is enough to keep the consumer-producer
		 * pipe going: the flusher cleans 1 page => the task dirties 1
		 * more page. However bdi_dirty has accounting errors.  So use
		 * the larger and more IO friendly bdi_stat_error.
		 */
		if (bdi_dirty <= bdi_stat_error(bdi))
			break;

		if (fatal_signal_pending(current))
			break;
	}

	/* clear the bdi flag once this task no longer sees it exceeded */
	if (!dirty_exceeded && bdi->dirty_exceeded)
		bdi->dirty_exceeded = 0;

	if (writeback_in_progress(bdi))
		return;

	/*
	 * In laptop mode, we wait until hitting the higher threshold before
	 * starting background writeout, and then write out all the way down
	 * to the lower threshold.  So slow writers cause minimal disk activity.
	 *
	 * In normal mode, we start background writeout at the lower
	 * background_thresh, to keep the amount of dirty memory low.
	 */
	if (laptop_mode)
		return;

	if (nr_reclaimable > background_thresh)
		bdi_start_background_writeback(bdi);
}

static DEFINE_PER_CPU(int, bdp_ratelimits);

/*
 * Normal tasks are throttled by
 *	loop {
 *		dirty tsk->nr_dirtied_pause pages;
 *		take a snap in balance_dirty_pages();
 *	}
 * However there is a worst case. If every task exit immediately when dirtied
 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
 * called to throttle the page dirties. The solution is to save the not yet
 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
 * randomly into the running tasks. This works well for the above worst case,
 * as the new task will pick up and accumulate the old task's leaked dirty
 * count and eventually get throttled.
 */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;

/**
 * balance_dirty_pages_ratelimited - balance dirty memory state
 * @mapping: address_space which was dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * On really big machines, get_writeback_state is expensive, so try to avoid
 * calling it too often (ratelimiting).
But once we're over the dirty memory + * limit we decrease the ratelimiting by a lot, to prevent individual processes + * from overshooting the limit by (ratelimit_pages) each. + */ +void balance_dirty_pages_ratelimited(struct address_space *mapping) +{ + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + int ratelimit; + int *p; + + if (!bdi_cap_account_dirty(bdi)) + return; + + ratelimit = current->nr_dirtied_pause; + if (bdi->dirty_exceeded) + ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); + + preempt_disable(); + /* + * This prevents one CPU to accumulate too many dirtied pages without + * calling into balance_dirty_pages(), which can happen when there are + * 1000+ tasks, all of them start dirtying pages at exactly the same + * time, hence all honoured too large initial task->nr_dirtied_pause. + */ + p = this_cpu_ptr(&bdp_ratelimits); + if (unlikely(current->nr_dirtied >= ratelimit)) + *p = 0; + else if (unlikely(*p >= ratelimit_pages)) { + *p = 0; + ratelimit = 0; + } + /* + * Pick up the dirtied pages by the exited tasks. This avoids lots of + * short-lived tasks (eg. gcc invocations in a kernel build) escaping + * the dirty throttling and livelock other long-run dirtiers. 
+ */ + p = this_cpu_ptr(&dirty_throttle_leaks); + if (*p > 0 && current->nr_dirtied < ratelimit) { + unsigned long nr_pages_dirtied; + nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); + *p -= nr_pages_dirtied; + current->nr_dirtied += nr_pages_dirtied; + } + preempt_enable(); + + if (unlikely(current->nr_dirtied >= ratelimit)) + balance_dirty_pages(mapping, current->nr_dirtied); +} +EXPORT_SYMBOL(balance_dirty_pages_ratelimited); + +void throttle_vm_writeout(gfp_t gfp_mask) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + + for ( ; ; ) { + global_dirty_limits(&background_thresh, &dirty_thresh); + dirty_thresh = hard_dirty_limit(dirty_thresh); + + /* + * Boost the allowable dirty threshold a bit for page + * allocators so they don't get DoS'ed by heavy writers + */ + dirty_thresh += dirty_thresh / 10; /* wheeee... */ + + if (global_page_state(NR_UNSTABLE_NFS) + + global_page_state(NR_WRITEBACK) <= dirty_thresh) + break; + congestion_wait(BLK_RW_ASYNC, HZ/10); + + /* + * The caller might hold locks which can prevent IO completion + * or progress in the filesystem. So we cannot just sit here + * waiting for IO to complete. 
+ */ + if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) + break; + } +} + +/* + * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs + */ +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, buffer, length, ppos); + return 0; +} + +#ifdef CONFIG_BLOCK +void laptop_mode_timer_fn(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + int nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + + /* + * We want to write everything out, not just down to the dirty + * threshold + */ + if (bdi_has_dirty_io(&q->backing_dev_info)) + bdi_start_writeback(&q->backing_dev_info, nr_pages, + WB_REASON_LAPTOP_TIMER); +} + +/* + * We've spun up the disk and we're in laptop mode: schedule writeback + * of all dirty data a few seconds from now. If the flush is already scheduled + * then push it back - the user is still using the disk. + */ +void laptop_io_completion(struct backing_dev_info *info) +{ + mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); +} + +/* + * We're in laptop mode and we've just synced. The sync's writes will have + * caused another writeback to be scheduled by laptop_io_completion. + * Nothing needs to be written back anymore, so we unschedule the writeback. + */ +void laptop_sync_completion(void) +{ + struct backing_dev_info *bdi; + + rcu_read_lock(); + + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) + del_timer(&bdi->laptop_mode_wb_timer); + + rcu_read_unlock(); +} +#endif + +/* + * If ratelimit_pages is too high then we can get into dirty-data overload + * if a large number of processes all perform writes at the same time. + * If it is too low then SMP machines will call the (expensive) + * get_writeback_state too often. 
+ *
+ * Here we set ratelimit_pages to a level which ensures that when all CPUs are
+ * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
+ * thresholds.
+ *
+ * The result is never allowed to drop below 16 pages, and global_dirty_limit
+ * is refreshed from the current dirty threshold as a side effect.
+ */
+
+void writeback_set_ratelimit(void)
+{
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	global_dirty_limits(&background_thresh, &dirty_thresh);
+	global_dirty_limit = dirty_thresh;
+	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
+	if (ratelimit_pages < 16)
+		ratelimit_pages = 16;
+}
+
+static int
+ratelimit_handler(struct notifier_block *self, unsigned long action,
+		  void *hcpu)
+{
+
+	switch (action & ~CPU_TASKS_FROZEN) {	/* match with the CPU_TASKS_FROZEN bit masked off */
+	case CPU_ONLINE:
+	case CPU_DEAD:
+		writeback_set_ratelimit();	/* online CPU count changed: recompute */
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block ratelimit_nb = {
+	.notifier_call	= ratelimit_handler,
+	.next		= NULL,
+};
+
+/*
+ * Called early on to tune the page writeback dirty limits.
+ *
+ * We used to scale dirty pages according to how total memory
+ * related to pages that could be allocated for buffers (by
+ * comparing nr_free_buffer_pages() to vm_total_pages).
+ *
+ * However, that was when we used "dirty_ratio" to scale with
+ * all memory, and we don't do that any more. "dirty_ratio"
+ * is now applied to total non-HIGHPAGE memory (by subtracting
+ * totalhigh_pages from vm_total_pages), and as such we can't
+ * get into the old insane situation any more where we had
+ * large amounts of dirty pages compared to a small amount of
+ * non-HIGHMEM memory.
+ *
+ * But we might still want to scale the dirty_ratio by how
+ * much memory the box has..
+ */
+void __init page_writeback_init(void)
+{
+	writeback_set_ratelimit();
+	register_cpu_notifier(&ratelimit_nb);
+
+	fprop_global_init(&writeout_completions, GFP_KERNEL);	/* NOTE(review): result is discarded - confirm a failure here can be ignored */
+}
+
+/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end (inclusive) and tags
+ * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
+ * that write_cache_pages (or whoever calls this function) will then use
+ * TOWRITE tag to identify pages eligible for writeback.  This mechanism is
+ * used to avoid livelocking of writeback by a process steadily creating new
+ * dirty pages in the file (thus it is important for this function to be quick
+ * so that it can tag pages faster than a dirtying process can create them).
+ */
+/*
+ * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
+ * The lock is dropped and cond_resched() is called between batches; the loop
+ * ends once a batch comes back short or 'start' has wrapped to 0.
+ */
+void tag_pages_for_writeback(struct address_space *mapping,
+			     pgoff_t start, pgoff_t end)
+{
+#define WRITEBACK_TAG_BATCH 4096
+	unsigned long tagged;
+
+	do {
+		spin_lock_irq(&mapping->tree_lock);
+		tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
+				&start, end, WRITEBACK_TAG_BATCH,
+				PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+		spin_unlock_irq(&mapping->tree_lock);
+		WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
+		cond_resched();
+		/* We check 'start' to handle wrapping when end == ~0UL */
+	} while (tagged >= WRITEBACK_TAG_BATCH && start);
+}
+EXPORT_SYMBOL(tag_pages_for_writeback);
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ *
+ * To avoid livelocks (when other process dirties new pages), we first tag
+ * pages which should be written back with TOWRITE tag and only then start
+ * writing them. For data-integrity sync we have to be careful so that we do
+ * not miss some pages (e.g., because some other process has cleared TOWRITE
+ * tag we set). The rule we follow is that TOWRITE tag can be cleared only
+ * by the process clearing the DIRTY tag (and submitting the page for IO).
+ *
+ * Returns 0, or the first non-zero value returned by @writepage other than
+ * AOP_WRITEPAGE_ACTIVATE, at which point the walk stops.
+ */
+int write_cache_pages(struct address_space *mapping,
+		      struct writeback_control *wbc, writepage_t writepage,
+		      void *data)
+{
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t uninitialized_var(writeback_index);
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	pgoff_t done_index;
+	int cycled;
+	int range_whole = 0;
+	int tag;
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		writeback_index = mapping->writeback_index; /* prev offset */
+		index = writeback_index;
+		if (index == 0)
+			cycled = 1;
+		else
+			cycled = 0;
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		cycled = 1; /* ignore range_cyclic tests */
+	}
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+retry:
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag_pages_for_writeback(mapping, index, end);
+	done_index = index;
+	while (!done && (index <= end)) {
+		int i;
+
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point, the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or
+			 * even swizzled back from swapper_space to tmpfs file
+			 * mapping. However, page->index will not change
+			 * because we have a reference on the page.
+			 */
+			if (page->index > end) {
+				/*
+				 * can't be range_cyclic (1st pass) because
+				 * end == -1 in that case.
+				 */
+				done = 1;
+				break;
+			}
+
+			done_index = page->index;
+
+			lock_page(page);
+
+			/*
+			 * Page truncated or invalidated. We can freely skip it
+			 * then, even for data integrity operations: the page
+			 * has disappeared concurrently, so there could be no
+			 * real expectation of this data integrity operation
+			 * even if there is now a new, dirty page at the same
+			 * pagecache address.
+			 */
+			if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+				unlock_page(page);
+				continue;
+			}
+
+			if (!PageDirty(page)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
+			}
+
+			if (PageWriteback(page)) {
+				if (wbc->sync_mode != WB_SYNC_NONE)
+					wait_on_page_writeback(page);
+				else
+					goto continue_unlock;
+			}
+
+			BUG_ON(PageWriteback(page));
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
+
+			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
+			ret = (*writepage)(page, wbc, data);
+			if (unlikely(ret)) {
+				if (ret == AOP_WRITEPAGE_ACTIVATE) {
+					unlock_page(page);
+					ret = 0;
+				} else {
+					/*
+					 * done_index is set past this page,
+					 * so media errors will not choke
+					 * background writeout for the entire
+					 * file. This has consequences for
+					 * range_cyclic semantics (ie. it may
+					 * not be suitable for data integrity
+					 * writeout).
+					 */
+					done_index = page->index + 1;
+					done = 1;
+					break;
+				}
+			}
+
+			/*
+			 * We stop writing back only if we are not doing
+			 * integrity sync. In case of integrity sync we have to
+			 * keep going until we have written all the pages
+			 * we tagged for writeback prior to entering this loop.
+			 */
+			if (--wbc->nr_to_write <= 0 &&
+			    wbc->sync_mode == WB_SYNC_NONE) {
+				done = 1;
+				break;
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	if (!cycled && !done) {
+		/*
+		 * range_cyclic:
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		cycled = 1;
+		index = 0;
+		end = writeback_index - 1;
+		goto retry;
+	}
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = done_index;
+
+	return ret;
+}
+EXPORT_SYMBOL(write_cache_pages);
+
+/*
+ * Function used by generic_writepages to call the real writepage
+ * function and set the mapping flags on error.  @data carries the
+ * address_space; the ->writepage() result is returned unchanged.
+ */
+static int __writepage(struct page *page, struct writeback_control *wbc,
+		       void *data)
+{
+	struct address_space *mapping = data;
+	int ret = mapping->a_ops->writepage(page, wbc);
+	mapping_set_error(mapping, ret);
+	return ret;
+}
+
+/**
+ * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ */ +int generic_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct blk_plug plug; + int ret; + + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + return ret; +} + +EXPORT_SYMBOL(generic_writepages); + +int do_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + int ret; + + if (wbc->nr_to_write <= 0) + return 0; + if (mapping->a_ops->writepages) + ret = mapping->a_ops->writepages(mapping, wbc); + else + ret = generic_writepages(mapping, wbc); + return ret; +} + +/** + * write_one_page - write out a single page and optionally wait on I/O + * @page: the page to write + * @wait: if true, wait on writeout + * + * The page must be locked by the caller and will be unlocked upon return. + * + * write_one_page() returns a negative error code if I/O failed. + */ +int write_one_page(struct page *page, int wait) +{ + struct address_space *mapping = page->mapping; + int ret = 0; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + }; + + BUG_ON(!PageLocked(page)); + + if (wait) + wait_on_page_writeback(page); + + if (clear_page_dirty_for_io(page)) { + page_cache_get(page); + ret = mapping->a_ops->writepage(page, &wbc); + if (ret == 0 && wait) { + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + page_cache_release(page); + } else { + unlock_page(page); + } + return ret; +} +EXPORT_SYMBOL(write_one_page); + +/* + * For address_spaces which do not use buffers nor write back. + */ +int __set_page_dirty_no_writeback(struct page *page) +{ + if (!PageDirty(page)) + return !TestSetPageDirty(page); + return 0; +} + +/* + * Helper function for set_page_dirty family. + * NOTE: This relies on being atomic wrt interrupts. 
+ */ +void account_page_dirtied(struct page *page, struct address_space *mapping) +{ + trace_writeback_dirty_page(page, mapping); + + if (mapping_cap_account_dirty(mapping)) { + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_zone_page_state(page, NR_DIRTIED); + __inc_bdi_stat(bdi, BDI_RECLAIMABLE); + __inc_bdi_stat(bdi, BDI_DIRTIED); + task_io_account_write(PAGE_CACHE_SIZE); + current->nr_dirtied++; + this_cpu_inc(bdp_ratelimits); + } +} +EXPORT_SYMBOL(account_page_dirtied); + +/* + * Helper function for deaccounting dirty page without writeback. + * + * Doing this should *normally* only ever be done when a page + * is truncated, and is not actually mapped anywhere at all. However, + * fs/buffer.c does this when it notices that somebody has cleaned + * out all the buffers on a page without actually doing it through + * the VM. Can you say "ext3 is horribly ugly"? Thought you could. + */ +void account_page_cleaned(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); + task_io_account_cancelled_write(PAGE_CACHE_SIZE); + } +} +EXPORT_SYMBOL(account_page_cleaned); + +/* + * For address_spaces which do not use buffers. Just tag the page as dirty in + * its radix tree. + * + * This is also used when a single buffer is being dirtied: we want to set the + * page dirty in that case, but not all the buffers. This is a "bottom-up" + * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * + * The caller must ensure this doesn't race with truncation. Most will simply + * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and + * the pte lock held, which also locks out truncation. 
+ */ +int __set_page_dirty_nobuffers(struct page *page) +{ + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + unsigned long flags; + + if (!mapping) + return 1; + + spin_lock_irqsave(&mapping->tree_lock, flags); + BUG_ON(page_mapping(page) != mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); + account_page_dirtied(page, mapping); + radix_tree_tag_set(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + return 1; + } + return 0; +} +EXPORT_SYMBOL(__set_page_dirty_nobuffers); + +/* + * Call this whenever redirtying a page, to de-account the dirty counters + * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written + * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to + * systematic errors in balanced_dirty_ratelimit and the dirty pages position + * control. + */ +void account_page_redirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + if (mapping && mapping_cap_account_dirty(mapping)) { + current->nr_dirtied--; + dec_zone_page_state(page, NR_DIRTIED); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); + } +} +EXPORT_SYMBOL(account_page_redirty); + +/* + * When a writepage implementation decides that it doesn't want to write this + * page for some reason, it should redirty the locked page via + * redirty_page_for_writepage() and it should then unlock the page and return 0 + */ +int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) +{ + int ret; + + wbc->pages_skipped++; + ret = __set_page_dirty_nobuffers(page); + account_page_redirty(page); + return ret; +} +EXPORT_SYMBOL(redirty_page_for_writepage); + +/* + * Dirty a page. 
+ * + * For pages with a mapping this should be done under the page lock + * for the benefit of asynchronous memory errors who prefer a consistent + * dirty state. This rule can be broken in some special cases, + * but should be better not to. + * + * If the mapping doesn't provide a set_page_dirty a_op, then + * just fall through and assume that it wants buffer_heads. + */ +int set_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (likely(mapping)) { + int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; + /* + * readahead/lru_deactivate_page could remain + * PG_readahead/PG_reclaim due to race with end_page_writeback + * About readahead, if the page is written, the flags would be + * reset. So no problem. + * About lru_deactivate_page, if the page is redirty, the flag + * will be reset. So no problem. but if the page is used by readahead + * it will confuse readahead and make it restart the size rampup + * process. But it's a trivial problem. + */ + if (PageReclaim(page)) + ClearPageReclaim(page); +#ifdef CONFIG_BLOCK + if (!spd) + spd = __set_page_dirty_buffers; +#endif + return (*spd)(page); + } + if (!PageDirty(page)) { + if (!TestSetPageDirty(page)) + return 1; + } + return 0; +} +EXPORT_SYMBOL(set_page_dirty); + +/* + * set_page_dirty() is racy if the caller has no reference against + * page->mapping->host, and if the page is unlocked. This is because another + * CPU could truncate the page off the mapping and then free the mapping. + * + * Usually, the page _is_ locked, or the caller is a user-space process which + * holds a reference on the inode by having an open file. + * + * In other cases, the page should be locked before running set_page_dirty(). 
+ */
+int set_page_dirty_lock(struct page *page)
+{
+	int ret;
+
+	lock_page(page);
+	ret = set_page_dirty(page);
+	unlock_page(page);
+	return ret;
+}
+EXPORT_SYMBOL(set_page_dirty_lock);
+
+/*
+ * Clear a page's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the page was previously dirty.
+ *
+ * This is for preparing to put the page under writeout.  We leave the page
+ * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
+ * implementation will run either set_page_writeback() or set_page_dirty(),
+ * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * back into sync.
+ *
+ * This incoherency between the page's dirty flag and radix-tree tag is
+ * unfortunate, but it only exists while the page is locked.
+ *
+ * The page must be locked by the caller (enforced with BUG_ON).
+ */
+int clear_page_dirty_for_io(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	BUG_ON(!PageLocked(page));
+
+	if (mapping && mapping_cap_account_dirty(mapping)) {
+		/*
+		 * Yes, Virginia, this is indeed insane.
+		 *
+		 * We use this sequence to make sure that
+		 *  (a) we account for dirty stats properly
+		 *  (b) we tell the low-level filesystem to
+		 *      mark the whole page dirty if it was
+		 *      dirty in a pagetable. Only to then
+		 *  (c) clean the page again and return 1 to
+		 *      cause the writeback.
+		 *
+		 * This way we avoid all nasty races with the
+		 * dirty bit in multiple places and clearing
+		 * them concurrently from different threads.
+		 *
+		 * Note! Normally the "set_page_dirty(page)"
+		 * has no effect on the actual dirty bit - since
+		 * that will already usually be set. But we
+		 * need the side effects, and it can help us
+		 * avoid races.
+		 *
+		 * We basically use the page "master dirty bit"
+		 * as a serialization point for all the different
+		 * threads doing their things.
+		 */
+		if (page_mkclean(page))
+			set_page_dirty(page);
+		/*
+		 * We carefully synchronise fault handlers against
+		 * installing a dirty pte and marking the page dirty
+		 * at this point.  We do this by having them hold the
+		 * page lock while dirtying the page, and pages are
+		 * always locked coming in here, so we get the desired
+		 * exclusion.
+		 */
+		if (TestClearPageDirty(page)) {
+			dec_zone_page_state(page, NR_FILE_DIRTY);
+			dec_bdi_stat(inode_to_bdi(mapping->host),
+					BDI_RECLAIMABLE);
+			return 1;
+		}
+		return 0;
+	}
+	return TestClearPageDirty(page);	/* no accounting for this mapping */
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
+int test_clear_page_writeback(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct mem_cgroup *memcg;
+	int ret;
+
+	memcg = mem_cgroup_begin_page_stat(page);
+	if (mapping) {
+		struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		ret = TestClearPageWriteback(page);
+		if (ret) {	/* flag was set: drop the tree tag and bdi count too */
+			radix_tree_tag_clear(&mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_account_writeback(bdi)) {
+				__dec_bdi_stat(bdi, BDI_WRITEBACK);
+				__bdi_writeout_inc(bdi);
+			}
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	} else {
+		ret = TestClearPageWriteback(page);
+	}
+	if (ret) {
+		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+		dec_zone_page_state(page, NR_WRITEBACK);
+		inc_zone_page_state(page, NR_WRITTEN);
+	}
+	mem_cgroup_end_page_stat(memcg);
+	return ret;
+}
+
+int __test_set_page_writeback(struct page *page, bool keep_write)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct mem_cgroup *memcg;
+	int ret;
+
+	memcg = mem_cgroup_begin_page_stat(page);
+	if (mapping) {
+		struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		ret = TestSetPageWriteback(page);
+		if (!ret) {	/* flag was newly set */
+			radix_tree_tag_set(&mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_account_writeback(bdi))
+				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+		}
+		if (!PageDirty(page))	/* resync the DIRTY tag with the page flag */
+			radix_tree_tag_clear(&mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_DIRTY);
+		if (!keep_write)
+			radix_tree_tag_clear(&mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_TOWRITE);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	} else {
+		ret = TestSetPageWriteback(page);
+	}
+	if (!ret) {
+		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+		inc_zone_page_state(page, NR_WRITEBACK);
+	}
+	mem_cgroup_end_page_stat(memcg);
+	return ret;
+
+}
+EXPORT_SYMBOL(__test_set_page_writeback);
+
+/*
+ * Return true if any of the pages in the mapping are marked with the
+ * passed tag.
+ */
+int mapping_tagged(struct address_space *mapping, int tag)
+{
+	return radix_tree_tagged(&mapping->page_tree, tag);
+}
+EXPORT_SYMBOL(mapping_tagged);
+
+/**
+ * wait_for_stable_page() - wait for writeback to finish, if necessary.
+ * @page:	The page to wait on.
+ *
+ * This function determines if the given page is related to a backing device
+ * that requires page contents to be held stable during writeback.  If so, then
+ * it will wait for any pending writeback to complete.
+ *
+ * NOTE(review): page->mapping is dereferenced without a NULL check, so this
+ * presumably relies on callers passing a page that still has a mapping -
+ * confirm against the callers.
+ */
+void wait_for_stable_page(struct page *page)
+{
+	if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
+		wait_on_page_writeback(page);
+}
+EXPORT_SYMBOL_GPL(wait_for_stable_page);
diff --git a/kernel/mm/page_alloc.c b/kernel/mm/page_alloc.c
new file mode 100644
index 000000000..5b70c9977
--- /dev/null
+++ b/kernel/mm/page_alloc.c
@@ -0,0 +1,6695 @@
+/*
+ *  linux/mm/page_alloc.c
+ *
+ *  Manages the free list, the system allocates free pages here.
+ * Note that kmalloc() lives in slab.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 + * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "internal.h" + +/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ +static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_FRACTION (8) + +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID +DEFINE_PER_CPU(int, numa_node); +EXPORT_PER_CPU_SYMBOL(numa_node); +#endif + +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +/* + * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. + * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. + * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() + * defined in . + */ +DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ +EXPORT_PER_CPU_SYMBOL(_numa_mem_); +int _node_numa_mem_[MAX_NUMNODES]; +#endif + +/* + * Array of node states. 
+ */
+nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
+	[N_POSSIBLE] = NODE_MASK_ALL,
+	[N_ONLINE] = { { [0] = 1UL } },	/* only bit 0 set */
+#ifndef CONFIG_NUMA
+	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
+#ifdef CONFIG_HIGHMEM
+	[N_HIGH_MEMORY] = { { [0] = 1UL } },
+#endif
+#ifdef CONFIG_MOVABLE_NODE
+	[N_MEMORY] = { { [0] = 1UL } },
+#endif
+	[N_CPU] = { { [0] = 1UL } },
+#endif	/* !CONFIG_NUMA */
+};
+EXPORT_SYMBOL(node_states);
+
+/* Protect totalram_pages and zone->managed_pages */
+static DEFINE_SPINLOCK(managed_page_count_lock);
+
+unsigned long totalram_pages __read_mostly;
+unsigned long totalreserve_pages __read_mostly;
+unsigned long totalcma_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory.  This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
+int percpu_pagelist_fraction;
+gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended.  To avoid races with the suspend/hibernate code,
+ * they should always be called with pm_mutex held (gfp_allowed_mask also should
+ * only be modified with pm_mutex held, unless the suspend/hibernate code is
+ * guaranteed not to run in parallel with that modification).
+ */ + +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&pm_mutex)); + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } +} + +void pm_restrict_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&pm_mutex)); + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~GFP_IOFS; +} + +bool pm_suspended_storage(void) +{ + if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + return false; + return true; +} +#endif /* CONFIG_PM_SLEEP */ + +#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE +int pageblock_order __read_mostly; +#endif + +static void __free_pages_ok(struct page *page, unsigned int order); + +/* + * results with 256, 32 in the lowmem_reserve sysctl: + * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) + * 1G machine -> (16M dma, 784M normal, 224M high) + * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA + * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL + * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA + * + * TBD: should special case ZONE_DMA32 machines here - in those we normally + * don't need any ZONE_NORMAL reservation + */ +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { +#ifdef CONFIG_ZONE_DMA + 256, +#endif +#ifdef CONFIG_ZONE_DMA32 + 256, +#endif +#ifdef CONFIG_HIGHMEM + 32, +#endif + 32, +}; + +EXPORT_SYMBOL(totalram_pages); + +static char * const zone_names[MAX_NR_ZONES] = { +#ifdef CONFIG_ZONE_DMA + "DMA", +#endif +#ifdef CONFIG_ZONE_DMA32 + "DMA32", +#endif + "Normal", +#ifdef CONFIG_HIGHMEM + "HighMem", +#endif + "Movable", +}; + +int min_free_kbytes = 1024; +int user_min_free_kbytes = -1; + +static unsigned long __meminitdata nr_kernel_pages; +static unsigned long __meminitdata nr_all_pages; +static unsigned long __meminitdata dma_reserve; + +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; +static unsigned 
long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; +static unsigned long __initdata required_kernelcore; +static unsigned long __initdata required_movablecore; +static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; + +/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ +int movable_zone; +EXPORT_SYMBOL(movable_zone); +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + +#if MAX_NUMNODES > 1 +int nr_node_ids __read_mostly = MAX_NUMNODES; +int nr_online_nodes __read_mostly = 1; +EXPORT_SYMBOL(nr_node_ids); +EXPORT_SYMBOL(nr_online_nodes); +#endif + +static DEFINE_LOCAL_IRQ_LOCK(pa_lock); + +#ifdef CONFIG_PREEMPT_RT_BASE +# define cpu_lock_irqsave(cpu, flags) \ + local_lock_irqsave_on(pa_lock, flags, cpu) +# define cpu_unlock_irqrestore(cpu, flags) \ + local_unlock_irqrestore_on(pa_lock, flags, cpu) +#else +# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags) +# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags) +#endif + +int page_group_by_mobility_disabled __read_mostly; + +void set_pageblock_migratetype(struct page *page, int migratetype) +{ + if (unlikely(page_group_by_mobility_disabled && + migratetype < MIGRATE_PCPTYPES)) + migratetype = MIGRATE_UNMOVABLE; + + set_pageblock_flags_group(page, (unsigned long)migratetype, + PB_migrate, PB_migrate_end); +} + +#ifdef CONFIG_DEBUG_VM +static int page_outside_zone_boundaries(struct zone *zone, struct page *page) +{ + int ret = 0; + unsigned seq; + unsigned long pfn = page_to_pfn(page); + unsigned long sp, start_pfn; + + do { + seq = zone_span_seqbegin(zone); + start_pfn = zone->zone_start_pfn; + sp = zone->spanned_pages; + if (!zone_spans_pfn(zone, pfn)) + ret = 1; + } while (zone_span_seqretry(zone, seq)); + + if (ret) + pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", + pfn, zone_to_nid(zone), zone->name, + start_pfn, start_pfn + sp); + + return ret; +} + +static int page_is_consistent(struct zone *zone, struct page *page) +{ + if 
(!pfn_valid_within(page_to_pfn(page))) + return 0; + if (zone != page_zone(page)) + return 0; + + return 1; +} +/* + * Temporary debugging check for pages not lying within a given zone. + */ +static int bad_range(struct zone *zone, struct page *page) +{ + if (page_outside_zone_boundaries(zone, page)) + return 1; + if (!page_is_consistent(zone, page)) + return 1; + + return 0; +} +#else +static inline int bad_range(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + +static void bad_page(struct page *page, const char *reason, + unsigned long bad_flags) +{ + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* Don't complain about poisoned pages */ + if (PageHWPoison(page)) { + page_mapcount_reset(page); /* remove PageBuddy */ + return; + } + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. + */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + goto out; + } + if (nr_unshown) { + printk(KERN_ALERT + "BUG: Bad page state: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; + + printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", + current->comm, page_to_pfn(page)); + dump_page_badflags(page, reason, bad_flags); + + print_modules(); + dump_stack(); +out: + /* Leave bad fields for debug, except PageBuddy could make trouble */ + page_mapcount_reset(page); /* remove PageBuddy */ + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); +} + +/* + * Higher-order pages are called "compound pages". They are structured thusly: + * + * The first PAGE_SIZE page is called the "head page". + * + * The remaining PAGE_SIZE pages are called "tail pages". + * + * All pages have PG_compound set. All tail pages have their ->first_page + * pointing at the head page. 
+ * + * The first tail page's ->lru.next holds the address of the compound page's + * put_page() function. Its ->lru.prev holds the order of allocation. + * This usage means that zero-order pages may not be compound. + */ + +static void free_compound_page(struct page *page) +{ + __free_pages_ok(page, compound_order(page)); +} + +void prep_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + set_compound_page_dtor(page, free_compound_page); + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++) { + struct page *p = page + i; + set_page_count(p, 0); + p->first_page = page; + /* Make sure p->first_page is always valid for PageTail() */ + smp_wmb(); + __SetPageTail(p); + } +} + +static inline void prep_zero_page(struct page *page, unsigned int order, + gfp_t gfp_flags) +{ + int i; + + /* + * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO + * and __GFP_HIGHMEM from hard or soft interrupt context. 
+ */ + VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +unsigned int _debug_guardpage_minorder; +bool _debug_pagealloc_enabled __read_mostly; +bool _debug_guardpage_enabled __read_mostly; + +static int __init early_debug_pagealloc(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + _debug_pagealloc_enabled = true; + + return 0; +} +early_param("debug_pagealloc", early_debug_pagealloc); + +static bool need_debug_guardpage(void) +{ + /* If we don't use debug_pagealloc, we don't need guard page */ + if (!debug_pagealloc_enabled()) + return false; + + return true; +} + +static void init_debug_guardpage(void) +{ + if (!debug_pagealloc_enabled()) + return; + + _debug_guardpage_enabled = true; +} + +struct page_ext_operations debug_guardpage_ops = { + .need = need_debug_guardpage, + .init = init_debug_guardpage, +}; + +static int __init debug_guardpage_minorder_setup(char *buf) +{ + unsigned long res; + + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + return 0; + } + _debug_guardpage_minorder = res; + printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + return 0; +} +__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); + +static inline void set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) +{ + struct page_ext *page_ext; + + if (!debug_guardpage_enabled()) + return; + + page_ext = lookup_page_ext(page); + __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + + INIT_LIST_HEAD(&page->lru); + set_page_private(page, order); + /* Guard pages are not available for any usage */ + __mod_zone_freepage_state(zone, -(1 << order), migratetype); +} + +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) +{ + struct page_ext *page_ext; + + if 
(!debug_guardpage_enabled()) + return; + + page_ext = lookup_page_ext(page); + __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); + + set_page_private(page, 0); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, (1 << order), migratetype); +} +#else +struct page_ext_operations debug_guardpage_ops = { NULL, }; +static inline void set_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) {} +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order, int migratetype) {} +#endif + +static inline void set_page_order(struct page *page, unsigned int order) +{ + set_page_private(page, order); + __SetPageBuddy(page); +} + +static inline void rmv_page_order(struct page *page) +{ + __ClearPageBuddy(page); + set_page_private(page, 0); +} + +/* + * This function checks whether a page is free && is the buddy + * we can do coalesce a page and its buddy if + * (a) the buddy is not in a hole && + * (b) the buddy is in the buddy system && + * (c) a page and its buddy have the same order && + * (d) a page and its buddy are in the same zone. + * + * For recording whether a page is in the buddy system, we set ->_mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. + * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is + * serialized by zone->lock. + * + * For recording page's order, we use page_private(page). + */ +static inline int page_is_buddy(struct page *page, struct page *buddy, + unsigned int order) +{ + if (!pfn_valid_within(page_to_pfn(buddy))) + return 0; + + if (page_is_guard(buddy) && page_order(buddy) == order) { + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + return 1; + } + + if (PageBuddy(buddy) && page_order(buddy) == order) { + /* + * zone check is done late to avoid uselessly + * calculating zone/node ids for pages that could + * never merge. 
+ */ + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + return 1; + } + return 0; +} + +/* + * Freeing function for a buddy system allocator. + * + * The concept of a buddy system is to maintain direct-mapped table + * (containing bit values) for memory blocks of various "orders". + * The bottom level table contains the map for the smallest allocatable + * units of memory (here, pages), and each level above it describes + * pairs of units from the levels below, hence, "buddies". + * At a high level, all that happens here is marking the table entry + * at the bottom level available, and propagating the changes upward + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep a list of pages, which are heads of continuous + * free pages of length of (1 << order) and marked with _mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) + * field. + * So when we are allocating or freeing one, we can derive the state of the + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. + * If a block is freed, and its buddy is also free, then this + * triggers coalescing into a block of larger size. + * + * -- nyc + */ + +static inline void __free_one_page(struct page *page, + unsigned long pfn, + struct zone *zone, unsigned int order, + int migratetype) +{ + unsigned long page_idx; + unsigned long combined_idx; + unsigned long uninitialized_var(buddy_idx); + struct page *buddy; + int max_order = MAX_ORDER; + + VM_BUG_ON(!zone_is_initialized(zone)); + VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + + VM_BUG_ON(migratetype == -1); + if (is_migrate_isolate(migratetype)) { + /* + * We restrict max order of merging to prevent merge + * between freepages on isolate pageblock and normal + * pageblock. 
Without this, pageblock isolation + * could cause incorrect freepage accounting. + */ + max_order = min(MAX_ORDER, pageblock_order + 1); + } else { + __mod_zone_freepage_state(zone, 1 << order, migratetype); + } + + page_idx = pfn & ((1 << max_order) - 1); + + VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(bad_range(zone, page), page); + + while (order < max_order - 1) { + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + if (!page_is_buddy(page, buddy, order)) + break; + /* + * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, + * merge with it and move up one order. + */ + if (page_is_guard(buddy)) { + clear_page_guard(zone, buddy, order, migratetype); + } else { + list_del(&buddy->lru); + zone->free_area[order].nr_free--; + rmv_page_order(buddy); + } + combined_idx = buddy_idx & page_idx; + page = page + (combined_idx - page_idx); + page_idx = combined_idx; + order++; + } + set_page_order(page, order); + + /* + * If this is not the largest possible page, check if the buddy + * of the next-highest order is free. If it is, it's possible + * that pages are being freed that will coalesce soon. 
In case, + * that is happening, add the free page to the tail of the list + * so it's less likely to be used soon and more likely to be merged + * as a higher order page + */ + if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { + struct page *higher_page, *higher_buddy; + combined_idx = buddy_idx & page_idx; + higher_page = page + (combined_idx - page_idx); + buddy_idx = __find_buddy_index(combined_idx, order + 1); + higher_buddy = higher_page + (buddy_idx - combined_idx); + if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + list_add_tail(&page->lru, + &zone->free_area[order].free_list[migratetype]); + goto out; + } + } + + list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); +out: + zone->free_area[order].nr_free++; +} + +static inline int free_pages_check(struct page *page) +{ + const char *bad_reason = NULL; + unsigned long bad_flags = 0; + + if (unlikely(page_mapcount(page))) + bad_reason = "nonzero mapcount"; + if (unlikely(page->mapping != NULL)) + bad_reason = "non-NULL mapping"; + if (unlikely(atomic_read(&page->_count) != 0)) + bad_reason = "nonzero _count"; + if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; + bad_flags = PAGE_FLAGS_CHECK_AT_FREE; + } +#ifdef CONFIG_MEMCG + if (unlikely(page->mem_cgroup)) + bad_reason = "page still charged to cgroup"; +#endif + if (unlikely(bad_reason)) { + bad_page(page, bad_reason, bad_flags); + return 1; + } + page_cpupid_reset_last(page); + if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + return 0; +} + +/* + * Frees a number of pages which have been collected from the pcp lists. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free. + * + * If the zone was previously in an "all pages pinned" state then look to + * see if this freeing clears that state. 
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 *
 * The whole list is drained under zone->lock; a warning is raised if the
 * number of pages found on it differs from count.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
			       struct list_head *list)
{
	int to_free = count;
	unsigned long nr_scanned;
	unsigned long flags;

	spin_lock_irqsave(&zone->lock, flags);

	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
	if (nr_scanned)
		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);

	while (!list_empty(list)) {
		struct page *page = list_first_entry(list, struct page, lru);
		int mt;	/* migratetype of the to-be-freed page */

		/* must delete as __free_one_page list manipulates */
		list_del(&page->lru);

		mt = get_freepage_migratetype(page);
		/* With isolated pageblocks around, trust the pageblock's type. */
		if (unlikely(has_isolate_pageblock(zone)))
			mt = get_pageblock_migratetype(page);

		/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
		__free_one_page(page, page_to_pfn(page), zone, 0, mt);
		trace_mm_page_pcpu_drain(page, 0, mt);
		to_free--;
	}
	WARN_ON(to_free != 0);
	spin_unlock_irqrestore(&zone->lock, flags);
}

/*
 * Moves a number of pages from the PCP lists to free list which
 * is freed outside of the locked region.
 *
 * Assumes all pages on list are in same zone, and of same order.
 * to_free is the number of pages to move from src's lists onto dst.
 *
 * NOTE(review): the list search below does not terminate if every list in
 * src is empty while to_free is still non-zero; callers presumably never
 * request more pages than the lists hold - confirm.
 */
static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
			      struct list_head *dst)
{
	int migratetype = 0;
	int batch_free = 0;

	while (to_free) {
		struct page *page;
		struct list_head *list;

		/*
		 * Remove pages from lists in a round-robin fashion. A
		 * batch_free count is maintained that is incremented when an
		 * empty list is encountered.  This is so more pages are freed
		 * off fuller lists instead of spinning excessively around empty
		 * lists
		 */
		do {
			batch_free++;
			if (++migratetype == MIGRATE_PCPTYPES)
				migratetype = 0;
			list = &src->lists[migratetype];
		} while (list_empty(list));

		/* This is the only non-empty list. Free them all. */
		if (batch_free == MIGRATE_PCPTYPES)
			batch_free = to_free;

		/* Take pages from the tail of the chosen list. */
		do {
			page = list_last_entry(list, struct page, lru);
			list_del(&page->lru);
			list_add(&page->lru, dst);
		} while (--to_free && --batch_free && !list_empty(list));
	}
}

/*
 * Free one order-@order block straight into the buddy lists under
 * zone->lock, resetting the zone's NR_PAGES_SCANNED counter on the way.
 * The migratetype is re-read from the pageblock when the zone has isolated
 * pageblocks or the caller passed an isolate type.
 */
static void free_one_page(struct zone *zone,
				struct page *page, unsigned long pfn,
				unsigned int order,
				int migratetype)
{
	unsigned long nr_scanned;
	unsigned long flags;

	spin_lock_irqsave(&zone->lock, flags);
	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
	if (nr_scanned)
		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);

	if (unlikely(has_isolate_pageblock(zone) ||
		is_migrate_isolate(migratetype))) {
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype);
	spin_unlock_irqrestore(&zone->lock, flags);
}

/*
 * CONFIG_DEBUG_VM-only check of a compound tail page: it must be marked
 * PageTail and point back at @head_page.  Returns 1 after reporting a bad
 * page, 0 otherwise (always 0 without CONFIG_DEBUG_VM).
 */
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
	if (!IS_ENABLED(CONFIG_DEBUG_VM))
		return 0;
	if (unlikely(!PageTail(page))) {
		bad_page(page, "PageTail not set", 0);
		return 1;
	}
	if (unlikely(page->first_page != head_page)) {
		bad_page(page, "first_page not consistent", 0);
		return 1;
	}
	return 0;
}

/*
 * Run the pre-free checks and debug/arch hooks for an order-@order block.
 * Returns false, without running the later hooks, if any constituent page
 * fails its check; true when the block may be handed to the free lists.
 */
static bool free_pages_prepare(struct page *page, unsigned int order)
{
	bool compound = PageCompound(page);
	int i, bad = 0;

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

	trace_mm_page_free(page, order);
	kmemcheck_free_shadow(page, order);
	kasan_free_pages(page, order);

	if (PageAnon(page))
		page->mapping = NULL;
	/* Check every page of the block; bad counts the failures. */
	bad += free_pages_check(page);
	for (i = 1; i < (1 << order); i++) {
		if (compound)
			bad += free_tail_pages_check(page, page + i);
		bad += free_pages_check(page + i);
	}
	if (bad)
		return false;

	reset_page_owner(page, order);

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					   PAGE_SIZE << order);
	}
	arch_free_page(page, order);
	kernel_map_pages(page, 1 << order, 0);

	return true;
}

/*
 * Free an order-@order block directly to the buddy allocator, bypassing
 * the per-cpu lists.  Does nothing if free_pages_prepare() rejects the
 * block.  The PGFREE accounting and the hand-off to free_one_page() run
 * with pa_lock held.
 */
static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long flags;
	int migratetype;
	unsigned long pfn = page_to_pfn(page);

	if (!free_pages_prepare(page, order))
		return;

	migratetype = get_pfnblock_migratetype(page, pfn);
	local_lock_irqsave(pa_lock, flags);
	__count_vm_events(PGFREE, 1 << order);
	set_freepage_migratetype(page, migratetype);
	free_one_page(page_zone(page), page, pfn, order, migratetype);
	local_unlock_irqrestore(pa_lock, flags);
}

/*
 * Boot-time release of an order-@order block: clear PG_reserved and zero
 * the refcount on every page, add the pages to the zone's managed_pages,
 * then free the block through __free_pages().
 */
void __init __free_pages_bootmem(struct page *page, unsigned int order)
{
	unsigned int nr_pages = 1 << order;
	struct page *p = page;
	unsigned int loop;

	prefetchw(p);
	/* All pages but the last, prefetching one page ahead. */
	for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
		prefetchw(p + 1);
		__ClearPageReserved(p);
		set_page_count(p, 0);
	}
	/* The last page has no successor to prefetch. */
	__ClearPageReserved(p);
	set_page_count(p, 0);

	page_zone(page)->managed_pages += nr_pages;
	set_page_refcounted(page);
	__free_pages(page, order);
}

#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
	unsigned i = pageblock_nr_pages;
	struct page *p = page;

	do {
		__ClearPageReserved(p);
		set_page_count(p, 0);
	} while (++p, --i);

	set_pageblock_migratetype(page, MIGRATE_CMA);

	if (pageblock_order >= MAX_ORDER) {
		/* Too big for one free: release it in MAX_ORDER - 1 chunks. */
		i = pageblock_nr_pages;
		p = page;
		do {
			set_page_refcounted(p);
			__free_pages(p, MAX_ORDER - 1);
			p += MAX_ORDER_NR_PAGES;
		} while (i -= MAX_ORDER_NR_PAGES);
	} else {
		set_page_refcounted(page);
		__free_pages(page, pageblock_order);
	}

	adjust_managed_page_count(page, pageblock_nr_pages);
}
#endif

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing.
 * Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 *
 * Splits the order-high block at page down to order low: at each step the
 * upper half is put on the free list of the next lower order (or made a
 * guard block) and the lower half is kept for further splitting.  area must
 * be the free_area for order high on entry.
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);

		if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
			debug_guardpage_enabled() &&
			high < debug_guardpage_minorder()) {
			/*
			 * Mark as guard pages (or page), that will allow to
			 * merge back to allocator when buddy will be freed.
			 * Corresponding page table entries will not be touched,
			 * pages will stay not present in virtual address space
			 */
			set_page_guard(zone, &page[size], high, migratetype);
			continue;
		}
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}

/*
 * This page is about to be returned from the page allocator
 *
 * Returns 1 after reporting the page through bad_page() if any check
 * fails (the last failing check supplies the reason), 0 if it is clean.
 */
static inline int check_new_page(struct page *page)
{
	const char *bad_reason = NULL;
	unsigned long bad_flags = 0;

	if (unlikely(page_mapcount(page)))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(atomic_read(&page->_count) != 0))
		bad_reason = "nonzero _count";
	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
		bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->mem_cgroup))
		bad_reason = "page still charged to cgroup";
#endif
	if (unlikely(bad_reason)) {
		bad_page(page, bad_reason, bad_flags);
		return 1;
	}
	return 0;
}

/*
 * Prepare a freshly removed order-@order block for its new owner: verify
 * every page, give the head a reference, run the arch/debug hooks, then
 * zero and/or build a compound page as @gfp_flags request.
 * Returns 1 (leaving the block unprepared) if any page fails
 * check_new_page(), 0 on success.
 */
static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
								int alloc_flags)
{
	int i;

	for (i = 0; i < (1 << order); i++) {
		struct page *p = page + i;
		if (unlikely(check_new_page(p)))
			return 1;
	}

	set_page_private(page, 0);
	set_page_refcounted(page);

	arch_alloc_page(page, order);
	kernel_map_pages(page, 1 << order, 1);
	kasan_alloc_pages(page, order);

	if (gfp_flags & __GFP_ZERO)
		prep_zero_page(page, order, gfp_flags);

	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	set_page_owner(page, order, gfp_flags);

	/*
	 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
	 * allocate the page. The expectation is that the caller is taking
	 * steps that will free more memory. The caller should avoid the page
	 * being used for !PFMEMALLOC purposes.
	 */
	page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

	return 0;
}

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 *
 * A larger block is split back down to @order with expand().  Returns NULL
 * when no list of this migratetype has a block of at least that order.
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		if (list_empty(&area->free_list[migratetype]))
			continue;

		page = list_entry(area->free_list[migratetype].next,
							struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		expand(zone, page, order, current_order, area, migratetype);
		set_freepage_migratetype(page, migratetype);
		return page;
	}

	return NULL;
}


/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 *
 * Each row is terminated by MIGRATE_RESERVE, which find_suitable_fallback()
 * treats as the end marker.
 */
static int fallbacks[MIGRATE_TYPES][4] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
#ifdef CONFIG_CMA
	[MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
#endif
	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
	[MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
#endif
};

/*
 * Try the MIGRATE_CMA free lists; always fails (NULL) when CONFIG_CMA is
 * not set.
 */
#ifdef CONFIG_CMA
static struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order)
{
	return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
					unsigned int order) { return NULL; }
#endif

/*
 * Move the free pages in a range to the free lists of the requested type.
+ * Note that start_page and end_pages are not aligned on a pageblock + * boundary. If alignment is required, use move_freepages_block() + */ +int move_freepages(struct zone *zone, + struct page *start_page, struct page *end_page, + int migratetype) +{ + struct page *page; + unsigned long order; + int pages_moved = 0; + +#ifndef CONFIG_HOLES_IN_ZONE + /* + * page_zone is not safe to call in this context when + * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant + * anyway as we check zone boundaries in move_freepages_block(). + * Remove at a later date when no bug reports exist related to + * grouping pages by mobility + */ + VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); +#endif + + for (page = start_page; page <= end_page;) { + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); + + if (!pfn_valid_within(page_to_pfn(page))) { + page++; + continue; + } + + if (!PageBuddy(page)) { + page++; + continue; + } + + order = page_order(page); + list_move(&page->lru, + &zone->free_area[order].free_list[migratetype]); + set_freepage_migratetype(page, migratetype); + page += 1 << order; + pages_moved += 1 << order; + } + + return pages_moved; +} + +int move_freepages_block(struct zone *zone, struct page *page, + int migratetype) +{ + unsigned long start_pfn, end_pfn; + struct page *start_page, *end_page; + + start_pfn = page_to_pfn(page); + start_pfn = start_pfn & ~(pageblock_nr_pages-1); + start_page = pfn_to_page(start_pfn); + end_page = start_page + pageblock_nr_pages - 1; + end_pfn = start_pfn + pageblock_nr_pages - 1; + + /* Do not cross zone boundaries */ + if (!zone_spans_pfn(zone, start_pfn)) + start_page = page; + if (!zone_spans_pfn(zone, end_pfn)) + return 0; + + return move_freepages(zone, start_page, end_page, migratetype); +} + +static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +{ + int nr_pageblocks = 1 << (start_order 
- pageblock_order); + + while (nr_pageblocks--) { + set_pageblock_migratetype(pageblock_page, migratetype); + pageblock_page += pageblock_nr_pages; + } +} + +/* + * When we are falling back to another migratetype during allocation, try to + * steal extra free pages from the same pageblocks to satisfy further + * allocations, instead of polluting multiple pageblocks. + * + * If we are stealing a relatively large buddy page, it is likely there will + * be more free pages in the pageblock, so try to steal them all. For + * reclaimable and unmovable allocations, we steal regardless of page size, + * as fragmentation caused by those allocations polluting movable pageblocks + * is worse than movable allocations stealing from unmovable and reclaimable + * pageblocks. + */ +static bool can_steal_fallback(unsigned int order, int start_mt) +{ + /* + * Leaving this order check is intended, although there is + * relaxed order check in next check. The reason is that + * we can actually steal whole pageblock if this condition met, + * but, below check doesn't guarantee it and that is just heuristic + * so could be changed anytime. + */ + if (order >= pageblock_order) + return true; + + if (order >= pageblock_order / 2 || + start_mt == MIGRATE_RECLAIMABLE || + start_mt == MIGRATE_UNMOVABLE || + page_group_by_mobility_disabled) + return true; + + return false; +} + +/* + * This function implements actual steal behaviour. If order is large enough, + * we can steal whole pageblock. If not, we first move freepages in this + * pageblock and check whether half of pages are moved or not. If half of + * pages are moved, we can change migratetype of pageblock and permanently + * use it's pages as requested migratetype in the future. 
 */
static void steal_suitable_fallback(struct zone *zone, struct page *page,
							  int start_type)
{
	int current_order = page_order(page);
	int pages;

	/* Take ownership for orders >= pageblock_order */
	if (current_order >= pageblock_order) {
		change_pageblock_range(page, current_order, start_type);
		return;
	}

	pages = move_freepages_block(zone, page, start_type);

	/* Claim the whole block if at least half of it was moved */
	if (pages >= (1 << (pageblock_order-1)) ||
			page_group_by_mobility_disabled)
		set_pageblock_migratetype(page, start_type);
}

/*
 * Check whether there is a suitable fallback freepage with requested order.
 * If only_stealable is true, this function returns fallback_mt only if
 * we can steal other freepages all together. This would help to reduce
 * fragmentation due to mixed migratetype pages in one pageblock.
 *
 * Returns the fallback migratetype, or -1 if none qualifies.  *can_steal
 * is written only when area has free pages; on the early -1 return it is
 * left untouched, so callers must test the return value first.
 */
int find_suitable_fallback(struct free_area *area, unsigned int order,
			int migratetype, bool only_stealable, bool *can_steal)
{
	int i;
	int fallback_mt;

	if (area->nr_free == 0)
		return -1;

	*can_steal = false;
	for (i = 0;; i++) {
		fallback_mt = fallbacks[migratetype][i];
		/* MIGRATE_RESERVE terminates each row of fallbacks[]. */
		if (fallback_mt == MIGRATE_RESERVE)
			break;

		if (list_empty(&area->free_list[fallback_mt]))
			continue;

		if (can_steal_fallback(order, migratetype))
			*can_steal = true;

		if (!only_stealable)
			return fallback_mt;

		if (*can_steal)
			return fallback_mt;
	}

	return -1;
}

/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
	struct free_area *area;
	unsigned int current_order;
	struct page *page;
	int fallback_mt;
	bool can_steal;

	/*
	 * Find the largest possible block of pages in the other list.
	 * current_order is unsigned, so the "<= MAX_ORDER-1" test is what
	 * ends the loop once it wraps below zero (order == 0).
	 */
	for (current_order = MAX_ORDER-1;
				current_order >= order && current_order <= MAX_ORDER-1;
				--current_order) {
		area = &(zone->free_area[current_order]);
		fallback_mt = find_suitable_fallback(area, current_order,
				start_migratetype, false, &can_steal);
		if (fallback_mt == -1)
			continue;

		page = list_entry(area->free_list[fallback_mt].next,
						struct page, lru);
		if (can_steal)
			steal_suitable_fallback(zone, page, start_migratetype);

		/* Remove the page from the freelists */
		area->nr_free--;
		list_del(&page->lru);
		rmv_page_order(page);

		expand(zone, page, order, current_order, area,
					start_migratetype);
		/*
		 * The freepage_migratetype may differ from pageblock's
		 * migratetype depending on the decisions in
		 * steal_suitable_fallback(). This is OK as long as it
		 * does not differ for MIGRATE_CMA pageblocks. For CMA
		 * we need to make sure unallocated pages flushed from
		 * pcp lists are returned to the correct freelist.
		 */
		set_freepage_migratetype(page, start_migratetype);

		trace_mm_page_alloc_extfrag(page, order, current_order,
			start_migratetype, fallback_mt);

		return page;
	}

	return NULL;
}

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 *
 * Tries, in turn: the requested migratetype, the CMA lists (movable
 * requests only), the fallback lists, and finally MIGRATE_RESERVE.
 * Returns NULL only if all of them fail.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
						int migratetype)
{
	struct page *page;

retry_reserve:
	page = __rmqueue_smallest(zone, order, migratetype);

	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
		if (migratetype == MIGRATE_MOVABLE)
			page = __rmqueue_cma_fallback(zone, order);

		if (!page)
			page = __rmqueue_fallback(zone, order, migratetype);

		/*
		 * Use MIGRATE_RESERVE rather than fail an allocation. goto
		 * is used because __rmqueue_smallest is an inline function
		 * and we want just one call site
		 */
		if (!page) {
			migratetype = MIGRATE_RESERVE;
			goto retry_reserve;
		}
	}

	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
+ * Returns the number of new pages which were placed at *list. + */ +static int rmqueue_bulk(struct zone *zone, unsigned int order, + unsigned long count, struct list_head *list, + int migratetype, bool cold) +{ + int i; + + spin_lock(&zone->lock); + for (i = 0; i < count; ++i) { + struct page *page = __rmqueue(zone, order, migratetype); + if (unlikely(page == NULL)) + break; + + /* + * Split buddy pages returned by expand() are received here + * in physical page order. The page is added to the caller's + * list and the list head then moves forward. From the caller's + * perspective, the linked list is ordered by page number in + * some conditions. This is useful for IO devices that can + * merge IO requests if the physical pages are ordered + * properly. + */ + if (likely(!cold)) + list_add(&page->lru, list); + else + list_add_tail(&page->lru, list); + list = &page->lru; /* the next page is linked relative to this one */ + if (is_migrate_cma(get_freepage_migratetype(page))) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); + } + /* i is the number of blocks actually taken, also after an early break */ + __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + spin_unlock(&zone->lock); + return i; +} + +#ifdef CONFIG_NUMA +/* + * Called from the vmstat counter updater to drain pagesets of this + * currently executing processor on remote nodes after they have + * expired. + * + * Note that this function must be called with the thread pinned to + * a single processor. + */ +void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) +{ + unsigned long flags; + LIST_HEAD(dst); + int to_drain, batch; + + local_lock_irqsave(pa_lock, flags); + batch = READ_ONCE(pcp->batch); + to_drain = min(pcp->count, batch); /* at most one batch per call */ + if (to_drain > 0) { + isolate_pcp_pages(to_drain, pcp, &dst); + pcp->count -= to_drain; + } + local_unlock_irqrestore(pa_lock, flags); + /* dst is passed to free_pcppages_bulk() only after pa_lock is released */ + free_pcppages_bulk(zone, to_drain, &dst); +} +#endif + +/* + * Drain pcplists of the indicated processor and zone. 
+ * + * The processor must either be the current processor and the + * thread pinned to the current processor or a processor that + * is not online. + */ +static void drain_pages_zone(unsigned int cpu, struct zone *zone) +{ + unsigned long flags; + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; + LIST_HEAD(dst); + int count; + + cpu_lock_irqsave(cpu, flags); + pset = per_cpu_ptr(zone->pageset, cpu); + + pcp = &pset->pcp; + count = pcp->count; + if (count) { + isolate_pcp_pages(count, pcp, &dst); + pcp->count = 0; /* the whole count was handed to isolate_pcp_pages() */ + } + cpu_unlock_irqrestore(cpu, flags); + /* dst is passed to free_pcppages_bulk() only after the cpu lock is dropped */ + if (count) + free_pcppages_bulk(zone, count, &dst); +} + +/* + * Drain pcplists of all zones on the indicated processor. + * + * The processor must either be the current processor and the + * thread pinned to the current processor or a processor that + * is not online. + */ +static void drain_pages(unsigned int cpu) +{ + struct zone *zone; + + for_each_populated_zone(zone) { + drain_pages_zone(cpu, zone); + } +} + +/* + * Spill all of this CPU's per-cpu pages back into the buddy allocator. + * + * The CPU has to be pinned. When zone parameter is non-NULL, spill just + * the single zone's pages. + */ +void drain_local_pages(struct zone *zone) +{ + int cpu = smp_processor_id(); /* meaningful only because the caller is pinned */ + + if (zone) + drain_pages_zone(cpu, zone); + else + drain_pages(cpu); +} + +/* + * Spill all the per-cpu pages from all CPUs back into the buddy allocator. + * + * When zone parameter is non-NULL, spill just the single zone's pages. + * + * Note that this code is protected against sending an IPI to an offline + * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: + * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but + * nothing keeps CPUs from showing up after we populated the cpumask and + * before the call to on_each_cpu_mask(). 
+ */ +void drain_all_pages(struct zone *zone) +{ + int cpu; + + /* + * Allocate in the BSS so we won't require allocation in + * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y + */ + static cpumask_t cpus_with_pcps; + + /* + * We don't care about racing with CPU hotplug event + * as offline notification will cause the notified + * cpu to drain that CPU pcps and on_each_cpu_mask + * disables preemption as part of its processing + */ + for_each_online_cpu(cpu) { + struct per_cpu_pageset *pcp; + struct zone *z; + bool has_pcps = false; + + if (zone) { + pcp = per_cpu_ptr(zone->pageset, cpu); + if (pcp->pcp.count) + has_pcps = true; + } else { + for_each_populated_zone(z) { + pcp = per_cpu_ptr(z->pageset, cpu); + if (pcp->pcp.count) { + has_pcps = true; + break; + } + } + } + + /* the static mask is reused across calls, so stale bits must be cleared too */ + if (has_pcps) + cpumask_set_cpu(cpu, &cpus_with_pcps); + else + cpumask_clear_cpu(cpu, &cpus_with_pcps); + } +#ifndef CONFIG_PREEMPT_RT_BASE + on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, + zone, 1); +#else /* CONFIG_PREEMPT_RT_BASE: drain each marked CPU's lists from this context */ + for_each_cpu(cpu, &cpus_with_pcps) { + if (zone) + drain_pages_zone(cpu, zone); + else + drain_pages(cpu); + } +#endif /* CONFIG_PREEMPT_RT_BASE */ +} + +#ifdef CONFIG_HIBERNATION + +/* + * Recompute the swsusp "free" marking of @zone while holding zone->lock: + * clear it on every valid pfn whose page is not swsusp-forbidden, then set + * it on each page of every block found on the zone's free lists. + * Does nothing for an empty zone. + */ +void mark_free_pages(struct zone *zone) +{ + unsigned long pfn, max_zone_pfn; + unsigned long flags; + unsigned int order, t; + struct list_head *curr; + + if (zone_is_empty(zone)) + return; + + spin_lock_irqsave(&zone->lock, flags); + + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } + + for_each_migratetype_order(order, t) { + list_for_each(curr, &zone->free_area[order].free_list[t]) { + unsigned long i; + + /* a free-list entry is the first page of a 2^order block */ + pfn = page_to_pfn(list_entry(curr, struct page, lru)); + for (i = 0; i < (1UL << order); i++) + swsusp_set_page_free(pfn_to_page(pfn + i)); + } + } + spin_unlock_irqrestore(&zone->lock, flags); +} +#endif /* CONFIG_PM 
*/ + +/* + * Free a 0-order page + * cold == true ? free a cold page : free a hot page + */ +void free_hot_cold_page(struct page *page, bool cold) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + unsigned long flags; + unsigned long pfn = page_to_pfn(page); + int migratetype; + + if (!free_pages_prepare(page, 0)) + return; + + migratetype = get_pfnblock_migratetype(page, pfn); + set_freepage_migratetype(page, migratetype); + local_lock_irqsave(pa_lock, flags); + __count_vm_event(PGFREE); + + /* + * We only track unmovable, reclaimable and movable on pcp lists. + * Free ISOLATE pages back to the allocator because they are being + * offlined but treat RESERVE as movable pages so we can get those + * areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ + if (migratetype >= MIGRATE_PCPTYPES) { + if (unlikely(is_migrate_isolate(migratetype))) { + free_one_page(zone, page, pfn, 0, migratetype); + goto out; + } + migratetype = MIGRATE_MOVABLE; + } + + pcp = &this_cpu_ptr(zone->pageset)->pcp; + if (!cold) + list_add(&page->lru, &pcp->lists[migratetype]); + else + list_add_tail(&page->lru, &pcp->lists[migratetype]); + pcp->count++; + if (pcp->count >= pcp->high) { + unsigned long batch = READ_ONCE(pcp->batch); + LIST_HEAD(dst); + + isolate_pcp_pages(batch, pcp, &dst); + pcp->count -= batch; + local_unlock_irqrestore(pa_lock, flags); + free_pcppages_bulk(zone, batch, &dst); + return; + } + +out: + local_unlock_irqrestore(pa_lock, flags); +} + +/* + * Free a list of 0-order pages + */ +void free_hot_cold_page_list(struct list_head *list, bool cold) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + trace_mm_page_free_batched(page, cold); + free_hot_cold_page(page, cold); + } +} + +/* + * split_page takes a non-compound higher-order page, and splits it into + * n (1<lru); + zone->free_area[order].nr_free--; + rmv_page_order(page); + + /* Set the pageblock if the isolated 
page is at least a pageblock */ + if (order >= pageblock_order - 1) { + struct page *endpage = page + (1 << order) - 1; + for (; page < endpage; page += pageblock_nr_pages) { + int mt = get_pageblock_migratetype(page); + if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) + set_pageblock_migratetype(page, + MIGRATE_MOVABLE); + } + } + + set_page_owner(page, order, 0); + return 1UL << order; +} + +/* + * Similar to split_page except the page is already free. As this is only + * being used for migration, the migratetype of the block also changes. + * As this is called with interrupts disabled, the caller is responsible + * for calling arch_alloc_page() and kernel_map_page() after interrupts + * are enabled. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. + */ +int split_free_page(struct page *page) +{ + unsigned int order; + int nr_pages; + + order = page_order(page); + + nr_pages = __isolate_free_page(page, order); + if (!nr_pages) + return 0; + + /* Split into individual pages */ + set_page_refcounted(page); + split_page(page, order); + return nr_pages; +} + +/* + * Allocate a page from the given zone. Use pcplists for order-0 allocations. 
+ */ +static inline +struct page *buffered_rmqueue(struct zone *preferred_zone, + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) +{ + unsigned long flags; + struct page *page; + bool cold = ((gfp_flags & __GFP_COLD) != 0); + + if (likely(order == 0)) { + struct per_cpu_pages *pcp; + struct list_head *list; + + local_lock_irqsave(pa_lock, flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) + goto failed; + } + + if (cold) + page = list_entry(list->prev, struct page, lru); + else + page = list_entry(list->next, struct page, lru); + + list_del(&page->lru); + pcp->count--; + } else { + if (unlikely(gfp_flags & __GFP_NOFAIL)) { + /* + * __GFP_NOFAIL is not to be used in new code. + * + * All __GFP_NOFAIL callers should be fixed so that they + * properly detect and handle allocation failures. + * + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with + * __GFP_NOFAIL. 
+ */ + WARN_ON_ONCE(order > 1); + } + local_spin_lock_irqsave(pa_lock, &zone->lock, flags); + page = __rmqueue(zone, order, migratetype); + if (!page) { + spin_unlock(&zone->lock); + goto failed; + } + __mod_zone_freepage_state(zone, -(1 << order), + get_freepage_migratetype(page)); + spin_unlock(&zone->lock); + } + + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && + !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) + set_bit(ZONE_FAIR_DEPLETED, &zone->flags); + + __count_zone_vm_events(PGALLOC, zone, 1 << order); + zone_statistics(preferred_zone, zone, gfp_flags); + local_unlock_irqrestore(pa_lock, flags); + + VM_BUG_ON_PAGE(bad_range(zone, page), page); + return page; + +failed: + local_unlock_irqrestore(pa_lock, flags); + return NULL; +} + +#ifdef CONFIG_FAIL_PAGE_ALLOC + +static struct { + struct fault_attr attr; + + u32 ignore_gfp_highmem; + u32 ignore_gfp_wait; + u32 min_order; +} fail_page_alloc = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_gfp_wait = 1, + .ignore_gfp_highmem = 1, + .min_order = 1, +}; + +static int __init setup_fail_page_alloc(char *str) +{ + return setup_fault_attr(&fail_page_alloc.attr, str); +} +__setup("fail_page_alloc=", setup_fail_page_alloc); + +static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + if (order < fail_page_alloc.min_order) + return false; + if (gfp_mask & __GFP_NOFAIL) + return false; + if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) + return false; + if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) + return false; + + return should_fail(&fail_page_alloc.attr, 1 << order); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_page_alloc_debugfs(void) +{ + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_page_alloc", NULL, + &fail_page_alloc.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + if 
(!debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_wait)) + goto fail; + if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem)) + goto fail; + if (!debugfs_create_u32("min-order", mode, dir, + &fail_page_alloc.min_order)) + goto fail; + + return 0; +fail: + debugfs_remove_recursive(dir); + + return -ENOMEM; +} + +late_initcall(fail_page_alloc_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else /* CONFIG_FAIL_PAGE_ALLOC */ + +static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + return false; +} + +#endif /* CONFIG_FAIL_PAGE_ALLOC */ + +/* + * Return true if free pages are above 'mark'. This takes into account the order + * of the allocation. + */ +static bool __zone_watermark_ok(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags, + long free_pages) +{ + /* free_pages may go negative - that's OK */ + long min = mark; + int o; + long free_cma = 0; + + free_pages -= (1 << order) - 1; + if (alloc_flags & ALLOC_HIGH) + min -= min / 2; + if (alloc_flags & ALLOC_HARDER) + min -= min / 4; +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); +#endif + + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) + return false; + for (o = 0; o < order; o++) { + /* At the next order, this order's pages become unavailable */ + free_pages -= z->free_area[o].nr_free << o; + + /* Require fewer higher order pages to be free */ + min >>= 1; + + if (free_pages <= min) + return false; + } + return true; +} + +bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + zone_page_state(z, NR_FREE_PAGES)); +} + +bool zone_watermark_ok_safe(struct zone *z, unsigned int 
order, + unsigned long mark, int classzone_idx, int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + + if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) + free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); +} + +#ifdef CONFIG_NUMA +/* + * zlc_setup - Setup for "zonelist cache". Uses cached zone data to + * skip over zones that are not allowed by the cpuset, or that have + * been recently (in last second) found to be nearly full. See further + * comments in mmzone.h. Reduces cache footprint of zonelist scans + * that have to skip over a lot of full or unallowed zones. + * + * If the zonelist cache is present in the passed zonelist, then + * returns a pointer to the allowed node mask (either the current + * tasks mems_allowed, or node_states[N_MEMORY].) + * + * If the zonelist cache is not available for this zonelist, does + * nothing and returns NULL. + * + * If the fullzones BITMAP in the zonelist cache is stale (more than + * a second since last zap'd) then we zap it out (clear its bits.) + * + * We hold off even calling zlc_setup, until after we've checked the + * first zone in the zonelist, on the theory that most allocations will + * be satisfied from that first zone, so best to examine that zone as + * quickly as we can. + */ +static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + nodemask_t *allowednodes; /* zonelist_cache approximation */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return NULL; + + if (time_after(jiffies, zlc->last_full_zap + HZ)) { + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); + zlc->last_full_zap = jiffies; + } + + allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 
+ &cpuset_current_mems_allowed : + &node_states[N_MEMORY]; + return allowednodes; +} + +/* + * Given 'z' scanning a zonelist, run a couple of quick checks to see + * if it is worth looking at further for free memory: + * 1) Check that the zone isn't thought to be full (doesn't have its + * bit set in the zonelist_cache fullzones BITMAP). + * 2) Check that the zones node (obtained from the zonelist_cache + * z_to_n[] mapping) is allowed in the passed in allowednodes mask. + * Return true (non-zero) if zone is worth looking at further, or + * else return false (zero) if it is not. + * + * This check -ignores- the distinction between various watermarks, + * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is + * found to be full for any variation of these watermarks, it will + * be considered full for up to one second by all requests, unless + * we are so low on memory on all allowed nodes that we are forced + * into the second scan of the zonelist. + * + * In the second scan we ignore this zonelist cache and exactly + * apply the watermarks to all zones, even it is slower to do so. + * We are low on memory in the second scan, and should leave no stone + * unturned looking for a free page. + */ +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, + nodemask_t *allowednodes) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + int i; /* index of *z in zonelist zones */ + int n; /* node that zone *z is on */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return 1; + + i = z - zonelist->_zonerefs; + n = zlc->z_to_n[i]; + + /* This zone is worth trying if it is allowed but not full */ + return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); +} + +/* + * Given 'z' scanning a zonelist, set the corresponding bit in + * zlc->fullzones, so that subsequent attempts to allocate a page + * from that zone don't waste time re-examining it. 
+ */ +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + int i; /* index of *z in zonelist zones */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return; + + i = z - zonelist->_zonerefs; + + set_bit(i, zlc->fullzones); +} + +/* + * clear all zones full, called after direct reclaim makes progress so that + * a zone that was recently full is not skipped over for up to a second + */ +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return; + + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); +} + +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ + return local_zone->node == zone->node; +} + +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + RECLAIM_DISTANCE; +} + +#else /* CONFIG_NUMA */ + +static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) +{ + return NULL; +} + +static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, + nodemask_t *allowednodes) +{ + return 1; +} + +static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) +{ +} + +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ +} + +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ + return true; +} + +static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) +{ + return true; +} + +#endif /* CONFIG_NUMA */ + +static void reset_alloc_batches(struct zone *preferred_zone) +{ + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); + } while (zone++ != 
preferred_zone); +} + +/* + * get_page_from_freelist goes through the zonelist trying to allocate + * a page. + */ +static struct page * +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, + const struct alloc_context *ac) +{ + struct zonelist *zonelist = ac->zonelist; + struct zoneref *z; + struct page *page = NULL; + struct zone *zone; + nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ + int zlc_active = 0; /* set if using zonelist_cache */ + int did_zlc_setup = 0; /* just call zlc_setup() one time */ + bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; + +zonelist_scan: + zonelist_rescan = false; + + /* + * Scan zonelist, looking for a zone with enough free. + * See also __cpuset_node_allowed() comment in kernel/cpuset.c. + */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long mark; + + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && + !zlc_zone_worth_trying(zonelist, z, allowednodes)) + continue; + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && + !cpuset_zone_allowed(zone, gfp_mask)) + continue; + /* + * Distribute pages in proportion to the individual + * zone size to ensure fair page aging. The zone a + * page was allocated in should have no effect on the + * time the page has in memory before being reclaimed. + */ + if (alloc_flags & ALLOC_FAIR) { + if (!zone_local(ac->preferred_zone, zone)) + break; + if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { + nr_fair_skipped++; + continue; + } + } + /* + * When allocating a page cache page for writing, we + * want to get it from a zone that is within its dirty + * limit, such that no single zone holds more than its + * proportional share of globally allowed dirty pages. 
+ * The dirty limits take into account the zone's + * lowmem reserves and high watermark so that kswapd + * should be able to balance it without having to + * write pages from its LRU list. + * + * This may look like it could increase pressure on + * lower zones by failing allocations in higher zones + * before they are full. But the pages that do spill + * over are limited as the lower zones are protected + * by this very same mechanism. It should not become + * a practical burden to them. + * + * XXX: For now, allow allocations to potentially + * exceed the per-zone dirty limit in the slowpath + * (ALLOC_WMARK_LOW unset) before going into reclaim, + * which is important when on a NUMA setup the allowed + * zones are together not big enough to reach the + * global limit. The proper fix for these situations + * will require awareness of zones in the + * dirty-throttling and the flusher threads. + */ + if (consider_zone_dirty && !zone_dirty_ok(zone)) + continue; + + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + if (!zone_watermark_ok(zone, order, mark, + ac->classzone_idx, alloc_flags)) { + int ret; + + /* Checked here to keep the fast path fast */ + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (alloc_flags & ALLOC_NO_WATERMARKS) + goto try_this_zone; + + if (IS_ENABLED(CONFIG_NUMA) && + !did_zlc_setup && nr_online_nodes > 1) { + /* + * we do zlc_setup if there are multiple nodes + * and before considering the first zone allowed + * by the cpuset. + */ + allowednodes = zlc_setup(zonelist, alloc_flags); + zlc_active = 1; + did_zlc_setup = 1; + } + + if (zone_reclaim_mode == 0 || + !zone_allows_reclaim(ac->preferred_zone, zone)) + goto this_zone_full; + + /* + * As we may have just activated ZLC, check if the first + * eligible zone has failed zone_reclaim recently. 
+ */ + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && + !zlc_zone_worth_trying(zonelist, z, allowednodes)) + continue; + + ret = zone_reclaim(zone, gfp_mask, order); + switch (ret) { + case ZONE_RECLAIM_NOSCAN: + /* did not scan */ + continue; + case ZONE_RECLAIM_FULL: + /* scanned but unreclaimable */ + continue; + default: + /* did we reclaim enough */ + if (zone_watermark_ok(zone, order, mark, + ac->classzone_idx, alloc_flags)) + goto try_this_zone; + + /* + * Failed to reclaim enough to meet watermark. + * Only mark the zone full if checking the min + * watermark or if we failed to reclaim just + * 1<preferred_zone, zone, order, + gfp_mask, ac->migratetype); + if (page) { + if (prep_new_page(page, order, gfp_mask, alloc_flags)) + goto try_this_zone; + return page; + } +this_zone_full: + if (IS_ENABLED(CONFIG_NUMA) && zlc_active) + zlc_mark_zone_full(zonelist, z); + } + + /* + * The first pass makes sure allocations are spread fairly within the + * local node. However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(ac->preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; +} + +/* + * Large machines with many possible nodes should not always dump per-node + * meminfo in irq context. 
+ */ +static inline bool should_suppress_show_mem(void) +{ + bool ret = false; + +#if NODES_SHIFT > 8 + ret = in_interrupt(); +#endif + return ret; +} + +static DEFINE_RATELIMIT_STATE(nopage_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + +void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) +{ + unsigned int filter = SHOW_MEM_FILTER_NODES; + + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) + return; + + /* + * This documents exceptions given to allocations in certain + * contexts that are allowed to allocate outside current's set + * of allowed nodes. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (test_thread_flag(TIF_MEMDIE) || + (current->flags & (PF_MEMALLOC | PF_EXITING))) + filter &= ~SHOW_MEM_FILTER_NODES; + if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) + filter &= ~SHOW_MEM_FILTER_NODES; + + if (fmt) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_warn("%pV", &vaf); + + va_end(args); + } + + pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", + current->comm, order, gfp_mask); + + dump_stack(); + if (!should_suppress_show_mem()) + show_mem(filter); +} + +static inline int +should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long did_some_progress, + unsigned long pages_reclaimed) +{ + /* Do not loop if specifically requested */ + if (gfp_mask & __GFP_NORETRY) + return 0; + + /* Always retry if specifically requested */ + if (gfp_mask & __GFP_NOFAIL) + return 1; + + /* + * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim + * making forward progress without invoking OOM. Suspend also disables + * storage devices so kswapd will not help. Bail if we are suspending. 
+ */ + if (!did_some_progress && pm_suspended_storage()) + return 0; + + /* + * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER + * means __GFP_NOFAIL, but that may not be true in other + * implementations. + */ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return 1; + + /* + * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is + * specified, then we retry until we no longer reclaim any pages + * (above), or we've reclaimed an order of pages at least as + * large as the allocation's order. In both cases, if the + * allocation still fails, we stop retrying. + */ + if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) + return 1; + + return 0; +} + +static inline struct page * +__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, + const struct alloc_context *ac, unsigned long *did_some_progress) +{ + struct page *page; + + *did_some_progress = 0; + + /* + * Acquire the per-zone oom lock for each zone. If that + * fails, somebody else is making progress for us. + */ + if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { + *did_some_progress = 1; + schedule_timeout_uninterruptible(1); + return NULL; + } + + /* + * Go through the zonelist yet one more time, keep very high watermark + * here, this is only to catch a parallel oom killing, we must fail if + * we're still under heavy pressure. 
+ */ + page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, + ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); + if (page) + goto out; + + if (!(gfp_mask & __GFP_NOFAIL)) { + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (ac->high_zoneidx < ZONE_NORMAL) + goto out; + /* The OOM killer does not compensate for light reclaim */ + if (!(gfp_mask & __GFP_FS)) { + /* + * XXX: Page reclaim didn't yield anything, + * and the OOM killer can't be invoked, but + * keep looping as per should_alloc_retry(). + */ + *did_some_progress = 1; + goto out; + } + /* The OOM killer may not free memory on a specific node */ + if (gfp_mask & __GFP_THISNODE) + goto out; + } + /* Exhausted what can be done so it's blamo time */ + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) + || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) + *did_some_progress = 1; +out: + oom_zonelist_unlock(ac->zonelist, gfp_mask); + return page; +} + +#ifdef CONFIG_COMPACTION +/* Try memory compaction for high-order allocations before reclaim */ +static struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended_compaction, + bool *deferred_compaction) +{ + unsigned long compact_result; + struct page *page; + + if (!order) + return NULL; + + current->flags |= PF_MEMALLOC; + compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + mode, contended_compaction); + current->flags &= ~PF_MEMALLOC; + + switch (compact_result) { + case COMPACT_DEFERRED: + *deferred_compaction = true; + /* fall-through */ + case COMPACT_SKIPPED: + return NULL; + default: + break; + } + + /* + * At least in one zone compaction wasn't deferred or skipped, so let's + * count 
a compaction stall + */ + count_vm_event(COMPACTSTALL); + + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + + if (page) { + struct zone *zone = page_zone(page); + + zone->compact_blockskip_flush = false; + compaction_defer_reset(zone, order, true); + count_vm_event(COMPACTSUCCESS); + return page; + } + + /* + * It's bad if compaction run occurs and fails. The most likely reason + * is that pages exist, but not enough to satisfy watermarks. + */ + count_vm_event(COMPACTFAIL); + + cond_resched(); + + return NULL; +} +#else +static inline struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended_compaction, + bool *deferred_compaction) +{ + return NULL; +} +#endif /* CONFIG_COMPACTION */ + +/* Perform direct synchronous page reclaim */ +static int +__perform_reclaim(gfp_t gfp_mask, unsigned int order, + const struct alloc_context *ac) +{ + struct reclaim_state reclaim_state; + int progress; + + cond_resched(); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); + current->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(gfp_mask); + reclaim_state.reclaimed_slab = 0; + current->reclaim_state = &reclaim_state; + + progress = try_to_free_pages(ac->zonelist, order, gfp_mask, + ac->nodemask); + + current->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + current->flags &= ~PF_MEMALLOC; + + cond_resched(); + + return progress; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + unsigned long *did_some_progress) +{ + struct page *page = NULL; + bool drained = false; + + *did_some_progress = __perform_reclaim(gfp_mask, order, ac); + if (unlikely(!(*did_some_progress))) + return NULL; + + /* After 
successful reclaim, reconsider all zones for allocation */ + if (IS_ENABLED(CONFIG_NUMA)) + zlc_clear_zones_full(ac->zonelist); + +retry: + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + + /* + * If an allocation failed after direct reclaim, it could be because + * pages are pinned on the per-cpu lists. Drain them and try again + */ + if (!page && !drained) { + drain_all_pages(NULL); + drained = true; + goto retry; + } + + return page; +} + +/* + * This is called in the allocator slow-path if the allocation request is of + * sufficient urgency to ignore watermarks and take other desperate measures + */ +static inline struct page * +__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, + const struct alloc_context *ac) +{ + struct page *page; + + do { + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); + + if (!page && gfp_mask & __GFP_NOFAIL) + wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, + HZ/50); + } while (!page && (gfp_mask & __GFP_NOFAIL)); + + return page; +} + +static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) +{ + struct zoneref *z; + struct zone *zone; + + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->high_zoneidx, ac->nodemask) + wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); +} + +static inline int +gfp_to_alloc_flags(gfp_t gfp_mask) +{ + int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); + + /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); + + /* + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 
+ */ + alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); + + if (atomic) { + /* + * Not worth trying to allocate harder for __GFP_NOMEMALLOC even + * if it can't schedule. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + alloc_flags |= ALLOC_HARDER; + /* + * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the + * comment for __cpuset_node_allowed(). + */ + alloc_flags &= ~ALLOC_CPUSET; + } else if (unlikely(rt_task(current)) && !in_interrupt()) + alloc_flags |= ALLOC_HARDER; + + if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { + if (gfp_mask & __GFP_MEMALLOC) + alloc_flags |= ALLOC_NO_WATERMARKS; + else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) + alloc_flags |= ALLOC_NO_WATERMARKS; + else if (!in_interrupt() && + ((current->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) + alloc_flags |= ALLOC_NO_WATERMARKS; + } +#ifdef CONFIG_CMA + if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; +#endif + return alloc_flags; +} + +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +{ + return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); +} + +static inline struct page * +__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + struct alloc_context *ac) +{ + const gfp_t wait = gfp_mask & __GFP_WAIT; + struct page *page = NULL; + int alloc_flags; + unsigned long pages_reclaimed = 0; + unsigned long did_some_progress; + enum migrate_mode migration_mode = MIGRATE_ASYNC; + bool deferred_compaction = false; + int contended_compaction = COMPACT_CONTENDED_NONE; + + /* + * In the slowpath, we sanity check order to avoid ever trying to + * reclaim >= MAX_ORDER areas which will never succeed. Callers may + * be using allocators in order of preference for an area that is + * too large. + */ + if (order >= MAX_ORDER) { + WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); + return NULL; + } + + /* + * If this allocation cannot block and it is for a specific node, then + * fail early. 
There's no need to wakeup kswapd or retry for a + * speculative node-specific allocation. + */ + if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) + goto nopage; + +retry: + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapds(order, ac); + + /* + * OK, we're below the kswapd watermark and have kicked background + * reclaim. Now things get more complex, so set up alloc_flags according + * to how we want to proceed. + */ + alloc_flags = gfp_to_alloc_flags(gfp_mask); + + /* + * Find the true preferred zone if the allocation is unconstrained by + * cpusets. + */ + if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { + struct zoneref *preferred_zoneref; + preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, NULL, &ac->preferred_zone); + ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); + } + + /* This is the last chance, in general, before the goto nopage. */ + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); + if (page) + goto got_pg; + + /* Allocate without watermarks if the context allows */ + if (alloc_flags & ALLOC_NO_WATERMARKS) { + /* + * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds + * the allocation is high priority and these type of + * allocations are system rather than user orientated + */ + ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); + + page = __alloc_pages_high_priority(gfp_mask, order, ac); + + if (page) { + goto got_pg; + } + } + + /* Atomic allocations - we can't balance anything */ + if (!wait) { + /* + * All existing users of the deprecated __GFP_NOFAIL are + * blockable, so warn of any new users that actually allow this + * type of allocation to fail. 
+ */ + WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); + goto nopage; + } + + /* Avoid recursion of direct reclaim */ + if (current->flags & PF_MEMALLOC) + goto nopage; + + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + goto nopage; + + /* + * Try direct compaction. The first pass is asynchronous. Subsequent + * attempts after direct reclaim are synchronous + */ + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, + migration_mode, + &contended_compaction, + &deferred_compaction); + if (page) + goto got_pg; + + /* Checks for THP-specific high-order allocations */ + if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { + /* + * If compaction is deferred for high-order allocations, it is + * because sync compaction recently failed. If this is the case + * and the caller requested a THP allocation, we do not want + * to heavily disrupt the system, so we fail the allocation + * instead of entering direct reclaim. + */ + if (deferred_compaction) + goto nopage; + + /* + * In all zones where compaction was attempted (and not + * deferred or skipped), lock contention has been detected. + * For THP allocation we do not want to disrupt the others + * so we fallback to base pages instead. + */ + if (contended_compaction == COMPACT_CONTENDED_LOCK) + goto nopage; + + /* + * If compaction was aborted due to need_resched(), we do not + * want to further increase allocation latency, unless it is + * khugepaged trying to collapse. + */ + if (contended_compaction == COMPACT_CONTENDED_SCHED + && !(current->flags & PF_KTHREAD)) + goto nopage; + } + + /* + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. 
+ */ + if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || + (current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_SYNC_LIGHT; + + /* Try direct reclaim and then allocating */ + page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, + &did_some_progress); + if (page) + goto got_pg; + + /* Check if we should retry the allocation */ + pages_reclaimed += did_some_progress; + if (should_alloc_retry(gfp_mask, order, did_some_progress, + pages_reclaimed)) { + /* + * If we fail to make progress by freeing individual + * pages, but the allocation wants us to keep going, + * start OOM killing tasks. + */ + if (!did_some_progress) { + page = __alloc_pages_may_oom(gfp_mask, order, ac, + &did_some_progress); + if (page) + goto got_pg; + if (!did_some_progress) + goto nopage; + } + /* Wait for some write requests to complete then retry */ + wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); + goto retry; + } else { + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, + alloc_flags, ac, migration_mode, + &contended_compaction, + &deferred_compaction); + if (page) + goto got_pg; + } + +nopage: + warn_alloc_failed(gfp_mask, order, NULL); +got_pg: + return page; +} + +/* + * This is the 'heart' of the zoned buddy allocator. 
+ *
+ * Entry point for a page allocation of the given @order and @gfp_mask from
+ * @zonelist, optionally restricted to @nodemask.  @gfp_mask is masked with
+ * gfp_allowed_mask first.  One attempt is made with get_page_from_freelist()
+ * using ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR (plus ALLOC_CMA for
+ * MIGRATE_MOVABLE requests when CONFIG_CMA is enabled); if it yields nothing,
+ * __alloc_pages_slowpath() is called with the mask produced by
+ * memalloc_noio_flags().
+ *
+ * Returns the page, or NULL when should_fail_alloc_page() says so, when the
+ * zonelist has no zone, when no preferred zone is found, or when both
+ * attempts fail.  A NULL result is retried from retry_cpuset as long as
+ * read_mems_allowed_retry() reports that mems_allowed changed meanwhile.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+			struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	struct zoneref *preferred_zoneref;
+	struct page *page = NULL;
+	unsigned int cpuset_mems_cookie;
+	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
+	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+	struct alloc_context ac = {
+		.high_zoneidx = gfp_zone(gfp_mask),
+		.nodemask = nodemask,
+		.migratetype = gfpflags_to_migratetype(gfp_mask),
+	};
+
+	gfp_mask &= gfp_allowed_mask;
+
+	lockdep_trace_alloc(gfp_mask);
+
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+
+	if (should_fail_alloc_page(gfp_mask, order))
+		return NULL;
+
+	/*
+	 * Check the zones suitable for the gfp_mask contain at least one
+	 * valid zone. It's possible to have an empty zonelist as a result
+	 * of __GFP_THISNODE and a memoryless node
+	 */
+	if (unlikely(!zonelist->_zonerefs->zone))
+		return NULL;
+
+	if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
+		alloc_flags |= ALLOC_CMA;
+
+retry_cpuset:
+	cpuset_mems_cookie = read_mems_allowed_begin();
+
+	/* We set it here, as __alloc_pages_slowpath might have changed it */
+	ac.zonelist = zonelist;
+	/* The preferred zone is used for statistics later */
+	preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
+				ac.nodemask ? : &cpuset_current_mems_allowed,
+				&ac.preferred_zone);
+	if (!ac.preferred_zone)
+		goto out;
+	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
+
+	/* First allocation attempt */
+	alloc_mask = gfp_mask|__GFP_HARDWALL;
+	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
+	if (unlikely(!page)) {
+		/*
+		 * Runtime PM, block IO and its error handling path
+		 * can deadlock because I/O on the device might not
+		 * complete.
+		 */
+		alloc_mask = memalloc_noio_flags(gfp_mask);
+
+		page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+	}
+
+	if (kmemcheck_enabled && page)
+		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+
+	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+
+out:
+	/*
+	 * When updating a task's mems_allowed, it is possible to race with
+	 * parallel threads in such a way that an allocation can fail while
+	 * the mask is being updated. If a page allocation is about to fail,
+	 * check if the cpuset changed during allocation and if so, retry.
+	 */
+	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+		goto retry_cpuset;
+
+	return page;
+}
+EXPORT_SYMBOL(__alloc_pages_nodemask);
+
+/*
+ * Common helper functions.
+ *
+ * __get_free_pages() allocates with alloc_pages() and hands back the
+ * page_address() of the result as an unsigned long, or 0 when the
+ * allocation fails.
+ */
+unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
+{
+	struct page *page;
+
+	/*
+	 * __get_free_pages() returns a virtual address, which cannot represent
+	 * a highmem page
+	 */
+	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
+
+	page = alloc_pages(gfp_mask, order);
+	if (!page)
+		return 0;
+	return (unsigned long) page_address(page);
+}
+EXPORT_SYMBOL(__get_free_pages);
+/*
+ * Order-0 wrapper around __get_free_pages() that adds __GFP_ZERO to
+ * @gfp_mask.
+ */
+unsigned long get_zeroed_page(gfp_t gfp_mask)
+{
+	return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
+}
+EXPORT_SYMBOL(get_zeroed_page);
+/*
+ * Drop one reference on @page with put_page_testzero(); when that test
+ * succeeds the page is freed: order 0 through free_hot_cold_page(page, false),
+ * any other order through __free_pages_ok().
+ */
+void __free_pages(struct page *page, unsigned int order)
+{
+	if (put_page_testzero(page)) {
+		if (order == 0)
+			free_hot_cold_page(page, false);
+		else
+			__free_pages_ok(page, order);
+	}
+}
+
+EXPORT_SYMBOL(__free_pages);
+/*
+ * Address-based variant of __free_pages(): @addr is converted with
+ * virt_to_page().  A zero @addr is silently ignored.
+ */
+void free_pages(unsigned long addr, unsigned int order)
+{
+	if (addr != 0) {
+		VM_BUG_ON(!virt_addr_valid((void *)addr));
+		__free_pages(virt_to_page((void *)addr), order);
+	}
+}
+
+EXPORT_SYMBOL(free_pages);
+
+/*
+ * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
+ * of the current memory cgroup.
+ *
+ * It should be used when the caller would like to use kmalloc, but since the
+ * allocation is large, it has to fall back to the page allocator.
+ */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. + */ +void __free_kmem_pages(struct page *page, unsigned int order) +{ + memcg_kmem_uncharge_pages(page, order); + __free_pages(page, order); +} + +void free_kmem_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + VM_BUG_ON(!virt_addr_valid((void *)addr)); + __free_kmem_pages(virt_to_page((void *)addr), order); + } +} + +static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) +{ + if (addr) { + unsigned long alloc_end = addr + (PAGE_SIZE << order); + unsigned long used = addr + PAGE_ALIGN(size); + + split_page(virt_to_page((void *)addr), order); + while (used < alloc_end) { + free_page(used); + used += PAGE_SIZE; + } + } + return (void *)addr; +} + +/** + * alloc_pages_exact - allocate an exact number physically-contiguous pages. + * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation + * + * This function is similar to alloc_pages(), except that it allocates the + * minimum number of pages to satisfy the request. alloc_pages() can only + * allocate memory in power-of-two pages. + * + * This function is also limited by MAX_ORDER. + * + * Memory allocated by this function must be released by free_pages_exact(). 
+ */ +void *alloc_pages_exact(size_t size, gfp_t gfp_mask) +{ + unsigned int order = get_order(size); + unsigned long addr; + + addr = __get_free_pages(gfp_mask, order); + return make_alloc_exact(addr, order, size); +} +EXPORT_SYMBOL(alloc_pages_exact); + +/** + * alloc_pages_exact_nid - allocate an exact number of physically-contiguous + * pages on a node. + * @nid: the preferred node ID where memory should be allocated + * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation + * + * Like alloc_pages_exact(), but try to allocate on node nid first before falling + * back. + * Note this is not alloc_pages_exact_node() which allocates on a specific node, + * but is not exact. + */ +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +{ + unsigned order = get_order(size); + struct page *p = alloc_pages_node(nid, gfp_mask, order); + if (!p) + return NULL; + return make_alloc_exact((unsigned long)page_address(p), order, size); +} + +/** + * free_pages_exact - release memory allocated via alloc_pages_exact() + * @virt: the value returned by alloc_pages_exact. + * @size: size of allocation, same value as passed to alloc_pages_exact(). + * + * Release the memory allocated by a previous call to alloc_pages_exact. + */ +void free_pages_exact(void *virt, size_t size) +{ + unsigned long addr = (unsigned long)virt; + unsigned long end = addr + PAGE_ALIGN(size); + + while (addr < end) { + free_page(addr); + addr += PAGE_SIZE; + } +} +EXPORT_SYMBOL(free_pages_exact); + +/** + * nr_free_zone_pages - count number of pages beyond high watermark + * @offset: The zone index of the highest zone + * + * nr_free_zone_pages() counts the number of counts pages which are beyond the + * high watermark within all zones at or below a given zone index. 
For each + * zone, the number of pages is calculated as: + * managed_pages - high_pages + */ +static unsigned long nr_free_zone_pages(int offset) +{ + struct zoneref *z; + struct zone *zone; + + /* Just pick one node, since fallback list is circular */ + unsigned long sum = 0; + + struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); + + for_each_zone_zonelist(zone, z, zonelist, offset) { + unsigned long size = zone->managed_pages; + unsigned long high = high_wmark_pages(zone); + if (size > high) + sum += size - high; + } + + return sum; +} + +/** + * nr_free_buffer_pages - count number of pages beyond high watermark + * + * nr_free_buffer_pages() counts the number of pages which are beyond the high + * watermark within ZONE_DMA and ZONE_NORMAL. + */ +unsigned long nr_free_buffer_pages(void) +{ + return nr_free_zone_pages(gfp_zone(GFP_USER)); +} +EXPORT_SYMBOL_GPL(nr_free_buffer_pages); + +/** + * nr_free_pagecache_pages - count number of pages beyond high watermark + * + * nr_free_pagecache_pages() counts the number of pages which are beyond the + * high watermark within all zones. 
+ *
+ * Implemented as nr_free_zone_pages() up to the zone selected by
+ * gfp_zone(GFP_HIGHUSER_MOVABLE).
+ */
+unsigned long nr_free_pagecache_pages(void)
+{
+	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
+}
+/*
+ * Print the "Node <nid> " prefix for @zone.  Nothing is printed unless
+ * CONFIG_NUMA is enabled.
+ */
+static inline void show_node(struct zone *zone)
+{
+	if (IS_ENABLED(CONFIG_NUMA))
+		printk("Node %d ", zone_to_nid(zone));
+}
+/*
+ * Fill the memory fields of @val with system-wide figures: total, shared
+ * (NR_SHMEM), free, buffer (nr_blockdev_pages()) and highmem totals.  All
+ * counts are in units of mem_unit, which is set to PAGE_SIZE.
+ */
+void si_meminfo(struct sysinfo *val)
+{
+	val->totalram = totalram_pages;
+	val->sharedram = global_page_state(NR_SHMEM);
+	val->freeram = global_page_state(NR_FREE_PAGES);
+	val->bufferram = nr_blockdev_pages();
+	val->totalhigh = totalhigh_pages;
+	val->freehigh = nr_free_highpages();
+	val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+/*
+ * Per-node variant of si_meminfo() for node @nid.  totalram is the sum of
+ * managed_pages over all of the node's zones; the highmem fields are zero
+ * without CONFIG_HIGHMEM.  bufferram is not written here.
+ */
+#ifdef CONFIG_NUMA
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+	int zone_type;		/* needs to be signed */
+	unsigned long managed_pages = 0;
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+		managed_pages += pgdat->node_zones[zone_type].managed_pages;
+	val->totalram = managed_pages;
+	val->sharedram = node_page_state(nid, NR_SHMEM);
+	val->freeram = node_page_state(nid, NR_FREE_PAGES);
+#ifdef CONFIG_HIGHMEM
+	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
+	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
+			NR_FREE_PAGES);
+#else
+	val->totalhigh = 0;
+	val->freehigh = 0;
+#endif
+	val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+/*
+ * Determine whether the node should be displayed or not, depending on whether
+ * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
+ *
+ * Returns true (skip the node) only when SHOW_MEM_FILTER_NODES is set in
+ * @flags and @nid is not in cpuset_current_mems_allowed.  The membership test
+ * is repeated for as long as read_mems_allowed_retry() asks for a retry.
+ */
+bool skip_free_areas_node(unsigned int flags, int nid)
+{
+	bool ret = false;
+	unsigned int cpuset_mems_cookie;
+
+	if (!(flags & SHOW_MEM_FILTER_NODES))
+		goto out;
+
+	do {
+		cpuset_mems_cookie = read_mems_allowed_begin();
+		ret = !node_isset(nid, cpuset_current_mems_allowed);
+	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+out:
+	return ret;
+}
+/*
+ * Shift a page count left by (PAGE_SHIFT - 10); show_free_areas() prints the
+ * results with a kB suffix.
+ */
+#define K(x) ((x) << (PAGE_SHIFT-10))
+/*
+ * Print "(<letters>) " with one letter for every bit set in @type, where
+ * bit i stands for migrate type i in the table below.
+ */
+static void show_migration_types(unsigned char type)
+{
+	static const char types[MIGRATE_TYPES] = {
+		[MIGRATE_UNMOVABLE]	= 'U',
+		[MIGRATE_RECLAIMABLE]	= 'E',
+		[MIGRATE_MOVABLE]	= 'M',
+		[MIGRATE_RESERVE]	= 'R',
+#ifdef CONFIG_CMA
+		[MIGRATE_CMA]		= 'C',
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+		[MIGRATE_ISOLATE]	= 'I',
+#endif
+	};
+	char tmp[MIGRATE_TYPES + 1];	/* one letter per type plus the NUL */
+	char *p = tmp;
+	int i;
+
+	for (i = 0; i < MIGRATE_TYPES; i++) {
+		if (type & (1 << i))
+			*p++ = types[i];
+	}
+
+	*p = '\0';
+	printk("(%s) ", tmp);
+}
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also calculate the percentage fragmentation. We do this by counting the
+ * memory on each free list with the exception of the first item on the list.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ *   cpuset.
+ *
+ * Everything is emitted with printk(), in this order: a global summary of
+ * page-state counters; for every populated zone that is not filtered out, a
+ * line of per-zone counters followed by its lowmem_reserve[] values; for the
+ * same zones, the number of free blocks per order (sampled under zone->lock)
+ * with the migrate types present; then hugetlb_show_meminfo(), the total
+ * number of page cache pages and show_swap_cache_info().
+ */
+void show_free_areas(unsigned int filter)
+{
+	unsigned long free_pcp = 0;
+	int cpu;
+	struct zone *zone;
+	/* Global free_pcp: pcp.count summed over online CPUs of shown zones. */
+	for_each_populated_zone(zone) {
+		if (skip_free_areas_node(filter, zone_to_nid(zone)))
+			continue;
+
+		for_each_online_cpu(cpu)
+			free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+	}
+
+	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
+		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
+		" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+		" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+		" free:%lu free_pcp:%lu free_cma:%lu\n",
+		global_page_state(NR_ACTIVE_ANON),
+		global_page_state(NR_INACTIVE_ANON),
+		global_page_state(NR_ISOLATED_ANON),
+		global_page_state(NR_ACTIVE_FILE),
+		global_page_state(NR_INACTIVE_FILE),
+		global_page_state(NR_ISOLATED_FILE),
+		global_page_state(NR_UNEVICTABLE),
+		global_page_state(NR_FILE_DIRTY),
+		global_page_state(NR_WRITEBACK),
+		global_page_state(NR_UNSTABLE_NFS),
+		global_page_state(NR_SLAB_RECLAIMABLE),
+		global_page_state(NR_SLAB_UNRECLAIMABLE),
+		global_page_state(NR_FILE_MAPPED),
+		global_page_state(NR_SHMEM),
+		global_page_state(NR_PAGETABLE),
+		global_page_state(NR_BOUNCE),
+		global_page_state(NR_FREE_PAGES),
+		free_pcp,
+		global_page_state(NR_FREE_CMA_PAGES));
+	/* Per-zone counters; the argument order must match the format string. */
+	for_each_populated_zone(zone) {
+		int i;
+
+		if (skip_free_areas_node(filter, zone_to_nid(zone)))
+			continue;
+
+		free_pcp = 0;	/* reused as the per-zone total from here on */
+		for_each_online_cpu(cpu)
+			free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+
+		show_node(zone);
+		printk("%s"
+			" free:%lukB"
+			" min:%lukB"
+			" low:%lukB"
+			" high:%lukB"
+			" active_anon:%lukB"
+			" inactive_anon:%lukB"
+			" active_file:%lukB"
+			" inactive_file:%lukB"
+			" unevictable:%lukB"
+			" isolated(anon):%lukB"
+			" isolated(file):%lukB"
+			" present:%lukB"
+			" managed:%lukB"
+			" mlocked:%lukB"
+			" dirty:%lukB"
+			" writeback:%lukB"
+			" mapped:%lukB"
+			" shmem:%lukB"
+			" slab_reclaimable:%lukB"
+			" slab_unreclaimable:%lukB"
+			" kernel_stack:%lukB"
+			" pagetables:%lukB"
+			" unstable:%lukB"
+			" bounce:%lukB"
+			" free_pcp:%lukB"
+			" local_pcp:%ukB"
+			" free_cma:%lukB"
+			" writeback_tmp:%lukB"
+			" pages_scanned:%lu"
+			" all_unreclaimable? %s"
+			"\n",
+			zone->name,
+			K(zone_page_state(zone, NR_FREE_PAGES)),
+			K(min_wmark_pages(zone)),
+			K(low_wmark_pages(zone)),
+			K(high_wmark_pages(zone)),
+			K(zone_page_state(zone, NR_ACTIVE_ANON)),
+			K(zone_page_state(zone, NR_INACTIVE_ANON)),
+			K(zone_page_state(zone, NR_ACTIVE_FILE)),
+			K(zone_page_state(zone, NR_INACTIVE_FILE)),
+			K(zone_page_state(zone, NR_UNEVICTABLE)),
+			K(zone_page_state(zone, NR_ISOLATED_ANON)),
+			K(zone_page_state(zone, NR_ISOLATED_FILE)),
+			K(zone->present_pages),
+			K(zone->managed_pages),
+			K(zone_page_state(zone, NR_MLOCK)),
+			K(zone_page_state(zone, NR_FILE_DIRTY)),
+			K(zone_page_state(zone, NR_WRITEBACK)),
+			K(zone_page_state(zone, NR_FILE_MAPPED)),
+			K(zone_page_state(zone, NR_SHMEM)),
+			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
+			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
+			zone_page_state(zone, NR_KERNEL_STACK) *
+				THREAD_SIZE / 1024,
+			K(zone_page_state(zone, NR_PAGETABLE)),
+			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
+			K(zone_page_state(zone, NR_BOUNCE)),
+			K(free_pcp),
+			K(this_cpu_read(zone->pageset->pcp.count)),
+			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
+			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
+			K(zone_page_state(zone, NR_PAGES_SCANNED)), /* NOTE(review): K()-scaled although the label has no kB suffix */
+			(!zone_reclaimable(zone) ? "yes" : "no")
+			);
+		printk("lowmem_reserve[]:");
+		for (i = 0; i < MAX_NR_ZONES; i++)
+			printk(" %ld", zone->lowmem_reserve[i]);
+		printk("\n");
+	}
+	/* Free blocks per order and the migrate types they sit on. */
+	for_each_populated_zone(zone) {
+		unsigned long nr[MAX_ORDER], flags, order, total = 0;
+		unsigned char types[MAX_ORDER];
+
+		if (skip_free_areas_node(filter, zone_to_nid(zone)))
+			continue;
+		show_node(zone);
+		printk("%s: ", zone->name);
+
+		spin_lock_irqsave(&zone->lock, flags);
+		for (order = 0; order < MAX_ORDER; order++) {
+			struct free_area *area = &zone->free_area[order];
+			int type;
+
+			nr[order] = area->nr_free;
+			total += nr[order] << order;
+
+			types[order] = 0;
+			for (type = 0; type < MIGRATE_TYPES; type++) {
+				if (!list_empty(&area->free_list[type]))
+					types[order] |= 1 << type;
+			}
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+		for (order = 0; order < MAX_ORDER; order++) {	/* printing happens after the lock is dropped */
+			printk("%lu*%lukB ", nr[order], K(1UL) << order);
+			if (nr[order])
+				show_migration_types(types[order]);
+		}
+		printk("= %lukB\n", K(total));
+	}
+
+	hugetlb_show_meminfo();
+
+	printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
+
+	show_swap_cache_info();
+}
+/*
+ * Point @zoneref at @zone and cache the zone's index in it.
+ */
+static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
+{
+	zoneref->zone = zone;
+	zoneref->zone_idx = zone_idx(zone);
+}
+
+/*
+ * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
+ *
+ * The node's zones are visited from index MAX_NR_ZONES - 1 down to 0 and
+ * every populated one is stored at @zonelist->_zonerefs[@nr_zones++].
+ * Returns the updated entry count; no terminating entry is written here.
+ */
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
+				int nr_zones)
+{
+	struct zone *zone;
+	enum zone_type zone_type = MAX_NR_ZONES;
+
+	do {
+		zone_type--;
+		zone = pgdat->node_zones + zone_type;
+		if (populated_zone(zone)) {
+			zoneref_set_zone(zone,
+				&zonelist->_zonerefs[nr_zones++]);
+			check_highest_zone(zone_type);
+		}
+	} while (zone_type);
+
+	return nr_zones;
+}
+
+
+/*
+ *  zonelist_order:
+ *  0 = automatic detection of better ordering.
+ * 1 = order by ([node] distance, -zonetype)
+ * 2 = order by (-zonetype, [node] distance)
+ *
+ * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
+ * the same zonelist. So only NUMA can configure this param.
+ */
+#define ZONELIST_ORDER_DEFAULT	0
+#define ZONELIST_ORDER_NODE	1
+#define ZONELIST_ORDER_ZONE	2
+
+/*
+ * Zonelist order currently used by the kernel.
+ * set_zonelist_order() will set this to NODE or ZONE.
+ * zonelist_order_name[] holds a printable name for each ZONELIST_ORDER_*
+ * value and is indexed by that value.
+ */
+static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
+static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
+
+
+#ifdef CONFIG_NUMA
+/*
+ * The order the user asked for.  It stays ZONELIST_ORDER_DEFAULT until
+ * __parse_numa_zonelist_order() accepts a value.
+ */
+static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+/* string for sysctl */
+#define NUMA_ZONELIST_ORDER_LEN	16
+char numa_zonelist_order[16] = "default";
+
+/*
+ * Interface for configuring zonelist ordering.
+ * Command line option "numa_zonelist_order":
+ *	"[dD]efault"	- default, automatic configuration.
+ *	"[nN]ode"	- order by node locality, then by zone within node
+ *	"[zZ]one"	- order by zone, then by locality within zone
+ *
+ * __parse_numa_zonelist_order() looks only at the first character of @s and
+ * stores the matching ZONELIST_ORDER_* value in user_zonelist_order.  For any
+ * other first character it logs a warning, leaves user_zonelist_order
+ * unchanged and returns -EINVAL.  It returns 0 on success.
+ */
+
+static int __parse_numa_zonelist_order(char *s)
+{
+	if (*s == 'd' || *s == 'D') {
+		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+	} else if (*s == 'n' || *s == 'N') {
+		user_zonelist_order = ZONELIST_ORDER_NODE;
+	} else if (*s == 'z' || *s == 'Z') {
+		user_zonelist_order = ZONELIST_ORDER_ZONE;
+	} else {
+		printk(KERN_WARNING
+			"Ignoring invalid numa_zonelist_order value: "
+			"%s\n", s);
+		return -EINVAL;
+	}
+	return 0;
+}
+/*
+ * early_param handler for "numa_zonelist_order".  A NULL argument is accepted
+ * and ignored.  When the value parses, the string is also copied into
+ * numa_zonelist_order[] (truncated to NUMA_ZONELIST_ORDER_LEN by strlcpy()).
+ * Returns the result of __parse_numa_zonelist_order().
+ */
+static __init int setup_numa_zonelist_order(char *s)
+{
+	int ret;
+
+	if (!s)
+		return 0;
+
+	ret = __parse_numa_zonelist_order(s);
+	if (ret == 0)
+		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+	return ret;
+}
+early_param("numa_zonelist_order", setup_numa_zonelist_order);
+
+/*
+ * sysctl handler for numa_zonelist_order
+ *
+ * Calls are serialised by the function-local zl_order_mutex.  On a write the
+ * string currently in table->data is saved first, so that it can be put back
+ * (together with the previous user_zonelist_order) if the new value does not
+ * parse.  If the write changes user_zonelist_order, all zonelists are rebuilt
+ * under zonelists_mutex.
+ *
+ * Returns 0 on success, -EINVAL if the existing string does not fit the save
+ * buffer or the new value is invalid, or the error from proc_dostring().
+ */
+int numa_zonelist_order_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *length,
+		loff_t *ppos)
+{
+	char saved_string[NUMA_ZONELIST_ORDER_LEN];
+	int ret;
+	static DEFINE_MUTEX(zl_order_mutex);
+
+	mutex_lock(&zl_order_mutex);
+	if (write) {
+		if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
+			ret = -EINVAL;
+			goto out;
+		}
+		strcpy(saved_string, (char *)table->data);
+	}
+	ret = proc_dostring(table, write, buffer, length, ppos);
+	if (ret)
+		goto out;
+	if (write) {
+		int oldval = user_zonelist_order;
+
+		ret = __parse_numa_zonelist_order((char *)table->data);
+		if (ret) {
+			/*
+			 * Bogus value: restore the saved string and the
+			 * previous order.
+			 */
+			strncpy((char *)table->data, saved_string,
+				NUMA_ZONELIST_ORDER_LEN);
+			user_zonelist_order = oldval;
+		} else if (oldval != user_zonelist_order) {
+			mutex_lock(&zonelists_mutex);
+			build_all_zonelists(NULL, NULL);
+			mutex_unlock(&zonelists_mutex);
+		}
+	}
+out:
+	mutex_unlock(&zl_order_mutex);
+	return ret;
+}
+
+
+#define MAX_NODE_LOAD (nr_online_nodes)
+static int node_load[MAX_NUMNODES];
+
+/**
+ * find_next_best_node - find the next node that should appear in a given node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: nodemask_t of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list. The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ * It returns -1 if no node is found.
+ */
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = NUMA_NO_NODE;
+	const struct cpumask *tmp = cpumask_of_node(0);	/* reassigned before each use below */
+
+	/* Use the local node if we haven't already */
+	if (!node_isset(node, *used_node_mask)) {
+		node_set(node, *used_node_mask);
+		return node;
+	}
+
+	for_each_node_state(n, N_MEMORY) {
+
+		/* Don't want a node to appear more than once */
+		if (node_isset(n, *used_node_mask))
+			continue;
+
+		/* Use the distance array to find the distance */
+		val = node_distance(node, n);
+
+		/* Penalize nodes under us ("prefer the next node") */
+		val += (n < node);
+
+		/* Give preference to headless and unused nodes */
+		tmp = cpumask_of_node(n);
+		if (!cpumask_empty(tmp))
+			val += PENALTY_FOR_NODE_WITH_CPUS;
+
+		/* Slight preference for less loaded node */
+		val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+		val += node_load[n];
+		/* Lowest score wins; on a tie the node visited first is kept */
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+	/* Mark the chosen node as used so that later calls skip it */
+	if (best_node >= 0)
+		node_set(best_node, *used_node_mask);
+
+	return best_node;
+}
+
+
+/*
+ * Build zonelists ordered by node and zones within node.
+ * This results in maximum locality--normal zone overflows into local
+ * DMA zone, if any--but risks exhausting DMA zone.
+ *
+ * Each call appends the populated zones of @node after the entries already
+ * present in @pgdat's node_zonelists[0] and terminates the list again.
+ */
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
+{
+	int j;
+	struct zonelist *zonelist;
+
+	zonelist = &pgdat->node_zonelists[0];
+	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)	/* find the current end */
+		;
+	j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+	zonelist->_zonerefs[j].zone = NULL;
+	zonelist->_zonerefs[j].zone_idx = 0;
+}
+
+/*
+ * Build gfp_thisnode zonelists
+ *
+ * node_zonelists[1] of @pgdat receives only @pgdat's own populated zones,
+ * followed by the terminating entry.
+ */
+static void build_thisnode_zonelists(pg_data_t *pgdat)
+{
+	int j;
+	struct zonelist *zonelist;
+
+	zonelist = &pgdat->node_zonelists[1];
+	j = build_zonelists_node(pgdat, zonelist, 0);
+	zonelist->_zonerefs[j].zone = NULL;
+	zonelist->_zonerefs[j].zone_idx = 0;
+}
+
+/*
+ * Build zonelists ordered by zone and nodes within zones.
+ * This results in conserving DMA zone[s] until all Normal memory is
+ * exhausted, but results in overflowing to remote node while memory
+ * may still exist in local DMA zone.
+ */
+static int node_order[MAX_NUMNODES];
+/*
+ * Fill @pgdat's node_zonelists[0] one zone type at a time, highest type
+ * first.  For each type the first @nr_nodes entries of node_order[] are
+ * walked in order and every populated zone of that type is appended.  The
+ * list is terminated afterwards.
+ */
+static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
+{
+	int pos, j, node;
+	int zone_type;		/* needs to be signed */
+	struct zone *z;
+	struct zonelist *zonelist;
+
+	zonelist = &pgdat->node_zonelists[0];
+	pos = 0;
+	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
+		for (j = 0; j < nr_nodes; j++) {
+			node = node_order[j];
+			z = &NODE_DATA(node)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				zoneref_set_zone(z,
+					&zonelist->_zonerefs[pos++]);
+				check_highest_zone(zone_type);
+			}
+		}
+	}
+	zonelist->_zonerefs[pos].zone = NULL;
+	zonelist->_zonerefs[pos].zone_idx = 0;
+}
+
+#if defined(CONFIG_64BIT)
+/*
+ * Devices that require DMA32/DMA are relatively rare and do not justify a
+ * penalty to every machine in case the specialised case applies. Default
+ * to Node-ordering on 64-bit NUMA machines.
+ */
+static int default_zonelist_order(void)
+{
+	return ZONELIST_ORDER_NODE;
+}
+#else
+/*
+ * On 32-bit, the Normal zone needs to be preserved for allocations accessible
+ * by the kernel. If processes running on node 0 deplete the low memory zone
+ * then reclaim will occur more frequently, increasing stalls and potentially
+ * making it easier to OOM if a large percentage of the zone is under
+ * writeback or dirty. The problem is significantly worse if CONFIG_HIGHPTE
+ * is not set.
+ * Hence, default to zone ordering on 32-bit.
+ */
+static int default_zonelist_order(void)
+{
+	return ZONELIST_ORDER_ZONE;
+}
+#endif /* CONFIG_64BIT */
+/*
+ * Resolve current_zonelist_order: an explicit user choice wins, otherwise
+ * the value of default_zonelist_order() is used.
+ */
+static void set_zonelist_order(void)
+{
+	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
+		current_zonelist_order = default_zonelist_order();
+	else
+		current_zonelist_order = user_zonelist_order;
+}
+/*
+ * NUMA version: build both zonelists of @pgdat.
+ *
+ * node_zonelists[0] receives the zones of every node handed back by
+ * find_next_best_node(), laid out in node order or zone order according to
+ * current_zonelist_order.  node_zonelists[1] is then filled by
+ * build_thisnode_zonelists().
+ */
+static void build_zonelists(pg_data_t *pgdat)
+{
+	int j, node, load;
+	enum zone_type i;
+	nodemask_t used_mask;
+	int local_node, prev_node;
+	struct zonelist *zonelist;
+	int order = current_zonelist_order;
+
+	/* initialize zonelists */
+	for (i = 0; i < MAX_ZONELISTS; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->_zonerefs[0].zone = NULL;
+		zonelist->_zonerefs[0].zone_idx = 0;
+	}
+
+	/* NUMA-aware ordering of nodes */
+	local_node = pgdat->node_id;
+	load = nr_online_nodes;
+	prev_node = local_node;
+	nodes_clear(used_mask);
+
+	memset(node_order, 0, sizeof(node_order));
+	j = 0;
+
+	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		/*
+		 * We don't want to pressure a particular node.
+		 * So adding penalty to the first node in same
+		 * distance group to make it round-robin.
+		 */
+		if (node_distance(local_node, node) !=
+		    node_distance(local_node, prev_node))
+			node_load[node] = load;
+
+		prev_node = node;
+		load--;
+		if (order == ZONELIST_ORDER_NODE)
+			build_zonelists_in_node_order(pgdat, node);
+		else
+			node_order[j++] = node;	/* remember order */
+	}
+
+	if (order == ZONELIST_ORDER_ZONE) {
+		/* calculate node order -- i.e., DMA last! */
+		build_zonelists_in_zone_order(pgdat, j);
+	}
+
+	build_thisnode_zonelists(pgdat);
+}
+
+/*
+ * Construct the zonelist performance cache - see further mmzone.h
+ *
+ * Points node_zonelists[0].zlcache_ptr at the embedded zlcache, clears its
+ * fullzones bitmap and records zonelist_node_idx() for every entry of the
+ * zonelist in z_to_n[].
+ */
+static void build_zonelist_cache(pg_data_t *pgdat)
+{
+	struct zonelist *zonelist;
+	struct zonelist_cache *zlc;
+	struct zoneref *z;
+
+	zonelist = &pgdat->node_zonelists[0];
+	zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+	for (z = zonelist->_zonerefs; z->zone; z++)
+		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
+}
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * Return node id of node used for "local" allocations.
+ * I.e., first node id of first zone in arg node's generic zonelist.
+ * Used for initializing percpu 'numa_mem', which is used primarily
+ * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
+ *
+ * NOTE(review): zone is dereferenced without a NULL check; presumably
+ * first_zones_zonelist() always finds a zone here - confirm.
+ */
+int local_memory_node(int node)
+{
+	struct zone *zone;
+
+	(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+				   gfp_zone(GFP_KERNEL),
+				   NULL,
+				   &zone);
+	return zone->node;
+}
+#endif
+
+#else	/* CONFIG_NUMA */
+/* Without CONFIG_NUMA the order is always ZONELIST_ORDER_ZONE. */
+static void set_zonelist_order(void)
+{
+	current_zonelist_order = ZONELIST_ORDER_ZONE;
+}
+/*
+ * Non-NUMA version: node_zonelists[0] of @pgdat lists the local node's zones
+ * first, then the zones of every other online node, starting with the nodes
+ * numbered above the local one and wrapping round to those below it.
+ */
+static void build_zonelists(pg_data_t *pgdat)
+{
+	int node, local_node;
+	enum zone_type j;
+	struct zonelist *zonelist;
+
+	local_node = pgdat->node_id;
+
+	zonelist = &pgdat->node_zonelists[0];
+	j = build_zonelists_node(pgdat, zonelist, 0);
+
+	/*
+	 * Now we build the zonelist so that it contains the zones
+	 * of all the other nodes.
+	 * We don't want to pressure a particular node, so when
+	 * building the zones for node N, we make sure that the
+	 * zones coming right after the local ones are those from
+	 * node N+1 (modulo N)
+	 */
+	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+		if (!node_online(node))
+			continue;
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+	}
+	for (node = 0; node < local_node; node++) {
+		if (!node_online(node))
+			continue;
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+	}
+
+	zonelist->_zonerefs[j].zone = NULL;
+	zonelist->_zonerefs[j].zone_idx = 0;
+}
+
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void build_zonelist_cache(pg_data_t *pgdat)
+{
+	pgdat->node_zonelists[0].zlcache_ptr = NULL;
+}
+
+#endif	/* CONFIG_NUMA */
+
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
+
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
+
+/*
+ * Rebuild the zonelists and zonelist caches of every online node and
+ * re-initialise the boot pagesets of all possible cpus.
+ *
+ * @data is NULL or a pg_data_t.  If that node is not online yet, its
+ * zonelists are built first, because the for_each_online_node() loop below
+ * would not reach it.
+ *
+ * The int return value (always 0) exists just for stop_machine().
+ */
+static int __build_all_zonelists(void *data)
+{
+	int nid;
+	int cpu;
+	pg_data_t *self = data;
+
+#ifdef CONFIG_NUMA
+	memset(node_load, 0, sizeof(node_load));
+#endif
+
+	if (self && !node_online(self->node_id)) {
+		build_zonelists(self);
+		build_zonelist_cache(self);
+	}
+
+	for_each_online_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+
+		build_zonelists(pgdat);
+		build_zonelist_cache(pgdat);
+	}
+
+	/*
+	 * Initialize the boot_pagesets that are going to be used
+	 * for bootstrapping processors. The real pagesets for
+	 * each zone will be allocated later when the per cpu
+	 * allocator is available.
+	 *
+	 * boot_pagesets are used also for bootstrapping offline
+	 * cpus if the system is already booted because the pagesets
+	 * are needed to initialize allocators on a specific cpu too.
+	 * F.e. the percpu allocator needs the page allocator which
+	 * needs the percpu allocator in order to allocate its pagesets
+	 * (a chicken-egg dilemma).
+	 */
+	for_each_possible_cpu(cpu) {
+		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+		/*
+		 * We now know the "local memory node" for each node--
+		 * i.e., the node of the first zone in the generic zonelist.
+		 * Set up numa_mem percpu variable for on-line cpus.  During
+		 * boot, only the boot cpu should be on-line;  we'll init the
+		 * secondary cpus' numa_mem as they come on-line.  During
+		 * node/memory hotplug, we'll fixup all on-line cpus.
+		 */
+		if (cpu_online(cpu))
+			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+#endif
+	}
+
+	return 0;
+}
+/*
+ * Boot-time helper: build the zonelists by calling __build_all_zonelists()
+ * directly, then run mminit_verify_zonelist() and
+ * cpuset_init_current_mems_allowed().
+ */
+static noinline void __init
+build_all_zonelists_init(void)
+{
+	__build_all_zonelists(NULL);
+	mminit_verify_zonelist();
+	cpuset_init_current_mems_allowed();
+}
+
+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ *
+ * __ref due to (1) call of __meminit annotated setup_zone_pageset
+ * [we're only called with non-NULL zone through __meminit paths] and
+ * (2) call of __init annotated helper build_all_zonelists_init
+ * [protected by SYSTEM_BOOTING].
+ */
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
+{
+	set_zonelist_order();	/* settle node vs. zone ordering before building */
+
+	if (system_state == SYSTEM_BOOTING) {
+		build_all_zonelists_init();
+	} else {
+#ifdef CONFIG_MEMORY_HOTPLUG
+		if (zone)
+			setup_zone_pageset(zone);
+#endif
+		/* we have to stop all cpus to guarantee there is no user
+		   of zonelist */
+		stop_machine(__build_all_zonelists, pgdat, NULL);
+		/* cpuset refresh routine should be here */
+	}
+	vm_total_pages = nr_free_pagecache_pages();
+	/*
+	 * Disable grouping by mobility if the number of pages in the
+	 * system is too low to allow the mechanism to work. It would be
+	 * more accurate, but expensive to check per-zone. This check is
+	 * made on memory-hotadd so a system can start with mobility
+	 * disabled and enable it later.
+	 */
+	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
+		page_group_by_mobility_disabled = 1;
+	else
+		page_group_by_mobility_disabled = 0;
+	/* The zonelist count reported here is nr_online_nodes. */
+	pr_info("Built %i zonelists in %s order, mobility grouping %s.  "
+		"Total pages: %ld\n",
+			nr_online_nodes,
+			zonelist_order_name[current_zonelist_order],
+			page_group_by_mobility_disabled ? "off" : "on",
+			vm_total_pages);
+#ifdef CONFIG_NUMA
+	pr_info("Policy zone: %s\n", zone_names[policy_zone]);
+#endif
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e.
the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE	256
+
+#ifndef CONFIG_MEMORY_HOTPLUG
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+	unsigned long size = 1;
+
+	pages /= PAGES_PER_WAITQUEUE;
+
+	while (size < pages)	/* smallest power of two >= pages */
+		size <<= 1;
+
+	/*
+	 * Once we have dozens or even hundreds of threads sleeping
+	 * on IO we've got bigger problems than wait queue collision.
+	 * Limit the size of the wait table to a reasonable size.
+	 * The value returned is therefore a power of two between 4 and
+	 * 4096 inclusive.
+	 */
+	size = min(size, 4096UL);
+
+	return max(size, 4UL);
+}
+#else
+/*
+ * A zone's size might be changed by hot-add, so it is not possible to determine
+ * a suitable size for its wait_table.  So we use the maximum size now.
+ *
+ * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
+ *
+ *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
+ *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
+ *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
+ *
+ * The maximum entries are prepared when a zone's memory is (512K + 256) pages
+ * or more by the traditional way. (See above).  It equals:
+ *
+ *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
+ *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
+ *    powerpc (64K page size)             : =  (32G +16M)byte.
+ *
+ * @pages is ignored by this variant.
+ */
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+	return 4096UL;
+}
+#endif
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+	return ffz(~size);
+}
+
+/*
+ * Check if a pageblock contains reserved pages
+ *
+ * Returns 1 as soon as a pfn in [@start_pfn, @end_pfn) either fails
+ * pfn_valid_within() or has PageReserved set, and 0 if none does.
+ */
+static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long pfn;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Mark a number of pageblocks as MIGRATE_RESERVE. The number
+ * of blocks reserved is based on min_wmark_pages(zone). The memory within
+ * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
+ * higher will lead to a bigger reserve which will get freed as contiguous
+ * blocks as reclaim kicks in.
+ *
+ * The reserve is capped at two pageblocks.  If the target equals
+ * zone->nr_migrate_reserve_block nothing is done.  Otherwise the zone is
+ * scanned one pageblock at a time: movable blocks are converted to
+ * MIGRATE_RESERVE until the target is met, and MIGRATE_RESERVE blocks found
+ * after that are turned back into MIGRATE_MOVABLE.
+ */
+static void setup_zone_migrate_reserve(struct zone *zone)
+{
+	unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
+	struct page *page;
+	unsigned long block_migratetype;
+	int reserve;
+	int old_reserve;
+
+	/*
+	 * Get the start pfn, end pfn and the number of blocks to reserve
+	 * We have to be careful to be aligned to pageblock_nr_pages to
+	 * make sure that we always check pfn_valid for the first page in
+	 * the block.
+	 */
+	start_pfn = zone->zone_start_pfn;
+	end_pfn = zone_end_pfn(zone);
+	start_pfn = roundup(start_pfn, pageblock_nr_pages);
+	reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
+							pageblock_order;
+
+	/*
+	 * Reserve blocks are generally in place to help high-order atomic
+	 * allocations that are short-lived. A min_free_kbytes value that
+	 * would result in more than 2 reserve blocks for atomic allocations
+	 * is assumed to be in place to help anti-fragmentation for the
+	 * future allocation of hugepages at runtime.
+	 */
+	reserve = min(2, reserve);
+	old_reserve = zone->nr_migrate_reserve_block;
+
+	/* On memory hot-add we almost always need to do nothing */
+	if (reserve == old_reserve)
+		return;
+	zone->nr_migrate_reserve_block = reserve;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+
+		/* Watch out for overlapping nodes */
+		if (page_to_nid(page) != zone_to_nid(zone))
+			continue;
+
+		block_migratetype = get_pageblock_migratetype(page);
+
+		/* Only test what is necessary when the reserves are not met */
+		if (reserve > 0) {
+			/*
+			 * Blocks with reserved pages will never free, skip
+			 * them.
+			 */
+			block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+			if (pageblock_is_reserved(pfn, block_end_pfn))
+				continue;
+
+			/* If this block is reserved, account for it */
+			if (block_migratetype == MIGRATE_RESERVE) {
+				reserve--;
+				continue;
+			}
+
+			/* Suitable for reserving if this block is movable */
+			if (block_migratetype == MIGRATE_MOVABLE) {
+				set_pageblock_migratetype(page,
+							MIGRATE_RESERVE);
+				move_freepages_block(zone, page,
+							MIGRATE_RESERVE);
+				reserve--;
+				continue;
+			}
+		} else if (!old_reserve) {
+			/*
+			 * At boot time we don't need to scan the whole zone
+			 * for turning off MIGRATE_RESERVE.
+			 */
+			break;
+		}
+
+		/*
+		 * If the reserve is met and this is a previous reserved block,
+		 * take it back
+		 */
+		if (block_migratetype == MIGRATE_RESERVE) {
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+			move_freepages_block(zone, page, MIGRATE_MOVABLE);
+		}
+	}
+}
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ */
+void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+		unsigned long start_pfn, enum memmap_context context)
+{
+	struct page *page;
+	unsigned long end_pfn = start_pfn + size;
+	unsigned long pfn;
+	struct zone *z;
+
+	if (highest_memmap_pfn < end_pfn - 1)
+		highest_memmap_pfn = end_pfn - 1;
+
+	z = &NODE_DATA(nid)->node_zones[zone];
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		/*
+		 * There can be holes in boot-time mem_map[]s
+		 * handed to this function.  They do not
+		 * exist on hotplugged memory.
+		 */
+		if (context == MEMMAP_EARLY) {
+			if (!early_pfn_valid(pfn))
+				continue;
+			if (!early_pfn_in_nid(pfn, nid))
+				continue;
+		}
+		page = pfn_to_page(pfn);
+		set_page_links(page, zone, nid, pfn);
+		mminit_verify_page_links(page, zone, nid, pfn);
+		init_page_count(page);
+		page_mapcount_reset(page);
+		page_cpupid_reset_last(page);
+		SetPageReserved(page);
+		/*
+		 * Mark the block movable so that blocks are reserved for
+		 * movable at startup. This will force kernel allocations
+		 * to reserve their blocks rather than leaking throughout
+		 * the address space during boot when many long-lived
+		 * kernel allocations are made. Later some blocks near
+		 * the start are marked MIGRATE_RESERVE by
+		 * setup_zone_migrate_reserve()
+		 *
+		 * The bitmap is created for the zone's valid pfn range, but
+		 * the memmap can be created for invalid pages (for
+		 * alignment).  Check here so that
+		 * set_pageblock_migratetype() is not called for a pfn
+		 * outside the zone.
+		 */
+		if ((z->zone_start_pfn <= pfn)
+		    && (pfn < zone_end_pfn(z))
+		    && !(pfn & (pageblock_nr_pages - 1)))
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+
+		INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
+		if (!is_highmem_idx(zone))
+			set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+	}
+}
+/*
+ * Empty every free list of @zone and zero the nr_free counter of each order.
+ */
+static void __meminit zone_init_free_lists(struct zone *zone)
+{
+	unsigned int order, t;
+	for_each_migratetype_order(order, t) {
+		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
+		zone->free_area[order].nr_free = 0;
+	}
+}
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(size, nid, zone, start_pfn) \
+	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
+#endif
+/*
+ * Work out the per-cpu pageset batch size for @zone from its managed_pages.
+ * Without CONFIG_MMU the result is always 0.
+ */
+static int zone_batchsize(struct zone *zone)
+{
+#ifdef CONFIG_MMU
+	int batch;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone.  But no more than 1/2 of a meg.
+	 *
+	 * OK, so we don't know how big the cache is.  So guess.
+	 */
+	batch = zone->managed_pages / 1024;
+	if (batch * PAGE_SIZE > 512 * 1024)
+		batch = (512 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
+	 */
+	batch = rounddown_pow_of_two(batch + batch/2) - 1;
+
+	return batch;
+
+#else
+	/* The deferral and batching of frees should be suppressed under NOMMU
+	 * conditions.
+	 *
+	 * The problem is that NOMMU needs to be able to allocate large chunks
+	 * of contiguous memory as there's no hardware page translation to
+	 * assemble apparent contiguous memory from discontiguous pages.
+	 *
+	 * Queueing large contiguous runs of pages for batching, however,
+	 * causes the pages to actually be freed in smaller chunks.  As there
+	 * can be a significant delay between the individual batches being
+	 * recycled, this leads to the once large chunks of space being
+	 * fragmented and becoming unavailable for high-order allocations.
+	 */
+	return 0;
+#endif
+}
+
+/*
+ * pcp->high and pcp->batch values are related and dependent on one another:
+ * ->batch must never be higher than ->high.
+ * The following function updates them in a safe manner without read side
+ * locking.
+ *
+ * Any new users of pcp->batch and pcp->high should ensure they can cope with
+ * those fields changing asynchronously (according to the above rule).
+ *
+ * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
+ * outside of boot time (or some other assurance that no concurrent updaters
+ * exist).
+ */
+static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
+		unsigned long batch)
+{
+	/* start with a fail safe value for batch */
+	pcp->batch = 1;
+	smp_wmb();
+
+	/* Update high, then batch, in order */
+	pcp->high = high;
+	smp_wmb();
+
+	pcp->batch = batch;
+}
+
+/*
+ * A companion to pageset_set_high(): high is derived as 6 * @batch and the
+ * batch stored is at least 1.
+ */
+static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
+{
+	pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
+}
+/*
+ * Zero @p and initialise the list head of each of its MIGRATE_PCPTYPES
+ * per-cpu page lists.
+ */
+static void pageset_init(struct per_cpu_pageset *p)
+{
+	struct per_cpu_pages *pcp;
+	int migratetype;
+
+	memset(p, 0, sizeof(*p));
+
+	pcp = &p->pcp;
+	pcp->count = 0;
+	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
+		INIT_LIST_HEAD(&pcp->lists[migratetype]);
+}
+/* Initialise @p from scratch and set its high/batch values from @batch. */
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	pageset_init(p);
+	pageset_set_batch(p, batch);
+}
+
+/*
+ * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+static void pageset_set_high(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	unsigned long batch = max(1UL, high / 4);
+	if ((high / 4) > (PAGE_SHIFT * 8))	/* cap batch at PAGE_SHIFT * 8 */
+		batch = PAGE_SHIFT * 8;
+
+	pageset_update(&p->pcp, high, batch);
+}
+/*
+ * Set high and batch of @pcp for @zone: from percpu_pagelist_fraction when it
+ * is non-zero, otherwise from zone_batchsize().
+ */
+static void pageset_set_high_and_batch(struct zone *zone,
+				       struct per_cpu_pageset *pcp)
+{
+	if (percpu_pagelist_fraction)
+		pageset_set_high(pcp,
+			(zone->managed_pages /
+				percpu_pagelist_fraction));
+	else
+		pageset_set_batch(pcp, zone_batchsize(zone));
+}
+/* Initialise the pageset that @zone keeps for @cpu. */
+static void __meminit zone_pageset_init(struct zone *zone, int cpu)
+{
+	struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+	pageset_init(pcp);
+	pageset_set_high_and_batch(zone, pcp);
+}
+/*
+ * Allocate @zone's per-cpu pagesets and initialise one for every possible
+ * cpu.
+ *
+ * NOTE(review): the alloc_percpu() result is used without a NULL check -
+ * confirm that the allocation cannot fail on the paths that call this.
+ */
+static void __meminit setup_zone_pageset(struct zone *zone)
+{
+	int cpu;
+	zone->pageset = alloc_percpu(struct per_cpu_pageset);
+	for_each_possible_cpu(cpu)
+		zone_pageset_init(zone, cpu);
+}
+
+/*
+ * Allocate per cpu pagesets and initialize them.
+ * Before this call only boot pagesets were available.
+ */
+void __init setup_per_cpu_pageset(void)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone)
+		setup_zone_pageset(zone);
+}
+/*
+ * Size and allocate the hashed wait table of @zone and initialise every wait
+ * queue head in it.  Returns 0 on success or -ENOMEM if the table could not
+ * be allocated.
+ */
+static noinline __init_refok
+int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+{
+	int i;
+	size_t alloc_size;
+
+	/*
+	 * The per-page waitqueue mechanism uses hashed waitqueues
+	 * per zone.
+	 */
+	zone->wait_table_hash_nr_entries =
+		 wait_table_hash_nr_entries(zone_size_pages);
+	zone->wait_table_bits =
+		wait_table_bits(zone->wait_table_hash_nr_entries);
+	alloc_size = zone->wait_table_hash_nr_entries
+					* sizeof(wait_queue_head_t);
+
+	if (!slab_is_available()) {
+		zone->wait_table = (wait_queue_head_t *)
+			memblock_virt_alloc_node_nopanic(
+				alloc_size, zone->zone_pgdat->node_id);
+	} else {
+		/*
+		 * This case means that a zone whose size was 0 gets new memory
+		 * via memory hot-add.
+		 * But it may be the case that a new node was hot-added.  In
+		 * this case vmalloc() will not be able to use this new node's
+		 * memory - this wait_table must be initialized to use this new
+		 * node itself as well.
+		 * To use this new node's memory, further consideration will be
+		 * necessary.
+		 */
+		zone->wait_table = vmalloc(alloc_size);
+	}
+	if (!zone->wait_table)
+		return -ENOMEM;
+
+	for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
+		init_waitqueue_head(zone->wait_table + i);
+
+	return 0;
+}
+/*
+ * Point @zone at the static boot pageset and, for a populated zone, log its
+ * present page count and batch size.
+ */
+static __meminit void zone_pcp_init(struct zone *zone)
+{
+	/*
+	 * per cpu subsystem is not up at this point. The following code
+	 * relies on the ability of the linker to provide the
+	 * offset of a (static) per cpu variable into the per cpu area.
+	 */
+	zone->pageset = &boot_pageset;
+
+	if (populated_zone(zone))
+		printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
+			zone->name, zone->present_pages,
+					 zone_batchsize(zone));
+}
+/*
+ * Set up a zone that had no memory before: allocate its wait table, record
+ * the zone in pgdat->nr_zones, store @zone_start_pfn and reset the free
+ * lists.  Returns 0, or the error from zone_wait_table_init().  @context is
+ * not used in this function body.
+ */
+int __meminit init_currently_empty_zone(struct zone *zone,
+					unsigned long zone_start_pfn,
+					unsigned long size,
+					enum memmap_context context)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int ret;
+	ret = zone_wait_table_init(zone, size);
+	if (ret)
+		return ret;
+	pgdat->nr_zones = zone_idx(zone) + 1;
+
+	zone->zone_start_pfn = zone_start_pfn;
+
+	mminit_dprintk(MMINIT_TRACE, "memmap_init",
+			"Initialising map node %d zone %lu pfns %lu -> %lu\n",
+			pgdat->node_id,
+			(unsigned long)zone_idx(zone),
+			zone_start_pfn, (zone_start_pfn + size));
+
+	zone_init_free_lists(zone);
+
+	return 0;
+}
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ *
+ * The range and node of the last successful lookup are cached in function
+ * statics, so a pfn inside that range is answered without calling
+ * memblock_search_pfn_nid() again.  A failed lookup (-1) is returned as is
+ * and leaves the cache untouched.
+ */
+int __meminit __early_pfn_to_nid(unsigned long pfn)
+{
+	unsigned long start_pfn, end_pfn;
+	int nid;
+	/*
+	 * NOTE: The following SMP-unsafe globals are only used early in boot
+	 * when the kernel is running single-threaded.
+	 */
+	static unsigned long __meminitdata last_start_pfn, last_end_pfn;
+	static int __meminitdata last_nid;
+
+	if (last_start_pfn <= pfn && pfn < last_end_pfn)
+		return last_nid;
+
+	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
+	if (nid != -1) {
+		last_start_pfn = start_pfn;
+		last_end_pfn = end_pfn;
+		last_nid = nid;
+	}
+
+	return nid;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+/*
+ * Like __early_pfn_to_nid(), but never returns a negative value: node 0 is
+ * reported for a pfn whose node could not be determined.
+ */
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+	int nid;
+
+	nid = __early_pfn_to_nid(pfn);
+	if (nid >= 0)
+		return nid;
+	/* no node found for this pfn: fall back to node 0 */
+	return 0;
+}
+
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+	int nid;
+
+	nid = __early_pfn_to_nid(pfn);
+	if (nid >= 0 && nid != node)
+		return false;	/* pfn is known to belong to another node */
+	return true;		/* same node, or node could not be determined */
+}
+#endif
+
+/**
+ * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
+ * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
+ * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
+ *
+ * If an architecture guarantees that all ranges registered contain no holes
+ * and may be freed, this function may be used instead of calling
+ * memblock_free_early_nid() manually.
+ *
+ * Each range is clamped to @max_low_pfn before it is freed; a range that is
+ * empty after clamping is skipped.
+ */
+void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
+{
+	unsigned long start_pfn, end_pfn;
+	int i, this_nid;
+
+	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
+		start_pfn = min(start_pfn, max_low_pfn);
+		end_pfn = min(end_pfn, max_low_pfn);
+
+		if (start_pfn < end_pfn)
+			memblock_free_early_nid(PFN_PHYS(start_pfn),
+					(end_pfn - start_pfn) << PAGE_SHIFT,
+					this_nid);
+	}
+}
+
+/**
+ * sparse_memory_present_with_active_regions - Call memory_present for each active range
+ * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
+ *
+ * If an architecture guarantees that all ranges registered contain no holes and may
+ * be freed, this function may be used instead of calling memory_present() manually.
+ */
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+	unsigned long start_pfn, end_pfn;
+	int i, this_nid;
+
+	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
+		memory_present(this_nid, start_pfn, end_pfn);
+}
+
+/**
+ * get_pfn_range_for_nid - Return the start and end page frames for a node
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
+ *
+ * It returns the start and end page frame of a node based on information
+ * provided by memblock_set_node(). The result is the lowest start and the
+ * highest end over all ranges walked for @nid. If called for a node
+ * with no available memory, the start and end PFNs will both be 0; this
+ * function does not print a warning itself.
+ */
+void __meminit get_pfn_range_for_nid(unsigned int nid,
+			unsigned long *start_pfn, unsigned long *end_pfn)
+{
+	unsigned long this_start_pfn, this_end_pfn;
+	int i;
+
+	*start_pfn = -1UL;	/* sentinel: no range seen yet */
+	*end_pfn = 0;
+
+	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
+		*start_pfn = min(*start_pfn, this_start_pfn);
+		*end_pfn = max(*end_pfn, this_end_pfn);
+	}
+
+	if (*start_pfn == -1UL)
+		*start_pfn = 0;
+}
+
+/*
+ * This finds a zone that can be used for ZONE_MOVABLE pages.
The
+ * assumption is made that zones within a node are ordered in monotonic
+ * increasing memory addresses so that the "highest" populated zone is used
+ */
+static void __init find_usable_zone_for_movable(void)
+{
+	int zone_index;
+	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
+		if (zone_index == ZONE_MOVABLE)
+			continue;
+
+		if (arch_zone_highest_possible_pfn[zone_index] >
+				arch_zone_lowest_possible_pfn[zone_index])
+			break;
+	}
+
+	VM_BUG_ON(zone_index == -1);	/* the loop found no zone with a non-empty range */
+	movable_zone = zone_index;
+}
+
+/*
+ * The zone ranges provided by the architecture do not include ZONE_MOVABLE
+ * because it is sized independent of architecture. Unlike the other zones,
+ * the starting point for ZONE_MOVABLE is not fixed. It may be different
+ * in each node depending on the size of each node and how evenly kernelcore
+ * is distributed. This helper function adjusts the zone ranges
+ * provided by the architecture for a given node by using the end of the
+ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
+ * zones within a node are in order of monotonically increasing memory
+ * addresses.
+ *
+ * @zone_start_pfn and @zone_end_pfn are updated in place.  Nothing is changed
+ * when zone_movable_pfn[@nid] is 0.  @node_start_pfn is not used in this
+ * function body.
+ */
+static void __meminit adjust_zone_range_for_zone_movable(int nid,
+					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
+					unsigned long *zone_start_pfn,
+					unsigned long *zone_end_pfn)
+{
+	/* Only adjust if ZONE_MOVABLE is on this node */
+	if (zone_movable_pfn[nid]) {
+		/* Size ZONE_MOVABLE */
+		if (zone_type == ZONE_MOVABLE) {
+			*zone_start_pfn = zone_movable_pfn[nid];
+			*zone_end_pfn = min(node_end_pfn,
+				arch_zone_highest_possible_pfn[movable_zone]);
+
+		/* Adjust for ZONE_MOVABLE starting within this range */
+		} else if (*zone_start_pfn < zone_movable_pfn[nid] &&
+				*zone_end_pfn > zone_movable_pfn[nid]) {
+			*zone_end_pfn = zone_movable_pfn[nid];
+
+		/* Check if this whole range is within ZONE_MOVABLE */
+		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
+			*zone_start_pfn = *zone_end_pfn;
+	}
+}
+
+/*
+ * Return the number of pages a zone spans in a node, including holes
+ * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
+ */
+static unsigned long __meminit zone_spanned_pages_in_node(int nid,
+					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
+					unsigned long *ignored)
+{
+	unsigned long zone_start_pfn, zone_end_pfn;
+
+	/* Get the start and end of the zone */
+	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+	adjust_zone_range_for_zone_movable(nid, zone_type,
+				node_start_pfn, node_end_pfn,
+				&zone_start_pfn, &zone_end_pfn);
+
+	/* Check that this node has pages within the zone's required range */
+	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+		return 0;
+
+	/* Move the zone boundaries inside the node if necessary */
+	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
+	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+
+	/* Return the spanned
pages */ + return zone_end_pfn - zone_start_pfn; +} + +/* + * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, + * then all holes in the requested range will be accounted for. + */ +unsigned long __meminit __absent_pages_in_range(int nid, + unsigned long range_start_pfn, + unsigned long range_end_pfn) +{ + unsigned long nr_absent = range_end_pfn - range_start_pfn; + unsigned long start_pfn, end_pfn; + int i; + + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); + end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + nr_absent -= end_pfn - start_pfn; + } + return nr_absent; +} + +/** + * absent_pages_in_range - Return number of page frames in holes within a range + * @start_pfn: The start PFN to start searching for holes + * @end_pfn: The end PFN to stop searching for holes + * + * It returns the number of pages frames in memory holes within a range. + */ +unsigned long __init absent_pages_in_range(unsigned long start_pfn, + unsigned long end_pfn) +{ + return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); +} + +/* Return the number of page frames in holes in a zone on a node */ +static unsigned long __meminit zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *ignored) +{ + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; + unsigned long zone_start_pfn, zone_end_pfn; + + zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); + + adjust_zone_range_for_zone_movable(nid, zone_type, + node_start_pfn, node_end_pfn, + &zone_start_pfn, &zone_end_pfn); + return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); +} + +#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +static inline unsigned long __meminit 
zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zones_size) +{ + return zones_size[zone_type]; +} + +static inline unsigned long __meminit zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zholes_size) +{ + if (!zholes_size) + return 0; + + return zholes_size[zone_type]; +} + +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + +static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zones_size, + unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + enum zone_type i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + zones_size); + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= + zone_absent_pages_in_node(pgdat->node_id, i, + node_start_pfn, node_end_pfn, + zholes_size); + pgdat->node_present_pages = realtotalpages; + printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, + realtotalpages); +} + +#ifndef CONFIG_SPARSEMEM +/* + * Calculate the size of the zone->blockflags rounded to an unsigned long + * Start by making sure zonesize is a multiple of pageblock_order by rounding + * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally + * round what is now in bits to nearest long in bits, then return it in + * bytes. 
+ */ +static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) +{ + unsigned long usemapsize; + + zonesize += zone_start_pfn & (pageblock_nr_pages-1); + usemapsize = roundup(zonesize, pageblock_nr_pages); + usemapsize = usemapsize >> pageblock_order; + usemapsize *= NR_PAGEBLOCK_BITS; + usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); + + return usemapsize / 8; +} + +static void __init setup_usemap(struct pglist_data *pgdat, + struct zone *zone, + unsigned long zone_start_pfn, + unsigned long zonesize) +{ + unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); + zone->pageblock_flags = NULL; + if (usemapsize) + zone->pageblock_flags = + memblock_virt_alloc_node_nopanic(usemapsize, + pgdat->node_id); +} +#else +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, + unsigned long zone_start_pfn, unsigned long zonesize) {} +#endif /* CONFIG_SPARSEMEM */ + +#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE + +/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ +void __paginginit set_pageblock_order(void) +{ + unsigned int order; + + /* Check that pageblock_nr_pages has not already been setup */ + if (pageblock_order) + return; + + if (HPAGE_SHIFT > PAGE_SHIFT) + order = HUGETLB_PAGE_ORDER; + else + order = MAX_ORDER - 1; + + /* + * Assume the largest contiguous order of interest is a huge page. + * This value may be variable depending on boot parameters on IA64 and + * powerpc. + */ + pageblock_order = order; +} +#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ + +/* + * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() + * is unused as pageblock_order is set at compile-time. 
See + * include/linux/pageblock-flags.h for the values of pageblock_order based on + * the kernel config + */ +void __paginginit set_pageblock_order(void) +{ +} + +#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ + +static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, + unsigned long present_pages) +{ + unsigned long pages = spanned_pages; + + /* + * Provide a more accurate estimation if there are holes within + * the zone and SPARSEMEM is in use. If there are holes within the + * zone, each populated memory region may cost us one or two extra + * memmap pages due to alignment because memmap pages for each + * populated regions may not naturally algined on page boundary. + * So the (present_pages >> 4) heuristic is a tradeoff for that. + */ + if (spanned_pages > present_pages + (present_pages >> 4) && + IS_ENABLED(CONFIG_SPARSEMEM)) + pages = present_pages; + + return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; +} + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + * + * NOTE: pgdat should get zeroed by caller. 
+ */ +static void __paginginit free_area_init_core(struct pglist_data *pgdat, + unsigned long node_start_pfn, unsigned long node_end_pfn, + unsigned long *zones_size, unsigned long *zholes_size) +{ + enum zone_type j; + int nid = pgdat->node_id; + unsigned long zone_start_pfn = pgdat->node_start_pfn; + int ret; + + pgdat_resize_init(pgdat); +#ifdef CONFIG_NUMA_BALANCING + spin_lock_init(&pgdat->numabalancing_migrate_lock); + pgdat->numabalancing_migrate_nr_pages = 0; + pgdat->numabalancing_migrate_next_window = jiffies; +#endif + init_waitqueue_head(&pgdat->kswapd_wait); + init_waitqueue_head(&pgdat->pfmemalloc_wait); + pgdat_page_ext_init(pgdat); + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long size, realsize, freesize, memmap_pages; + + size = zone_spanned_pages_in_node(nid, j, node_start_pfn, + node_end_pfn, zones_size); + realsize = freesize = size - zone_absent_pages_in_node(nid, j, + node_start_pfn, + node_end_pfn, + zholes_size); + + /* + * Adjust freesize so that it accounts for how much memory + * is used by this zone for memmap. 
This affects the watermark + * and per-cpu initialisations + */ + memmap_pages = calc_memmap_size(size, realsize); + if (!is_highmem_idx(j)) { + if (freesize >= memmap_pages) { + freesize -= memmap_pages; + if (memmap_pages) + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); + } else + printk(KERN_WARNING + " %s zone: %lu pages exceeds freesize %lu\n", + zone_names[j], memmap_pages, freesize); + } + + /* Account for reserved pages */ + if (j == 0 && freesize > dma_reserve) { + freesize -= dma_reserve; + printk(KERN_DEBUG " %s zone: %lu pages reserved\n", + zone_names[0], dma_reserve); + } + + if (!is_highmem_idx(j)) + nr_kernel_pages += freesize; + /* Charge for highmem memmap if there are enough kernel pages */ + else if (nr_kernel_pages > memmap_pages * 2) + nr_kernel_pages -= memmap_pages; + nr_all_pages += freesize; + + zone->spanned_pages = size; + zone->present_pages = realsize; + /* + * Set an approximate value for lowmem here, it will be adjusted + * when the bootmem allocator frees pages into the buddy system. + * And all highmem pages will be managed by the buddy system. + */ + zone->managed_pages = is_highmem_idx(j) ? 
realsize : freesize; +#ifdef CONFIG_NUMA + zone->node = nid; + zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) + / 100; + zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; +#endif + zone->name = zone_names[j]; + spin_lock_init(&zone->lock); + spin_lock_init(&zone->lru_lock); + zone_seqlock_init(zone); + zone->zone_pgdat = pgdat; + zone_pcp_init(zone); + + /* For bootup, initialized properly in watermark setup */ + mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); + + lruvec_init(&zone->lruvec); + if (!size) + continue; + + set_pageblock_order(); + setup_usemap(pgdat, zone, zone_start_pfn, size); + ret = init_currently_empty_zone(zone, zone_start_pfn, + size, MEMMAP_EARLY); + BUG_ON(ret); + memmap_init(size, nid, j, zone_start_pfn); + zone_start_pfn += size; + } +} + +static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) +{ + /* Skip empty nodes */ + if (!pgdat->node_spanned_pages) + return; + +#ifdef CONFIG_FLAT_NODE_MEM_MAP + /* ia64 gets its own node_mem_map, before this, without bootmem */ + if (!pgdat->node_mem_map) { + unsigned long size, start, end; + struct page *map; + + /* + * The zone's endpoints aren't required to be MAX_ORDER + * aligned but the node_mem_map endpoints must be in order + * for the buddy allocator to function correctly. 
+ */ + start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + end = pgdat_end_pfn(pgdat); + end = ALIGN(end, MAX_ORDER_NR_PAGES); + size = (end - start) * sizeof(struct page); + map = alloc_remap(pgdat->node_id, size); + if (!map) + map = memblock_virt_alloc_node_nopanic(size, + pgdat->node_id); + pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); + } +#ifndef CONFIG_NEED_MULTIPLE_NODES + /* + * With no DISCONTIG, the global mem_map is just set as node 0's + */ + if (pgdat == NODE_DATA(0)) { + mem_map = NODE_DATA(0)->node_mem_map; +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + if (page_to_pfn(mem_map) != pgdat->node_start_pfn) + mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + } +#endif +#endif /* CONFIG_FLAT_NODE_MEM_MAP */ +} + +void __paginginit free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, unsigned long *zholes_size) +{ + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = 0; + unsigned long end_pfn = 0; + + /* pg_data_t should be reset to zero when it's allocated */ + WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + + pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); +#endif + calculate_node_totalpages(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); + + alloc_node_mem_map(pgdat); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", + nid, (unsigned long)pgdat, + (unsigned long)pgdat->node_mem_map); +#endif + + free_area_init_core(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); +} + +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + +#if MAX_NUMNODES > 1 +/* + * Figure out the number of possible node ids. 
+ */ +void __init setup_nr_node_ids(void) +{ + unsigned int node; + unsigned int highest = 0; + + for_each_node_mask(node, node_possible_map) + highest = node; + nr_node_ids = highest + 1; +} +#endif + +/** + * node_map_pfn_alignment - determine the maximum internode alignment + * + * This function should be called after node map is populated and sorted. + * It calculates the maximum power of two alignment which can distinguish + * all the nodes. + * + * For example, if all nodes are 1GiB and aligned to 1GiB, the return value + * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the + * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is + * shifted, 1GiB is enough and this function will indicate so. + * + * This is used to test whether pfn -> nid mapping of the chosen memory + * model has fine enough granularity to avoid incorrect mapping for the + * populated node map. + * + * Returns the determined alignment in pfn's. 0 if there is no alignment + * requirement (single node). + */ +unsigned long __init node_map_pfn_alignment(void) +{ + unsigned long accl_mask = 0, last_end = 0; + unsigned long start, end, mask; + int last_nid = -1; + int i, nid; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { + if (!start || last_nid < 0 || last_nid == nid) { + last_nid = nid; + last_end = end; + continue; + } + + /* + * Start with a mask granular enough to pin-point to the + * start pfn and tick off bits one-by-one until it becomes + * too coarse to separate the current node from the last. 
+ */ + mask = ~((1 << __ffs(start)) - 1); + while (mask && last_end <= (start & (mask << 1))) + mask <<= 1; + + /* accumulate all internode masks */ + accl_mask |= mask; + } + + /* convert mask to number of pages */ + return ~accl_mask + 1; +} + +/* Find the lowest pfn for a node */ +static unsigned long __init find_min_pfn_for_node(int nid) +{ + unsigned long min_pfn = ULONG_MAX; + unsigned long start_pfn; + int i; + + for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) + min_pfn = min(min_pfn, start_pfn); + + if (min_pfn == ULONG_MAX) { + printk(KERN_WARNING + "Could not find start_pfn for node %d\n", nid); + return 0; + } + + return min_pfn; +} + +/** + * find_min_pfn_with_active_regions - Find the minimum PFN registered + * + * It returns the minimum PFN based on information provided via + * memblock_set_node(). + */ +unsigned long __init find_min_pfn_with_active_regions(void) +{ + return find_min_pfn_for_node(MAX_NUMNODES); +} + +/* + * early_calculate_totalpages() + * Sum pages in active regions for movable zone. + * Populate N_MEMORY for calculating usable_nodes. + */ +static unsigned long __init early_calculate_totalpages(void) +{ + unsigned long totalpages = 0; + unsigned long start_pfn, end_pfn; + int i, nid; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + unsigned long pages = end_pfn - start_pfn; + + totalpages += pages; + if (pages) + node_set_state(nid, N_MEMORY); + } + return totalpages; +} + +/* + * Find the PFN the Movable zone begins in each node. Kernel memory + * is spread evenly between nodes as long as the nodes have enough + * memory. 
When they don't, some nodes will have more kernelcore than + * others + */ +static void __init find_zone_movable_pfns_for_nodes(void) +{ + int i, nid; + unsigned long usable_startpfn; + unsigned long kernelcore_node, kernelcore_remaining; + /* save the state before borrow the nodemask */ + nodemask_t saved_node_state = node_states[N_MEMORY]; + unsigned long totalpages = early_calculate_totalpages(); + int usable_nodes = nodes_weight(node_states[N_MEMORY]); + struct memblock_region *r; + + /* Need to find movable_zone earlier when movable_node is specified. */ + find_usable_zone_for_movable(); + + /* + * If movable_node is specified, ignore kernelcore and movablecore + * options. + */ + if (movable_node_is_enabled()) { + for_each_memblock(memory, r) { + if (!memblock_is_hotpluggable(r)) + continue; + + nid = r->nid; + + usable_startpfn = PFN_DOWN(r->base); + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + goto out2; + } + + /* + * If movablecore=nn[KMG] was specified, calculate what size of + * kernelcore that corresponds so that memory usable for + * any allocation type is evenly spread. If both kernelcore + * and movablecore are specified, then the value of kernelcore + * will be used for required_kernelcore if it's greater than + * what movablecore would have allowed. 
+ */ + if (required_movablecore) { + unsigned long corepages; + + /* + * Round-up so that ZONE_MOVABLE is at least as large as what + * was requested by the user + */ + required_movablecore = + roundup(required_movablecore, MAX_ORDER_NR_PAGES); + corepages = totalpages - required_movablecore; + + required_kernelcore = max(required_kernelcore, corepages); + } + + /* If kernelcore was not specified, there is no ZONE_MOVABLE */ + if (!required_kernelcore) + goto out; + + /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ + usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; + +restart: + /* Spread kernelcore memory as evenly as possible throughout nodes */ + kernelcore_node = required_kernelcore / usable_nodes; + for_each_node_state(nid, N_MEMORY) { + unsigned long start_pfn, end_pfn; + + /* + * Recalculate kernelcore_node if the division per node + * now exceeds what is necessary to satisfy the requested + * amount of memory for the kernel + */ + if (required_kernelcore < kernelcore_node) + kernelcore_node = required_kernelcore / usable_nodes; + + /* + * As the map is walked, we track how much memory is usable + * by the kernel using kernelcore_remaining. 
When it is + * 0, the rest of the node is usable by ZONE_MOVABLE + */ + kernelcore_remaining = kernelcore_node; + + /* Go through each range of PFNs within this node */ + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + unsigned long size_pages; + + start_pfn = max(start_pfn, zone_movable_pfn[nid]); + if (start_pfn >= end_pfn) + continue; + + /* Account for what is only usable for kernelcore */ + if (start_pfn < usable_startpfn) { + unsigned long kernel_pages; + kernel_pages = min(end_pfn, usable_startpfn) + - start_pfn; + + kernelcore_remaining -= min(kernel_pages, + kernelcore_remaining); + required_kernelcore -= min(kernel_pages, + required_kernelcore); + + /* Continue if range is now fully accounted */ + if (end_pfn <= usable_startpfn) { + + /* + * Push zone_movable_pfn to the end so + * that if we have to rebalance + * kernelcore across nodes, we will + * not double account here + */ + zone_movable_pfn[nid] = end_pfn; + continue; + } + start_pfn = usable_startpfn; + } + + /* + * The usable PFN range for ZONE_MOVABLE is from + * start_pfn->end_pfn. Calculate size_pages as the + * number of pages used as kernelcore + */ + size_pages = end_pfn - start_pfn; + if (size_pages > kernelcore_remaining) + size_pages = kernelcore_remaining; + zone_movable_pfn[nid] = start_pfn + size_pages; + + /* + * Some kernelcore has been met, update counts and + * break if the kernelcore for this node has been + * satisfied + */ + required_kernelcore -= min(required_kernelcore, + size_pages); + kernelcore_remaining -= size_pages; + if (!kernelcore_remaining) + break; + } + } + + /* + * If there is still required_kernelcore, we do another pass with one + * less node in the count. 
This will push zone_movable_pfn[nid] further + * along on the nodes that still have memory until kernelcore is + * satisfied + */ + usable_nodes--; + if (usable_nodes && required_kernelcore > usable_nodes) + goto restart; + +out2: + /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + zone_movable_pfn[nid] = + roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + +out: + /* restore the node_state */ + node_states[N_MEMORY] = saved_node_state; +} + +/* Any regular or high memory on that node ? */ +static void check_for_memory(pg_data_t *pgdat, int nid) +{ + enum zone_type zone_type; + + if (N_MEMORY == N_NORMAL_MEMORY) + return; + + for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + if (populated_zone(zone)) { + node_set_state(nid, N_HIGH_MEMORY); + if (N_NORMAL_MEMORY != N_HIGH_MEMORY && + zone_type <= ZONE_NORMAL) + node_set_state(nid, N_NORMAL_MEMORY); + break; + } + } +} + +/** + * free_area_init_nodes - Initialise all pg_data_t and zone data + * @max_zone_pfn: an array of max PFNs for each zone + * + * This will call free_area_init_node() for each active node in the system. + * Using the page ranges provided by memblock_set_node(), the size of each + * zone in each node and their holes is calculated. If the maximum PFN + * between two adjacent zones match, it is assumed that the zone is empty. + * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed + * that arch_max_dma32_pfn has no pages. It is also assumed that a zone + * starts where the previous one ended. For example, ZONE_DMA32 starts + * at arch_max_dma_pfn. 
+ */ +void __init free_area_init_nodes(unsigned long *max_zone_pfn) +{ + unsigned long start_pfn, end_pfn; + int i, nid; + + /* Record where the zone boundaries are */ + memset(arch_zone_lowest_possible_pfn, 0, + sizeof(arch_zone_lowest_possible_pfn)); + memset(arch_zone_highest_possible_pfn, 0, + sizeof(arch_zone_highest_possible_pfn)); + arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); + arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; + for (i = 1; i < MAX_NR_ZONES; i++) { + if (i == ZONE_MOVABLE) + continue; + arch_zone_lowest_possible_pfn[i] = + arch_zone_highest_possible_pfn[i-1]; + arch_zone_highest_possible_pfn[i] = + max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + } + arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; + arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; + + /* Find the PFNs that ZONE_MOVABLE begins at in each node */ + memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); + find_zone_movable_pfns_for_nodes(); + + /* Print out the zone ranges */ + pr_info("Zone ranges:\n"); + for (i = 0; i < MAX_NR_ZONES; i++) { + if (i == ZONE_MOVABLE) + continue; + pr_info(" %-8s ", zone_names[i]); + if (arch_zone_lowest_possible_pfn[i] == + arch_zone_highest_possible_pfn[i]) + pr_cont("empty\n"); + else + pr_cont("[mem %#018Lx-%#018Lx]\n", + (u64)arch_zone_lowest_possible_pfn[i] + << PAGE_SHIFT, + ((u64)arch_zone_highest_possible_pfn[i] + << PAGE_SHIFT) - 1); + } + + /* Print out the PFNs ZONE_MOVABLE begins at in each node */ + pr_info("Movable zone start for each node\n"); + for (i = 0; i < MAX_NUMNODES; i++) { + if (zone_movable_pfn[i]) + pr_info(" Node %d: %#018Lx\n", i, + (u64)zone_movable_pfn[i] << PAGE_SHIFT); + } + + /* Print out the early node map */ + pr_info("Early memory node ranges\n"); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) + pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, + ((u64)end_pfn << PAGE_SHIFT) - 1); + + /* Initialise every node */ + 
mminit_verify_pageflags_layout(); + setup_nr_node_ids(); + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + free_area_init_node(nid, NULL, + find_min_pfn_for_node(nid), NULL); + + /* Any memory on that node */ + if (pgdat->node_present_pages) + node_set_state(nid, N_MEMORY); + check_for_memory(pgdat, nid); + } +} + +static int __init cmdline_parse_core(char *p, unsigned long *core) +{ + unsigned long long coremem; + if (!p) + return -EINVAL; + + coremem = memparse(p, &p); + *core = coremem >> PAGE_SHIFT; + + /* Paranoid check that UL is enough for the coremem value */ + WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + + return 0; +} + +/* + * kernelcore=size sets the amount of memory for use for allocations that + * cannot be reclaimed or migrated. + */ +static int __init cmdline_parse_kernelcore(char *p) +{ + return cmdline_parse_core(p, &required_kernelcore); +} + +/* + * movablecore=size sets the amount of memory for use for allocations that + * can be reclaimed or migrated. 
+ */ +static int __init cmdline_parse_movablecore(char *p) +{ + return cmdline_parse_core(p, &required_movablecore); +} + +early_param("kernelcore", cmdline_parse_kernelcore); +early_param("movablecore", cmdline_parse_movablecore); + +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + +void adjust_managed_page_count(struct page *page, long count) +{ + spin_lock(&managed_page_count_lock); + page_zone(page)->managed_pages += count; + totalram_pages += count; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages += count; +#endif + spin_unlock(&managed_page_count_lock); +} +EXPORT_SYMBOL(adjust_managed_page_count); + +unsigned long free_reserved_area(void *start, void *end, int poison, char *s) +{ + void *pos; + unsigned long pages = 0; + + start = (void *)PAGE_ALIGN((unsigned long)start); + end = (void *)((unsigned long)end & PAGE_MASK); + for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { + if ((unsigned int)poison <= 0xFF) + memset(pos, poison, PAGE_SIZE); + free_reserved_page(virt_to_page(pos)); + } + + if (pages && s) + pr_info("Freeing %s memory: %ldK (%p - %p)\n", + s, pages << (PAGE_SHIFT - 10), start, end); + + return pages; +} +EXPORT_SYMBOL(free_reserved_area); + +#ifdef CONFIG_HIGHMEM +void free_highmem_page(struct page *page) +{ + __free_reserved_page(page); + totalram_pages++; + page_zone(page)->managed_pages++; + totalhigh_pages++; +} +#endif + + +void __init mem_init_print_info(const char *str) +{ + unsigned long physpages, codesize, datasize, rosize, bss_size; + unsigned long init_code_size, init_data_size; + + physpages = get_num_physpages(); + codesize = _etext - _stext; + datasize = _edata - _sdata; + rosize = __end_rodata - __start_rodata; + bss_size = __bss_stop - __bss_start; + init_data_size = __init_end - __init_begin; + init_code_size = _einittext - _sinittext; + + /* + * Detect special cases and adjust section sizes accordingly: + * 1) .init.* may be embedded into .data sections + * 2) .init.text.* may be out of [__init_begin, 
__init_end], + * please refer to arch/tile/kernel/vmlinux.lds.S. + * 3) .rodata.* may be embedded into .text or .data sections. + */ +#define adj_init_size(start, end, size, pos, adj) \ + do { \ + if (start <= pos && pos < end && size > adj) \ + size -= adj; \ + } while (0) + + adj_init_size(__init_begin, __init_end, init_data_size, + _sinittext, init_code_size); + adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); + adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); + adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); + adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); + +#undef adj_init_size + + pr_info("Memory: %luK/%luK available " + "(%luK kernel code, %luK rwdata, %luK rodata, " + "%luK init, %luK bss, %luK reserved, %luK cma-reserved" +#ifdef CONFIG_HIGHMEM + ", %luK highmem" +#endif + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), + totalcma_pages << (PAGE_SHIFT-10), +#ifdef CONFIG_HIGHMEM + totalhigh_pages << (PAGE_SHIFT-10), +#endif + str ? ", " : "", str ? str : ""); +} + +/** + * set_dma_reserve - set the specified number of pages reserved in the first zone + * @new_dma_reserve: The number of pages to mark reserved + * + * The per-cpu batchsize and zone watermarks are determined by present_pages. + * In the DMA zone, a significant percentage may be consumed by kernel image + * and other unfreeable allocations which can skew the watermarks badly. This + * function may optionally be used to account for unfreeable pages in the + * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and + * smaller per-cpu batchsize. 
+ */
+void __init set_dma_reserve(unsigned long new_dma_reserve)
+{
+	dma_reserve = new_dma_reserve;
+}
+
+void __init free_area_init(unsigned long *zones_size)
+{
+	free_area_init_node(0, zones_size,	/* thin wrapper: first argument is fixed at 0 */
+			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
+}
+
+static int page_alloc_cpu_notify(struct notifier_block *self,
+				 unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;	/* hcpu carries the CPU number by value */
+
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+		lru_add_drain_cpu(cpu);
+		drain_pages(cpu);
+
+		/*
+		 * Spill the event counters of the dead processor
+		 * into the current processors event counters.
+		 * This artificially elevates the count of the current
+		 * processor.
+		 */
+		vm_events_fold_cpu(cpu);
+
+		/*
+		 * Zero the differential counters of the dead processor
+		 * so that the vm statistics are consistent.
+		 *
+		 * This is only okay since the processor is dead and cannot
+		 * race with what we are doing.
+		 */
+		cpu_vm_stats_fold(cpu);
+	}
+	return NOTIFY_OK;	/* returned for every action, handled or not */
+}
+
+void __init page_alloc_init(void)
+{
+	hotcpu_notifier(page_alloc_cpu_notify, 0);	/* register the CPU_DEAD handler */
+	local_irq_lock_init(pa_lock);
+}
+
+/*
+ * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
+ *	or min_free_kbytes changes.
+ *
+ * For every zone of every online node this takes the largest
+ * lowmem_reserve[] entry at or above the zone's own index, adds the zone's
+ * high watermark, caps the result at managed_pages and stores it in
+ * zone->dirty_balance_reserve. The sum over all zones is written to both
+ * dirty_balance_reserve and totalreserve_pages.
+ */
+static void calculate_totalreserve_pages(void)
+{
+	struct pglist_data *pgdat;
+	unsigned long reserve_pages = 0;
+	enum zone_type i, j;
+
+	for_each_online_pgdat(pgdat) {
+		for (i = 0; i < MAX_NR_ZONES; i++) {
+			struct zone *zone = pgdat->node_zones + i;
+			long max = 0;
+
+			/* Find valid and maximum lowmem_reserve in the zone */
+			for (j = i; j < MAX_NR_ZONES; j++) {
+				if (zone->lowmem_reserve[j] > max)
+					max = zone->lowmem_reserve[j];
+			}
+
+			/* we treat the high watermark as reserved pages. */
+			max += high_wmark_pages(zone);
+
+			if (max > zone->managed_pages)	/* never reserve more than the zone manages */
+				max = zone->managed_pages;
+			reserve_pages += max;
+			/*
+			 * Lowmem reserves are not available to
+			 * GFP_HIGHUSER page cache allocations and
+			 * kswapd tries to balance zones to their high
+			 * watermark. As a result, neither should be
+			 * regarded as dirtyable memory, to prevent a
+			 * situation where reclaim has to clean pages
+			 * in order to balance the zones.
+			 */
+			zone->dirty_balance_reserve = max;
+		}
+	}
+	dirty_balance_reserve = reserve_pages;
+	totalreserve_pages = reserve_pages;
+}
+
+/*
+ * setup_per_zone_lowmem_reserve - called whenever
+ *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
+ *	has a correct pages reserved value, so an adequate number of
+ *	pages are left in the zone after a successful __alloc_pages().
+ *
+ * For each zone j, every lower zone idx gets
+ * lowmem_reserve[j] = (managed pages of zones idx+1..j) /
+ * sysctl_lowmem_reserve_ratio[idx]. Ratios below 1 are raised to 1 in
+ * place, which also keeps the division from dividing by zero.
+ * calculate_totalreserve_pages() is called at the end.
+ */
+static void setup_per_zone_lowmem_reserve(void)
+{
+	struct pglist_data *pgdat;
+	enum zone_type j, idx;
+
+	for_each_online_pgdat(pgdat) {
+		for (j = 0; j < MAX_NR_ZONES; j++) {
+			struct zone *zone = pgdat->node_zones + j;
+			unsigned long managed_pages = zone->managed_pages;
+
+			zone->lowmem_reserve[j] = 0;	/* a zone reserves nothing against itself */
+
+			idx = j;
+			while (idx) {
+				struct zone *lower_zone;
+
+				idx--;
+
+				if (sysctl_lowmem_reserve_ratio[idx] < 1)
+					sysctl_lowmem_reserve_ratio[idx] = 1;
+
+				lower_zone = pgdat->node_zones + idx;
+				lower_zone->lowmem_reserve[j] = managed_pages /
+					sysctl_lowmem_reserve_ratio[idx];
+				managed_pages += lower_zone->managed_pages;	/* accumulate for the next lower zone */
+			}
+		}
+	}
+
+	/* update totalreserve_pages */
+	calculate_totalreserve_pages();
+}
+
+static void __setup_per_zone_wmarks(void)
+{
+	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);	/* kbytes -> pages */
+	unsigned long lowmem_pages = 0;
+	struct zone *zone;
+	unsigned long flags;
+
+	/* Calculate total number of !ZONE_HIGHMEM pages */
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			lowmem_pages += zone->managed_pages;
+	}
+
+	for_each_zone(zone) {
+		u64 tmp;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		tmp = (u64)pages_min * zone->managed_pages;
+		do_div(tmp, lowmem_pages);	/* tmp = pages_min scaled by this zone's share of lowmem */
+		if (is_highmem(zone)) {
+			/*
+			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+			 * need highmem pages, so cap pages_min to a small
+			 * value here.
+			 *
+			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
+			 * deltas control asynch page reclaim, and so should
+			 * not be capped for highmem.
+			 */
+			unsigned long min_pages;
+
+			min_pages = zone->managed_pages / 1024;
+			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
+			zone->watermark[WMARK_MIN] = min_pages;
+		} else {
+			/*
+			 * If it's a lowmem zone, reserve a number of pages
+			 * proportionate to the zone's size.
+			 */
+			zone->watermark[WMARK_MIN] = tmp;
+		}
+
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);	/* min + tmp/4 */
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);	/* min + tmp/2 */
+
+		__mod_zone_page_state(zone, NR_ALLOC_BATCH,	/* delta that leaves the counter at high - low */
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+
+		setup_zone_migrate_reserve(zone);
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+
+	/* update totalreserve_pages */
+	calculate_totalreserve_pages();
+}
+
+/**
+ * setup_per_zone_wmarks - called when min_free_kbytes changes
+ * or when memory is hot-{added|removed}
+ *
+ * Ensures that the watermark[min,low,high] values for each zone are set
+ * correctly with respect to min_free_kbytes.
+ *
+ * The recomputation runs with zonelists_mutex held.
+ */
+void setup_per_zone_wmarks(void)
+{
+	mutex_lock(&zonelists_mutex);
+	__setup_per_zone_wmarks();
+	mutex_unlock(&zonelists_mutex);
+}
+
+/*
+ * The inactive anon list should be small enough that the VM never has to
+ * do too much work, but large enough that each inactive page has a chance
+ * to be referenced again before it is swapped out.
+ *
+ * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
+ * INACTIVE_ANON pages on this zone's LRU, maintained by the
+ * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
+ * the anonymous pages are kept on the inactive list.
+ *
+ * total     target    max
+ * memory    ratio     inactive anon
+ * -------------------------------------
+ *   10MB       1         5MB
+ *  100MB       1        50MB
+ *    1GB       3       250MB
+ *   10GB      10       0.9GB
+ *  100GB      31         3GB
+ *    1TB     101        10GB
+ *   10TB     320        32GB
+ */
+static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
+{
+	unsigned int gb, ratio;
+
+	/*
+	 * Zone size in gigabytes (truncated, so a zone smaller than 1GB
+	 * yields 0 and falls back to a ratio of 1).
+	 */
+	gb = zone->managed_pages >> (30 - PAGE_SHIFT);
+	if (gb)
+		ratio = int_sqrt(10 * gb);
+	else
+		ratio = 1;
+
+	zone->inactive_ratio = ratio;
+}
+
+static void __meminit setup_per_zone_inactive_ratio(void)
+{
+	struct zone *zone;
+
+	for_each_zone(zone)	/* recompute inactive_ratio for every zone */
+		calculate_zone_inactive_ratio(zone);
+}
+
+/*
+ * Initialise min_free_kbytes.
+ *
+ * For small machines we want it small (128k min).  For large machines
+ * we want it large (64MB max).  But it is not linear, because network
+ * bandwidth does not increase linearly with machine size.  We use
+ *
+ *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
+ *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
+ *
+ * which yields
+ *
+ * 16MB:	512k
+ * 32MB:	724k
+ * 64MB:	1024k
+ * 128MB:	1448k
+ * 256MB:	2048k
+ * 512MB:	2896k
+ * 1024MB:	4096k
+ * 2048MB:	5792k
+ * 4096MB:	8192k
+ * 8192MB:	11584k
+ * 16384MB:	16384k
+ *
+ * The computed value is applied (clamped to the range 128..65536) only when
+ * it is larger than user_min_free_kbytes; otherwise min_free_kbytes is left
+ * alone and a warning is printed. In both cases the per-zone watermarks,
+ * zone stat thresholds, lowmem reserves and inactive ratios are then
+ * recomputed. Always returns 0.
+ */
+int __meminit init_per_zone_wmark_min(void)
+{
+	unsigned long lowmem_kbytes;
+	int new_min_free_kbytes;
+
+	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);	/* pages -> kbytes */
+	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
+
+	if (new_min_free_kbytes > user_min_free_kbytes) {
+		min_free_kbytes = new_min_free_kbytes;
+		if (min_free_kbytes < 128)
+			min_free_kbytes = 128;
+		if (min_free_kbytes > 65536)
+			min_free_kbytes = 65536;
+	} else {
+		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+				new_min_free_kbytes, user_min_free_kbytes);
+	}
+	setup_per_zone_wmarks();
+	refresh_zone_stat_thresholds();
+	setup_per_zone_lowmem_reserve();
+	setup_per_zone_inactive_ratio();
+	return 0;
+}
+module_init(init_per_zone_wmark_min)
+
+/*
+ * min_free_kbytes_sysctl_handler - just a wrapper around
+ *	proc_dointvec_minmax() so that we can call setup_per_zone_wmarks()
+ *	whenever min_free_kbytes changes.
+ *
+ * On a successful write the new value is also recorded in
+ * user_min_free_kbytes. Reads do not touch the watermarks. Returns the
+ * proc_dointvec_minmax() result if that is non-zero, 0 otherwise.
+ */
+int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	if (write) {
+		user_min_free_kbytes = min_free_kbytes;	/* remember the user's choice */
+		setup_per_zone_wmarks();
+	}
+	return 0;
+}
+
+#ifdef CONFIG_NUMA
+int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	for_each_zone(zone)	/* runs on reads as well as writes: 'write' is not checked */
+		zone->min_unmapped_pages = (zone->managed_pages *
+				sysctl_min_unmapped_ratio) / 100;	/* ratio is a percentage of managed_pages */
+	return 0;
+}
+
+int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	for_each_zone(zone)	/* runs on reads as well as writes: 'write' is not checked */
+		zone->min_slab_pages = (zone->managed_pages *
+				sysctl_min_slab_ratio) / 100;	/* ratio is a percentage of managed_pages */
+	return 0;
+}
+#endif
+
+/*
+ * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
+ *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
+ *	whenever sysctl_lowmem_reserve_ratio changes.
+ *
+ * The reserve ratio obviously has absolutely no relation with the
+ * minimum watermarks. The lowmem reserve ratio can only make sense
+ * if in function of the boot time zone sizes.
+ */ +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, buffer, length, ppos); + setup_per_zone_lowmem_reserve(); + return 0; +} + +/* + * percpu_pagelist_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu + * pagelist can have before it gets flushed back to buddy allocator. + */ +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int old_percpu_pagelist_fraction; + int ret; + + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_fraction = percpu_pagelist_fraction; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_fraction && + percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { + percpu_pagelist_fraction = old_percpu_pagelist_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? 
*/ + if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) + goto out; + + for_each_populated_zone(zone) { + unsigned int cpu; + + for_each_possible_cpu(cpu) + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); + } +out: + mutex_unlock(&pcp_batch_high_lock); + return ret; +} + +int hashdist = HASHDIST_DEFAULT; + +#ifdef CONFIG_NUMA +static int __init set_hashdist(char *str) +{ + if (!str) + return 0; + hashdist = simple_strtoul(str, &str, 0); + return 1; +} +__setup("hashdist=", set_hashdist); +#endif + +/* + * allocate a large system hash table from bootmem + * - it is assumed that the hash table must contain an exact power-of-2 + * quantity of entries + * - limit is the number of hash buckets, not the total allocation size + */ +void *__init alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int flags, + unsigned int *_hash_shift, + unsigned int *_hash_mask, + unsigned long low_limit, + unsigned long high_limit) +{ + unsigned long long max = high_limit; + unsigned long log2qty, size; + void *table = NULL; + + /* allow the kernel cmdline to have a say */ + if (!numentries) { + /* round applicable memory size up to nearest megabyte */ + numentries = nr_kernel_pages; + + /* It isn't necessary when PAGE_SIZE >= 1MB */ + if (PAGE_SHIFT < 20) + numentries = round_up(numentries, (1<<20)/PAGE_SIZE); + + /* limit to 1 bucket per 2^scale bytes of low memory */ + if (scale > PAGE_SHIFT) + numentries >>= (scale - PAGE_SHIFT); + else + numentries <<= (PAGE_SHIFT - scale); + + /* Make sure we've got at least a 0-order allocation.. 
*/ + if (unlikely(flags & HASH_SMALL)) { + /* Makes no sense without HASH_EARLY */ + WARN_ON(!(flags & HASH_EARLY)); + if (!(numentries >> *_hash_shift)) { + numentries = 1UL << *_hash_shift; + BUG_ON(!numentries); + } + } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + numentries = PAGE_SIZE / bucketsize; + } + numentries = roundup_pow_of_two(numentries); + + /* limit allocation size to 1/16 total memory by default */ + if (max == 0) { + max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; + do_div(max, bucketsize); + } + max = min(max, 0x80000000ULL); + + if (numentries < low_limit) + numentries = low_limit; + if (numentries > max) + numentries = max; + + log2qty = ilog2(numentries); + + do { + size = bucketsize << log2qty; + if (flags & HASH_EARLY) + table = memblock_virt_alloc_nopanic(size, 0); + else if (hashdist) + table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + else { + /* + * If bucketsize is not a power-of-two, we may free + * some pages at the end of hash table which + * alloc_pages_exact() automatically does + */ + if (get_order(size) < MAX_ORDER) { + table = alloc_pages_exact(size, GFP_ATOMIC); + kmemleak_alloc(table, size, 1, GFP_ATOMIC); + } + } + } while (!table && size > PAGE_SIZE && --log2qty); + + if (!table) + panic("Failed to allocate %s hash table\n", tablename); + + printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", + tablename, + (1UL << log2qty), + ilog2(size) - PAGE_SHIFT, + size); + + if (_hash_shift) + *_hash_shift = log2qty; + if (_hash_mask) + *_hash_mask = (1 << log2qty) - 1; + + return table; +} + +/* Return a pointer to the bitmap storing bits affecting a block of pages */ +static inline unsigned long *get_pageblock_bitmap(struct zone *zone, + unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + return __pfn_to_section(pfn)->pageblock_flags; +#else + return zone->pageblock_flags; +#endif /* CONFIG_SPARSEMEM */ +} + +static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) +{ 
+#ifdef CONFIG_SPARSEMEM + pfn &= (PAGES_PER_SECTION-1); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#else + pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#endif /* CONFIG_SPARSEMEM */ +} + +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + struct zone *zone; + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long word; + + zone = page_zone(page); + bitmap = get_pageblock_bitmap(zone, pfn); + bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; +} + +/** + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in + */ +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + struct zone *zone; + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + + zone = page_zone(page); + bitmap = get_pageblock_bitmap(zone, pfn); + bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + 
VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); + + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = READ_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } +} + +/* + * This function checks whether pageblock includes unmovable pages or not. + * If @count is not zero, it is okay to include less @count unmovable pages + * + * PageLRU check without isolation or lru_lock could race so that + * MIGRATE_MOVABLE block might include unmovable pages. It means you can't + * expect this function should be exact. + */ +bool has_unmovable_pages(struct zone *zone, struct page *page, int count, + bool skip_hwpoisoned_pages) +{ + unsigned long pfn, iter, found; + int mt; + + /* + * For avoiding noise data, lru_add_drain_all() should be called + * If ZONE_MOVABLE, the zone never contains unmovable pages + */ + if (zone_idx(zone) == ZONE_MOVABLE) + return false; + mt = get_pageblock_migratetype(page); + if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) + return false; + + pfn = page_to_pfn(page); + for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { + unsigned long check = pfn + iter; + + if (!pfn_valid_within(check)) + continue; + + page = pfn_to_page(check); + + /* + * Hugepages are not in LRU lists, but they're movable. + * We need not scan over tail pages bacause we don't + * handle each tail page individually in migration. + */ + if (PageHuge(page)) { + iter = round_up(iter + 1, 1<_count is zero at all time. + */ + if (!atomic_read(&page->_count)) { + if (PageBuddy(page)) + iter += (1 << page_order(page)) - 1; + continue; + } + + /* + * The HWPoisoned page may be not in buddy system, and + * page_count() is not 0. + */ + if (skip_hwpoisoned_pages && PageHWPoison(page)) + continue; + + if (!PageLRU(page)) + found++; + /* + * If there are RECLAIMABLE pages, we need to check + * it. 
But now, memory offline itself doesn't call + * shrink_node_slabs() and it still to be fixed. + */ + /* + * If the page is not RAM, page_count()should be 0. + * we don't need more check. This is an _used_ not-movable page. + * + * The problematic thing here is PG_reserved pages. PG_reserved + * is set to both of a memory hole page and a _used_ kernel + * page at boot. + */ + if (found > count) + return true; + } + return false; +} + +bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone; + unsigned long pfn; + + /* + * We have to be careful here because we are iterating over memory + * sections which are not zone aware so we might end up outside of + * the zone but still within the section. + * We have to take care about the node as well. If the node is offline + * its NODE_DATA will be NULL - see page_zone. + */ + if (!node_online(page_to_nid(page))) + return false; + + zone = page_zone(page); + pfn = page_to_pfn(page); + if (!zone_spans_pfn(zone, pfn)) + return false; + + return !has_unmovable_pages(zone, page, 0, true); +} + +#ifdef CONFIG_CMA + +static unsigned long pfn_max_align_down(unsigned long pfn) +{ + return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages) - 1); +} + +static unsigned long pfn_max_align_up(unsigned long pfn) +{ + return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, + pageblock_nr_pages)); +} + +/* [start, end) must belong to a single zone. */ +static int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end) +{ + /* This function is based on compact_zone() from compaction.c. 
*/ + unsigned long nr_reclaimed; + unsigned long pfn = start; + unsigned int tries = 0; + int ret = 0; + + migrate_prep(); + + while (pfn < end || !list_empty(&cc->migratepages)) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + if (list_empty(&cc->migratepages)) { + cc->nr_migratepages = 0; + pfn = isolate_migratepages_range(cc, pfn, end); + if (!pfn) { + ret = -EINTR; + break; + } + tries = 0; + } else if (++tries == 5) { + ret = ret < 0 ? ret : -EBUSY; + break; + } + + nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, + &cc->migratepages); + cc->nr_migratepages -= nr_reclaimed; + + ret = migrate_pages(&cc->migratepages, alloc_migrate_target, + NULL, 0, cc->mode, MR_CMA); + } + if (ret < 0) { + putback_movable_pages(&cc->migratepages); + return ret; + } + return 0; +} + +/** + * alloc_contig_range() -- tries to allocate given range of pages + * @start: start PFN to allocate + * @end: one-past-the-last PFN to allocate + * @migratetype: migratetype of the underlaying pageblocks (either + * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks + * in range must have the same migratetype and it must + * be either of the two. + * + * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES + * aligned, however it's the caller's responsibility to guarantee that + * we are the only thread that changes migrate type of pageblocks the + * pages fall in. + * + * The PFN range must belong to a single zone. + * + * Returns zero on success or negative error code. On success all + * pages which PFN is in [start, end) are allocated for the caller and + * need to be freed with free_contig_range(). 
+ */
+int alloc_contig_range(unsigned long start, unsigned long end,
+		       unsigned migratetype)
+{
+	unsigned long outer_start, outer_end;
+	int ret = 0, order;
+
+	struct compact_control cc = {
+		.nr_migratepages = 0,
+		.order = -1,
+		.zone = page_zone(pfn_to_page(start)),
+		.mode = MIGRATE_SYNC,
+		.ignore_skip_hint = true,
+	};
+	INIT_LIST_HEAD(&cc.migratepages);
+
+	/*
+	 * What we do here is we mark all pageblocks in range as
+	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
+	 * have different sizes, and due to the way page allocator
+	 * work, we align the range to biggest of the two pages so
+	 * that page allocator won't try to merge buddies from
+	 * different pageblocks and change MIGRATE_ISOLATE to some
+	 * other migration type.
+	 *
+	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
+	 * migrate the pages from an unaligned range (ie. pages that
+	 * we are interested in).  This will put all the pages in
+	 * range back to page allocator as MIGRATE_ISOLATE.
+	 *
+	 * When this is done, we take the pages in range from page
+	 * allocator removing them from the buddy system.  This way
+	 * page allocator will never consider using them.
+	 *
+	 * This lets us mark the pageblocks back as
+	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
+	 * aligned range but not in the unaligned, original range are
+	 * put back to page allocator so that buddy can use them.
+	 */
+
+	ret = start_isolate_page_range(pfn_max_align_down(start),
+				       pfn_max_align_up(end), migratetype,
+				       false);
+	if (ret)
+		return ret;	/* nothing was isolated, so nothing to undo */
+
+	ret = __alloc_contig_migrate_range(&cc, start, end);
+	if (ret)
+		goto done;
+
+	/*
+	 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
+	 * more, all pages in [start, end) are free in page allocator.
+	 * What we are going to do is to allocate all pages from
+	 * [start, end) (that is remove them from page allocator).
+	 *
+	 * The only problem is that pages at the beginning and at the
+	 * end of interesting range may be not aligned with pages that
+	 * page allocator holds, ie. they can be part of higher order
+	 * pages.  Because of this, we reserve the bigger range and
+	 * once this is done free the pages we are not interested in.
+	 *
+	 * We don't have to hold zone->lock here because the pages are
+	 * isolated thus they won't get removed from buddy.
+	 */
+
+	lru_add_drain_all();
+	drain_all_pages(cc.zone);
+
+	order = 0;
+	outer_start = start;
+	while (!PageBuddy(pfn_to_page(outer_start))) {
+		if (++order >= MAX_ORDER) {	/* no buddy head found at any order */
+			ret = -EBUSY;
+			goto done;
+		}
+		outer_start &= ~0UL << order;	/* step down to the next order-aligned pfn */
+	}
+
+	/* Make sure the range is really isolated. */
+	if (test_pages_isolated(outer_start, end, false)) {
+		pr_info("%s: [%lx, %lx) PFNs busy\n",
+			__func__, outer_start, end);
+		ret = -EBUSY;
+		goto done;
+	}
+
+	/* Grab isolated pages from freelists. */
+	outer_end = isolate_freepages_range(&cc, outer_start, end);
+	if (!outer_end) {
+		ret = -EBUSY;
+		goto done;
+	}
+
+	/* Free head and tail (if any) */
+	if (start != outer_start)
+		free_contig_range(outer_start, start - outer_start);
+	if (end != outer_end)
+		free_contig_range(end, outer_end - end);
+
+done:
+	undo_isolate_page_range(pfn_max_align_down(start),
+				pfn_max_align_up(end), migratetype);	/* runs on success and failure alike */
+	return ret;
+}
+
+void free_contig_range(unsigned long pfn, unsigned nr_pages)
+{
+	unsigned int count = 0;
+
+	for (; nr_pages--; pfn++) {
+		struct page *page = pfn_to_page(pfn);
+
+		count += page_count(page) != 1;	/* tally pages whose refcount is not exactly 1 */
+		__free_page(page);
+	}
+	WARN(count != 0, "%d pages are still in use!\n", count);
+}
+#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * The zone indicated has a new number of managed_pages; batch sizes and percpu
+ * page high values need to be recalculated.
+ */ +void __meminit zone_pcp_update(struct zone *zone) +{ + unsigned cpu; + mutex_lock(&pcp_batch_high_lock); + for_each_possible_cpu(cpu) + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); + mutex_unlock(&pcp_batch_high_lock); +} +#endif + +void zone_pcp_reset(struct zone *zone) +{ + unsigned long flags; + int cpu; + struct per_cpu_pageset *pset; + + /* avoid races with drain_pages() */ + local_lock_irqsave(pa_lock, flags); + if (zone->pageset != &boot_pageset) { + for_each_online_cpu(cpu) { + pset = per_cpu_ptr(zone->pageset, cpu); + drain_zonestat(zone, pset); + } + free_percpu(zone->pageset); + zone->pageset = &boot_pageset; + } + local_unlock_irqrestore(pa_lock, flags); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* + * All pages in the range must be isolated before calling this. + */ +void +__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + struct page *page; + struct zone *zone; + unsigned int order, i; + unsigned long pfn; + unsigned long flags; + /* find the first valid pfn */ + for (pfn = start_pfn; pfn < end_pfn; pfn++) + if (pfn_valid(pfn)) + break; + if (pfn == end_pfn) + return; + zone = page_zone(pfn_to_page(pfn)); + spin_lock_irqsave(&zone->lock, flags); + pfn = start_pfn; + while (pfn < end_pfn) { + if (!pfn_valid(pfn)) { + pfn++; + continue; + } + page = pfn_to_page(pfn); + /* + * The HWPoisoned page may be not in buddy system, and + * page_count() is not 0. 
+ */ + if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { + pfn++; + SetPageReserved(page); + continue; + } + + BUG_ON(page_count(page)); + BUG_ON(!PageBuddy(page)); + order = page_order(page); +#ifdef CONFIG_DEBUG_VM + printk(KERN_INFO "remove from free list %lx %d %lx\n", + pfn, 1 << order, end_pfn); +#endif + list_del(&page->lru); + rmv_page_order(page); + zone->free_area[order].nr_free--; + for (i = 0; i < (1 << order); i++) + SetPageReserved((page+i)); + pfn += (1 << order); + } + spin_unlock_irqrestore(&zone->lock, flags); +} +#endif + +#ifdef CONFIG_MEMORY_FAILURE +bool is_free_buddy_page(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + unsigned int order; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct page *page_head = page - (pfn & ((1 << order) - 1)); + + if (PageBuddy(page_head) && page_order(page_head) >= order) + break; + } + spin_unlock_irqrestore(&zone->lock, flags); + + return order < MAX_ORDER; +} +#endif diff --git a/kernel/mm/page_counter.c b/kernel/mm/page_counter.c new file mode 100644 index 000000000..11b4beda1 --- /dev/null +++ b/kernel/mm/page_counter.c @@ -0,0 +1,193 @@ +/* + * Lockless hierarchical page accounting & limiting + * + * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner + */ + +#include +#include +#include +#include +#include +#include +#include + +/** + * page_counter_cancel - take pages out of the local counter + * @counter: counter + * @nr_pages: number of pages to cancel + */ +void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) +{ + long new; + + new = atomic_long_sub_return(nr_pages, &counter->count); + /* More uncharges than charges? */ + WARN_ON_ONCE(new < 0); +} + +/** + * page_counter_charge - hierarchically charge pages + * @counter: counter + * @nr_pages: number of pages to charge + * + * NOTE: This does not consider any configured counter limits. 
+ */
+void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
+{
+	struct page_counter *c;
+
+	for (c = counter; c; c = c->parent) {	/* charge this counter and every ancestor */
+		long new;
+
+		new = atomic_long_add_return(nr_pages, &c->count);
+		/*
+		 * This is indeed racy, but we can live with some
+		 * inaccuracy in the watermark.
+		 */
+		if (new > c->watermark)
+			c->watermark = new;
+	}
+}
+
+/**
+ * page_counter_try_charge - try to hierarchically charge pages
+ * @counter: counter
+ * @nr_pages: number of pages to charge
+ * @fail: set to the first counter that hit its limit, if any
+ *
+ * Returns 0 on success, or -ENOMEM and @fail if the counter or one of
+ * its ancestors has hit its configured limit.
+ *
+ * @fail is written only on failure. In that case the charge is undone on
+ * the failing counter and on every counter below it, so nothing stays
+ * charged.
+ */
+int page_counter_try_charge(struct page_counter *counter,
+			    unsigned long nr_pages,
+			    struct page_counter **fail)
+{
+	struct page_counter *c;
+
+	for (c = counter; c; c = c->parent) {
+		long new;
+		/*
+		 * Charge speculatively to avoid an expensive CAS.  If
+		 * a bigger charge fails, it might falsely lock out a
+		 * racing smaller charge and send it into reclaim
+		 * early, but the error is limited to the difference
+		 * between the two sizes, which is less than 2M/4M in
+		 * case of a THP locking out a regular page charge.
+		 *
+		 * The atomic_long_add_return() implies a full memory
+		 * barrier between incrementing the count and reading
+		 * the limit.  When racing with page_counter_limit(),
+		 * we either see the new limit or the setter sees the
+		 * counter has changed and retries.
+		 */
+		new = atomic_long_add_return(nr_pages, &c->count);
+		if (new > c->limit) {
+			atomic_long_sub(nr_pages, &c->count);	/* take the speculative charge back */
+			/*
+			 * This is racy, but we can live with some
+			 * inaccuracy in the failcnt.
+			 */
+			c->failcnt++;
+			*fail = c;
+			goto failed;
+		}
+		/*
+		 * Just like with failcnt, we can live with some
+		 * inaccuracy in the watermark.
+		 */
+		if (new > c->watermark)
+			c->watermark = new;
+	}
+	return 0;
+
+failed:
+	for (c = counter; c != *fail; c = c->parent)	/* roll back the counters charged before *fail */
+		page_counter_cancel(c, nr_pages);
+
+	return -ENOMEM;
+}
+
+/**
+ * page_counter_uncharge - hierarchically uncharge pages
+ * @counter: counter
+ * @nr_pages: number of pages to uncharge
+ */
+void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
+{
+	struct page_counter *c;
+
+	for (c = counter; c; c = c->parent)	/* uncharge this counter and every ancestor */
+		page_counter_cancel(c, nr_pages);
+}
+
+/**
+ * page_counter_limit - limit the number of pages allowed
+ * @counter: counter
+ * @limit: limit to set
+ *
+ * Returns 0 on success, -EBUSY if the current number of pages on the
+ * counter already exceeds the specified limit.
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+int page_counter_limit(struct page_counter *counter, unsigned long limit)
+{
+	for (;;) {
+		unsigned long old;
+		long count;
+
+		/*
+		 * Update the limit while making sure that it's not
+		 * below the concurrently-changing counter value.
+		 *
+		 * The xchg implies two full memory barriers before
+		 * and after, so the read-swap-read is ordered and
+		 * ensures coherency with page_counter_try_charge():
+		 * that function modifies the count before checking
+		 * the limit, so if it sees the old limit, we see the
+		 * modified counter and retry.
+		 */
+		count = atomic_long_read(&counter->count);
+
+		if (count > limit)
+			return -EBUSY;
+
+		old = xchg(&counter->limit, limit);
+
+		if (atomic_long_read(&counter->count) <= count)	/* count did not grow meanwhile */
+			return 0;
+
+		counter->limit = old;	/* count grew: restore the old limit and retry */
+		cond_resched();
+	}
+}
+
+/**
+ * page_counter_memparse - memparse() for page counter limits
+ * @buf: string to parse
+ * @max: string meaning maximum possible value
+ * @nr_pages: returns the result in number of pages
+ *
+ * Returns -EINVAL, or 0 and @nr_pages on success.  @nr_pages will be
+ * limited to %PAGE_COUNTER_MAX.
+ */
+int page_counter_memparse(const char *buf, const char *max,
+			  unsigned long *nr_pages)
+{
+	char *end;
+	u64 bytes;
+
+	if (!strcmp(buf, max)) {	/* exact match with the "maximum" keyword */
+		*nr_pages = PAGE_COUNTER_MAX;
+		return 0;
+	}
+
+	bytes = memparse(buf, &end);
+	if (*end != '\0')	/* anything left after the parsed value is rejected */
+		return -EINVAL;
+
+	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);	/* bytes -> whole pages, rounded down */
+
+	return 0;
+}
diff --git a/kernel/mm/page_ext.c b/kernel/mm/page_ext.c
new file mode 100644
index 000000000..d86fd2f53
--- /dev/null
+++ b/kernel/mm/page_ext.c
@@ -0,0 +1,403 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * struct page extension
+ *
+ * This is the feature to manage memory for extended data per page.
+ *
+ * Until now, we must modify struct page itself to store extra data per page.
+ * This requires rebuilding the kernel and it is really time consuming process.
+ * And, sometimes, rebuild is impossible due to third party module dependency.
+ * At last, enlarging struct page could cause un-wanted system behaviour change.
+ *
+ * This feature is intended to overcome above mentioned problems. This feature
+ * allocates memory for extended data per page in certain place rather than
+ * the struct page itself. This memory can be accessed by the accessor
+ * functions provided by this code. During the boot process, it checks whether
+ * allocation of huge chunk of memory is needed or not. If not, it avoids
+ * allocating memory at all. With this advantage, we can include this feature
+ * into the kernel in default and can avoid rebuild and solve related problems.
+ *
+ * To help these things to work well, there are two callbacks for clients. One
+ * is the need callback which is mandatory if user wants to avoid useless
+ * memory allocation at boot-time. The other is optional, init callback, which
+ * is used to do proper initialization after memory is allocated.
+ *
+ * The need callback is used to decide whether extended memory allocation is
+ * needed or not.
Sometimes users want to deactivate some features in this + * boot and extra memory would be unnecessary. In this case, to avoid + * allocating a huge chunk of memory, each client represents its need of + * extra memory through the need callback. If one of the need callbacks + * returns true, it means that someone needs extra memory so that + * page extension core should allocate memory for page extension. If + * none of the need callbacks return true, memory isn't needed at all in this boot + * and page extension core can skip allocating memory. As a result, + * no memory is wasted. + * + * The init callback is used to do proper initialization after page extension + * is completely initialized. In a sparse memory system, extra memory is + * allocated some time later than memmap is allocated. In other words, the lifetime + * of memory for page extension isn't the same as that of memmap for struct page. + * Therefore, clients can't store extra data until page extension is + * initialized, even if pages are allocated and used freely. This could + * cause an inadequate state of extra data per page, so, to prevent it, a client + * can utilize this callback to initialize its state correctly. 
+ */ + +static struct page_ext_operations *page_ext_ops[] = { + &debug_guardpage_ops, +#ifdef CONFIG_PAGE_POISONING + &page_poisoning_ops, +#endif +#ifdef CONFIG_PAGE_OWNER + &page_owner_ops, +#endif +}; + +static unsigned long total_usage; + +static bool __init invoke_need_callbacks(void) +{ + int i; + int entries = ARRAY_SIZE(page_ext_ops); + + for (i = 0; i < entries; i++) { + if (page_ext_ops[i]->need && page_ext_ops[i]->need()) + return true; + } + + return false; +} + +static void __init invoke_init_callbacks(void) +{ + int i; + int entries = ARRAY_SIZE(page_ext_ops); + + for (i = 0; i < entries; i++) { + if (page_ext_ops[i]->init) + page_ext_ops[i]->init(); + } +} + +#if !defined(CONFIG_SPARSEMEM) + + +void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) +{ + pgdat->node_page_ext = NULL; +} + +struct page_ext *lookup_page_ext(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long offset; + struct page_ext *base; + + base = NODE_DATA(page_to_nid(page))->node_page_ext; +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_ext arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. + */ + if (unlikely(!base)) + return NULL; +#endif + offset = pfn - round_down(node_start_pfn(page_to_nid(page)), + MAX_ORDER_NR_PAGES); + return base + offset; +} + +static int __init alloc_node_page_ext(int nid) +{ + struct page_ext *base; + unsigned long table_size; + unsigned long nr_pages; + + nr_pages = NODE_DATA(nid)->node_spanned_pages; + if (!nr_pages) + return 0; + + /* + * Need extra space if node range is not aligned with + * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm + * checks buddy's status, range could be out of exact node range. 
+ */ + if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || + !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) + nr_pages += MAX_ORDER_NR_PAGES; + + table_size = sizeof(struct page_ext) * nr_pages; + + base = memblock_virt_alloc_try_nid_nopanic( + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); + if (!base) + return -ENOMEM; + NODE_DATA(nid)->node_page_ext = base; + total_usage += table_size; + return 0; +} + +void __init page_ext_init_flatmem(void) +{ + + int nid, fail; + + if (!invoke_need_callbacks()) + return; + + for_each_online_node(nid) { + fail = alloc_node_page_ext(nid); + if (fail) + goto fail; + } + pr_info("allocated %ld bytes of page_ext\n", total_usage); + invoke_init_callbacks(); + return; + +fail: + pr_crit("allocation of page_ext failed.\n"); + panic("Out of memory"); +} + +#else /* CONFIG_FLAT_NODE_MEM_MAP */ + +struct page_ext *lookup_page_ext(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + struct mem_section *section = __pfn_to_section(pfn); +#ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_ext arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. 
+ */ + if (!section->page_ext) + return NULL; +#endif + return section->page_ext + pfn; +} + +static void *__meminit alloc_page_ext(size_t size, int nid) +{ + gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; + void *addr = NULL; + + addr = alloc_pages_exact_nid(nid, size, flags); + if (addr) { + kmemleak_alloc(addr, size, 1, flags); + return addr; + } + + if (node_state(nid, N_HIGH_MEMORY)) + addr = vzalloc_node(size, nid); + else + addr = vzalloc(size); + + return addr; +} + +static int __meminit init_section_page_ext(unsigned long pfn, int nid) +{ + struct mem_section *section; + struct page_ext *base; + unsigned long table_size; + + section = __pfn_to_section(pfn); + + if (section->page_ext) + return 0; + + table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + base = alloc_page_ext(table_size, nid); + + /* + * The value stored in section->page_ext is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. + */ + kmemleak_not_leak(base); + + if (!base) { + pr_err("page ext allocation failure\n"); + return -ENOMEM; + } + + /* + * The passed "pfn" may not be aligned to SECTION. For the calculation + * we need to apply a mask. 
+ */ + pfn &= PAGE_SECTION_MASK; + section->page_ext = base - pfn; + total_usage += table_size; + return 0; +} +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Release a page_ext table: vfree() it when the address is a vmalloc + * address, otherwise hand it back with free_pages_exact(). + */ +static void free_page_ext(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); + } else { + struct page *page = virt_to_page(addr); + size_t table_size; + + table_size = sizeof(struct page_ext) * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + /* + * Balance the kmemleak_alloc() done in alloc_page_ext() so that + * kmemleak stops tracking the table once it is freed. + */ + kmemleak_free(addr); + free_pages_exact(addr, table_size); + } +} + +static void __free_page_ext(unsigned long pfn) +{ + struct mem_section *ms; + struct page_ext *base; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_ext) + return; + base = ms->page_ext + pfn; + free_page_ext(base); + ms->page_ext = NULL; +} + +static int __meminit online_page_ext(unsigned long start_pfn, + unsigned long nr_pages, + int nid) +{ + unsigned long start, end, pfn; + int fail = 0; + + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); + + if (nid == -1) { + /* + * In this case, "nid" already exists and contains valid memory. + * "start_pfn" passed to us is a pfn which is an arg for + * online_pages(), and start_pfn should exist. 
+ */ + nid = pfn_to_nid(start_pfn); + VM_BUG_ON(!node_state(nid, N_ONLINE)); + } + + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { + if (!pfn_present(pfn)) + continue; + fail = init_section_page_ext(pfn, nid); + } + if (!fail) + return 0; + + /* rollback */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_ext(pfn); + + return -ENOMEM; +} + +static int __meminit offline_page_ext(unsigned long start_pfn, + unsigned long nr_pages, int nid) +{ + unsigned long start, end, pfn; + + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); + + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_ext(pfn); + return 0; + +} + +static int __meminit page_ext_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + ret = online_page_ext(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_OFFLINE: + offline_page_ext(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_CANCEL_ONLINE: + offline_page_ext(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_GOING_OFFLINE: + break; + case MEM_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } + + return notifier_from_errno(ret); +} + +#endif + +void __init page_ext_init(void) +{ + unsigned long pfn; + int nid; + + if (!invoke_need_callbacks()) + return; + + for_each_node_state(nid, N_MEMORY) { + unsigned long start_pfn, end_pfn; + + start_pfn = node_start_pfn(nid); + end_pfn = node_end_pfn(nid); + /* + * start_pfn and end_pfn may not be aligned to SECTION and the + * page->flags of out of node pages are not initialized. So we + * scan [start_pfn, the biggest section's pfn < end_pfn) here. + */ + for (pfn = start_pfn; pfn < end_pfn; + pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { + + if (!pfn_valid(pfn)) + continue; + /* + * Nodes's pfns can be overlapping. 
+ * We know some arch can have a nodes layout such as + * -------------pfn--------------> + * N0 | N1 | N2 | N0 | N1 | N2|.... + */ + if (pfn_to_nid(pfn) != nid) + continue; + if (init_section_page_ext(pfn, nid)) + goto oom; + } + } + hotplug_memory_notifier(page_ext_callback, 0); + pr_info("allocated %ld bytes of page_ext\n", total_usage); + invoke_init_callbacks(); + return; + +oom: + panic("Out of memory"); +} + +void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) +{ +} + +#endif diff --git a/kernel/mm/page_io.c b/kernel/mm/page_io.c new file mode 100644 index 000000000..6424869e2 --- /dev/null +++ b/kernel/mm/page_io.c @@ -0,0 +1,381 @@ +/* + * linux/mm/page_io.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, + * Asynchronous swapping added 30.12.95. Stephen Tweedie + * Removed race in async swapping. 14.4.1996. Bruno Haible + * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie + * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct bio *get_swap_bio(gfp_t gfp_flags, + struct page *page, bio_end_io_t end_io) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, 1); + if (bio) { + bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); + bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; + bio->bi_io_vec[0].bv_page = page; + bio->bi_io_vec[0].bv_len = PAGE_SIZE; + bio->bi_io_vec[0].bv_offset = 0; + bio->bi_vcnt = 1; + bio->bi_iter.bi_size = PAGE_SIZE; + bio->bi_end_io = end_io; + } + return bio; +} + +void end_swap_bio_write(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct page *page = bio->bi_io_vec[0].bv_page; + + if (!uptodate) { + SetPageError(page); + /* + * We failed to write the page out to swap-space. 
+ * Re-dirty the page in order to avoid it being reclaimed. + * Also print a dire warning that things will go BAD (tm) + * very quickly. + * + * Also clear PG_reclaim to avoid rotate_reclaimable_page() + */ + set_page_dirty(page); + printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); + ClearPageReclaim(page); + } + end_page_writeback(page); + bio_put(bio); +} + +void end_swap_bio_read(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct page *page = bio->bi_io_vec[0].bv_page; + + if (!uptodate) { + SetPageError(page); + ClearPageUptodate(page); + printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); + goto out; + } + + SetPageUptodate(page); + + /* + * There is no guarantee that the page is in swap cache - the software + * suspend code (at least) uses end_swap_bio_read() against a non- + * swapcache page. So we must check PG_swapcache before proceeding with + * this optimization. + */ + if (likely(PageSwapCache(page))) { + struct swap_info_struct *sis; + + sis = page_swap_info(page); + if (sis->flags & SWP_BLKDEV) { + /* + * The swap subsystem performs lazy swap slot freeing, + * expecting that the page will be swapped out again. + * So we can avoid an unnecessary write if the page + * isn't redirtied. + * This is good for real swap storage because we can + * reduce unnecessary I/O and enhance wear-leveling + * if an SSD is used as the swap device. + * But if an in-memory swap device (e.g. zram) is used, + * this causes a duplicated copy between uncompressed + * data in VM-owned memory and compressed data in + * zram-owned memory. 
So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty*, + * so the page should be swapped out somewhere again if + * we again wish to reclaim it. + */ + struct gendisk *disk = sis->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) { + swp_entry_t entry; + unsigned long offset; + + entry.val = page_private(page); + offset = swp_offset(entry); + + SetPageDirty(page); + disk->fops->swap_slot_free_notify(sis->bdev, + offset); + } + } + } + +out: + unlock_page(page); + bio_put(bio); +} + +int generic_swapfile_activate(struct swap_info_struct *sis, + struct file *swap_file, + sector_t *span) +{ + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + unsigned blocks_per_page; + unsigned long page_no; + unsigned blkbits; + sector_t probe_block; + sector_t last_block; + sector_t lowest_block = -1; + sector_t highest_block = 0; + int nr_extents = 0; + int ret; + + blkbits = inode->i_blkbits; + blocks_per_page = PAGE_SIZE >> blkbits; + + /* + * Map all the blocks into the extent list. This code doesn't try + * to be very smart. 
+ */ + probe_block = 0; + page_no = 0; + last_block = i_size_read(inode) >> blkbits; + while ((probe_block + blocks_per_page) <= last_block && + page_no < sis->max) { + unsigned block_in_page; + sector_t first_block; + + first_block = bmap(inode, probe_block); + if (first_block == 0) + goto bad_bmap; + + /* + * It must be PAGE_SIZE aligned on-disk + */ + if (first_block & (blocks_per_page - 1)) { + probe_block++; + goto reprobe; + } + + for (block_in_page = 1; block_in_page < blocks_per_page; + block_in_page++) { + sector_t block; + + block = bmap(inode, probe_block + block_in_page); + if (block == 0) + goto bad_bmap; + if (block != first_block + block_in_page) { + /* Discontiguity */ + probe_block++; + goto reprobe; + } + } + + first_block >>= (PAGE_SHIFT - blkbits); + if (page_no) { /* exclude the header page */ + if (first_block < lowest_block) + lowest_block = first_block; + if (first_block > highest_block) + highest_block = first_block; + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, page_no, 1, first_block); + if (ret < 0) + goto out; + nr_extents += ret; + page_no++; + probe_block += blocks_per_page; +reprobe: + continue; + } + ret = nr_extents; + *span = 1 + highest_block - lowest_block; + if (page_no == 0) + page_no = 1; /* force Empty message */ + sis->max = page_no; + sis->pages = page_no - 1; + sis->highest_bit = page_no - 1; +out: + return ret; +bad_bmap: + printk(KERN_ERR "swapon: swapfile has holes\n"); + ret = -EINVAL; + goto out; +} + +/* + * We may have stale swap cache pages in memory: notice + * them here and get rid of the unnecessary final write. 
+ */ +int swap_writepage(struct page *page, struct writeback_control *wbc) +{ + int ret = 0; + + if (try_to_free_swap(page)) { + unlock_page(page); + goto out; + } + if (frontswap_store(page) == 0) { + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + goto out; + } + ret = __swap_writepage(page, wbc, end_swap_bio_write); +out: + return ret; +} + +static sector_t swap_page_sector(struct page *page) +{ + return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9); +} + +int __swap_writepage(struct page *page, struct writeback_control *wbc, + void (*end_write_func)(struct bio *, int)) +{ + struct bio *bio; + int ret, rw = WRITE; + struct swap_info_struct *sis = page_swap_info(page); + + if (sis->flags & SWP_FILE) { + struct kiocb kiocb; + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct bio_vec bv = { + .bv_page = page, + .bv_len = PAGE_SIZE, + .bv_offset = 0 + }; + struct iov_iter from; + + iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); + init_sync_kiocb(&kiocb, swap_file); + kiocb.ki_pos = page_file_offset(page); + + set_page_writeback(page); + unlock_page(page); + ret = mapping->a_ops->direct_IO(&kiocb, &from, kiocb.ki_pos); + if (ret == PAGE_SIZE) { + count_vm_event(PSWPOUT); + ret = 0; + } else { + /* + * In the case of swap-over-nfs, this can be a + * temporary failure if the system has limited + * memory for allocating transmit buffers. + * Mark the page dirty and avoid + * rotate_reclaimable_page but rate-limit the + * messages but do not flag PageError like + * the normal direct-to-bio case as it could + * be temporary. 
+ */ + set_page_dirty(page); + ClearPageReclaim(page); + pr_err_ratelimited("Write error on dio swapfile (%Lu)\n", + page_file_offset(page)); + } + end_page_writeback(page); + return ret; + } + + ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); + if (!ret) { + count_vm_event(PSWPOUT); + return 0; + } + + ret = 0; + bio = get_swap_bio(GFP_NOIO, page, end_write_func); + if (bio == NULL) { + set_page_dirty(page); + unlock_page(page); + ret = -ENOMEM; + goto out; + } + if (wbc->sync_mode == WB_SYNC_ALL) + rw |= REQ_SYNC; + count_vm_event(PSWPOUT); + set_page_writeback(page); + unlock_page(page); + submit_bio(rw, bio); +out: + return ret; +} + +int swap_readpage(struct page *page) +{ + struct bio *bio; + int ret = 0; + struct swap_info_struct *sis = page_swap_info(page); + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageUptodate(page), page); + if (frontswap_load(page) == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } + + if (sis->flags & SWP_FILE) { + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + + ret = mapping->a_ops->readpage(swap_file, page); + if (!ret) + count_vm_event(PSWPIN); + return ret; + } + + ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); + if (!ret) { + count_vm_event(PSWPIN); + return 0; + } + + ret = 0; + bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); + if (bio == NULL) { + unlock_page(page); + ret = -ENOMEM; + goto out; + } + count_vm_event(PSWPIN); + submit_bio(READ, bio); +out: + return ret; +} + +int swap_set_page_dirty(struct page *page) +{ + struct swap_info_struct *sis = page_swap_info(page); + + if (sis->flags & SWP_FILE) { + struct address_space *mapping = sis->swap_file->f_mapping; + return mapping->a_ops->set_page_dirty(page); + } else { + return __set_page_dirty_no_writeback(page); + } +} diff --git a/kernel/mm/page_isolation.c b/kernel/mm/page_isolation.c new file mode 100644 index 000000000..303c90879 --- 
/dev/null +++ b/kernel/mm/page_isolation.c @@ -0,0 +1,314 @@ +/* + * linux/mm/page_isolation.c + */ + +#include +#include +#include +#include +#include +#include "internal.h" + +int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) +{ + struct zone *zone; + unsigned long flags, pfn; + struct memory_isolate_notify arg; + int notifier_ret; + int ret = -EBUSY; + + zone = page_zone(page); + + spin_lock_irqsave(&zone->lock, flags); + + pfn = page_to_pfn(page); + arg.start_pfn = pfn; + arg.nr_pages = pageblock_nr_pages; + arg.pages_found = 0; + + /* + * It may be possible to isolate a pageblock even if the + * migratetype is not MIGRATE_MOVABLE. The memory isolation + * notifier chain is used by balloon drivers to return the + * number of pages in a range that are held by the balloon + * driver to shrink memory. If all the pages are accounted for + * by balloons, are free, or on the LRU, isolation can continue. + * Later, for example, when memory hotplug notifier runs, these + * pages reported as "can be isolated" should be isolated(freed) + * by the balloon driver through the memory notifier chain. + */ + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); + notifier_ret = notifier_to_errno(notifier_ret); + if (notifier_ret) + goto out; + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. + * We just check MOVABLE pages. + */ + if (!has_unmovable_pages(zone, page, arg.pages_found, + skip_hwpoisoned_pages)) + ret = 0; + + /* + * immobile means "not-on-lru" pages. If immobile is larger than + * removable-by-driver pages reported by notifier, we'll fail. 
+ */ + +out: + if (!ret) { + unsigned long nr_pages; + int migratetype = get_pageblock_migratetype(page); + + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + zone->nr_isolate_pageblock++; + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + + __mod_zone_freepage_state(zone, -nr_pages, migratetype); + } + + spin_unlock_irqrestore(&zone->lock, flags); + if (!ret) + drain_all_pages(zone); + return ret; +} + +void unset_migratetype_isolate(struct page *page, unsigned migratetype) +{ + struct zone *zone; + unsigned long flags, nr_pages; + struct page *isolated_page = NULL; + unsigned int order; + unsigned long page_idx, buddy_idx; + struct page *buddy; + + zone = page_zone(page); + spin_lock_irqsave(&zone->lock, flags); + if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + goto out; + + /* + * Because freepage with more than pageblock_order on isolated + * pageblock is restricted to merge due to freepage counting problem, + * it is possible that there is free buddy page. + * move_freepages_block() doesn't care of merge so we need other + * approach in order to merge them. Isolation and free will make + * these pages to be merged. + */ + if (PageBuddy(page)) { + order = page_order(page); + if (order >= pageblock_order) { + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + + if (pfn_valid_within(page_to_pfn(buddy)) && + !is_migrate_isolate_page(buddy)) { + __isolate_free_page(page, order); + kernel_map_pages(page, (1 << order), 1); + set_page_refcounted(page); + isolated_page = page; + } + } + } + + /* + * If we isolate freepage with more than pageblock_order, there + * should be no freepage in the range, so we could avoid costly + * pageblock scanning for freepage moving. 
+ */ + if (!isolated_page) { + nr_pages = move_freepages_block(zone, page, migratetype); + __mod_zone_freepage_state(zone, nr_pages, migratetype); + } + set_pageblock_migratetype(page, migratetype); + zone->nr_isolate_pageblock--; +out: + spin_unlock_irqrestore(&zone->lock, flags); + if (isolated_page) + __free_pages(isolated_page, order); +} + +static inline struct page * +__first_valid_page(unsigned long pfn, unsigned long nr_pages) +{ + int i; + for (i = 0; i < nr_pages; i++) + if (pfn_valid_within(pfn + i)) + break; + if (unlikely(i == nr_pages)) + return NULL; + return pfn_to_page(pfn + i); +} + +/* + * start_isolate_page_range() -- make page-allocation-type of range of pages + * to be MIGRATE_ISOLATE. + * @start_pfn: The lower PFN of the range to be isolated. + * @end_pfn: The upper PFN of the range to be isolated. + * @migratetype: migrate type to set in error recovery. + * + * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in + * the range will never be allocated. Any free pages and pages freed in the + * future will not be allocated again. + * + * start_pfn/end_pfn must be aligned to pageblock_order. + * Returns 0 on success and -EBUSY if any part of range cannot be isolated. + */ +int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + unsigned migratetype, bool skip_hwpoisoned_pages) +{ + unsigned long pfn; + unsigned long undo_pfn; + struct page *page; + + BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); + BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + + for (pfn = start_pfn; + pfn < end_pfn; + pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (page && + set_migratetype_isolate(page, skip_hwpoisoned_pages)) { + undo_pfn = pfn; + goto undo; + } + } + return 0; +undo: + for (pfn = start_pfn; + pfn < undo_pfn; + pfn += pageblock_nr_pages) + unset_migratetype_isolate(pfn_to_page(pfn), migratetype); + + return -EBUSY; +} + +/* + * Make isolated pages available again. 
+ */ +int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + unsigned migratetype) +{ + unsigned long pfn; + struct page *page; + BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); + BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + for (pfn = start_pfn; + pfn < end_pfn; + pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + continue; + unset_migratetype_isolate(page, migratetype); + } + return 0; +} +/* + * Test whether all pages in the range are free (i.e. isolated) or not. + * All pages in [start_pfn...end_pfn) must be in the same zone. + * zone->lock must be held before calling this. + * + * Returns 1 if all pages in the range are isolated. + */ +static int +__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, + bool skip_hwpoisoned_pages) +{ + struct page *page; + + while (pfn < end_pfn) { + if (!pfn_valid_within(pfn)) { + pfn++; + continue; + } + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + /* + * If a race between isolation and allocation happens, + * some free pages could be in MIGRATE_MOVABLE list + * although pageblock's migration type of the page + * is MIGRATE_ISOLATE. Catch it and move the page into + * MIGRATE_ISOLATE list. + */ + if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { + struct page *end_page; + + end_page = page + (1 << page_order(page)) - 1; + move_freepages(page_zone(page), page, end_page, + MIGRATE_ISOLATE); + } + pfn += 1 << page_order(page); + } + else if (page_count(page) == 0 && + get_freepage_migratetype(page) == MIGRATE_ISOLATE) + pfn += 1; + else if (skip_hwpoisoned_pages && PageHWPoison(page)) { + /* + * The HWPoisoned page may not be in the buddy + * system, and page_count() is not 0. 
+ */ + pfn++; + continue; + } + else + break; + } + if (pfn < end_pfn) + return 0; + return 1; +} + +int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, + bool skip_hwpoisoned_pages) +{ + unsigned long pfn, flags; + struct page *page; + struct zone *zone; + int ret; + + /* + * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages + * are not aligned to pageblock_nr_pages. + * Then we just check migratetype first. + */ + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + break; + } + page = __first_valid_page(start_pfn, end_pfn - start_pfn); + if ((pfn < end_pfn) || !page) + return -EBUSY; + /* Check all pages are free or marked as ISOLATED */ + zone = page_zone(page); + spin_lock_irqsave(&zone->lock, flags); + ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn, + skip_hwpoisoned_pages); + spin_unlock_irqrestore(&zone->lock, flags); + return ret ? 0 : -EBUSY; +} + +struct page *alloc_migrate_target(struct page *page, unsigned long private, + int **resultp) +{ + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + /* + * TODO: allocate a destination hugepage from a nearest neighbor node, + * accordance with memory policy of the user process if possible. For + * now as a simple work-around, we use the next node for destination. 
+ */ + if (PageHuge(page)) { + nodemask_t src = nodemask_of_node(page_to_nid(page)); + nodemask_t dst; + nodes_complement(dst, src); + return alloc_huge_page_node(page_hstate(compound_head(page)), + next_node(page_to_nid(page), dst)); + } + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); +} diff --git a/kernel/mm/page_owner.c b/kernel/mm/page_owner.c new file mode 100644 index 000000000..0993f5f36 --- /dev/null +++ b/kernel/mm/page_owner.c @@ -0,0 +1,313 @@ +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static bool page_owner_disabled = true; +bool page_owner_inited __read_mostly; + +static void init_early_allocated_pages(void); + +static int early_page_owner_param(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + page_owner_disabled = false; + + return 0; +} +early_param("page_owner", early_page_owner_param); + +static bool need_page_owner(void) +{ + if (page_owner_disabled) + return false; + + return true; +} + +static void init_page_owner(void) +{ + if (page_owner_disabled) + return; + + page_owner_inited = true; + init_early_allocated_pages(); +} + +struct page_ext_operations page_owner_ops = { + .need = need_page_owner, + .init = init_page_owner, +}; + +void __reset_page_owner(struct page *page, unsigned int order) +{ + int i; + struct page_ext *page_ext; + + for (i = 0; i < (1 << order); i++) { + page_ext = lookup_page_ext(page + i); + __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); + } +} + +void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) +{ + struct page_ext *page_ext = lookup_page_ext(page); + struct stack_trace trace = { + .nr_entries = 0, + .max_entries = ARRAY_SIZE(page_ext->trace_entries), + .entries = &page_ext->trace_entries[0], + .skip = 3, + }; + + save_stack_trace(&trace); + + page_ext->order = order; + page_ext->gfp_mask = gfp_mask; + page_ext->nr_entries = trace.nr_entries; + + __set_bit(PAGE_EXT_OWNER, 
&page_ext->flags); +} + +static ssize_t +print_page_owner(char __user *buf, size_t count, unsigned long pfn, + struct page *page, struct page_ext *page_ext) +{ + int ret; + int pageblock_mt, page_mt; + char *kbuf; + struct stack_trace trace = { + .nr_entries = page_ext->nr_entries, + .entries = &page_ext->trace_entries[0], + }; + + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = snprintf(kbuf, count, + "Page allocated via order %u, mask 0x%x\n", + page_ext->order, page_ext->gfp_mask); + + if (ret >= count) + goto err; + + /* Print information relevant to grouping pages by mobility */ + pageblock_mt = get_pfnblock_migratetype(page, pfn); + page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); + ret += snprintf(kbuf + ret, count - ret, + "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", + pfn, + pfn >> pageblock_order, + pageblock_mt, + pageblock_mt != page_mt ? "Fallback" : " ", + PageLocked(page) ? "K" : " ", + PageError(page) ? "E" : " ", + PageReferenced(page) ? "R" : " ", + PageUptodate(page) ? "U" : " ", + PageDirty(page) ? "D" : " ", + PageLRU(page) ? "L" : " ", + PageActive(page) ? "A" : " ", + PageSlab(page) ? "S" : " ", + PageWriteback(page) ? "W" : " ", + PageCompound(page) ? "C" : " ", + PageSwapCache(page) ? "B" : " ", + PageMappedToDisk(page) ? 
"M" : " "); + + if (ret >= count) + goto err; + + ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); + if (ret >= count) + goto err; + + ret += snprintf(kbuf + ret, count - ret, "\n"); + if (ret >= count) + goto err; + + if (copy_to_user(buf, kbuf, ret)) + ret = -EFAULT; + + kfree(kbuf); + return ret; + +err: + kfree(kbuf); + return -ENOMEM; +} + +static ssize_t +read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + unsigned long pfn; + struct page *page; + struct page_ext *page_ext; + + if (!page_owner_inited) + return -EINVAL; + + page = NULL; + pfn = min_low_pfn + *ppos; + + /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ + while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) + pfn++; + + drain_all_pages(NULL); + + /* Find an allocated page */ + for (; pfn < max_pfn; pfn++) { + /* + * If the new page is in a new MAX_ORDER_NR_PAGES area, + * validate the area as existing, skip it if not + */ + if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) { + pfn += MAX_ORDER_NR_PAGES - 1; + continue; + } + + /* Check for holes within a MAX_ORDER area */ + if (!pfn_valid_within(pfn)) + continue; + + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + unsigned long freepage_order = page_order_unsafe(page); + + if (freepage_order < MAX_ORDER) + pfn += (1UL << freepage_order) - 1; + continue; + } + + page_ext = lookup_page_ext(page); + + /* + * Some pages could be missed by concurrent allocation or free, + * because we don't hold the zone lock. 
+		 */
+		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+			continue;
+
+		/* Record the next PFN to read in the file offset */
+		*ppos = (pfn - min_low_pfn) + 1;
+
+		return print_page_owner(buf, count, pfn, page, page_ext);
+	}
+
+	return 0;
+}
+/*
+ * init_pages_in_zone - tag in-use pages of @zone that have no owner record
+ * @pgdat: node the zone belongs to (only used for the log message)
+ * @zone: zone to scan
+ *
+ * Every PFN in [zone_start_pfn, zone_start_pfn + spanned_pages) is visited.
+ * Holes, free (buddy) pages, reserved pages and pages whose PAGE_EXT_OWNER
+ * bit is already set are skipped; each remaining page is handed to
+ * set_page_owner(page, 0, 0) and counted.  The count is logged at the end.
+ */
+static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
+{
+	struct page *page;
+	struct page_ext *page_ext;
+	unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
+	unsigned long end_pfn = pfn + zone->spanned_pages;
+	unsigned long count = 0;
+
+	/* Scan block by block. First and last block may be incomplete */
+	pfn = zone->zone_start_pfn;
+
+	/*
+	 * Walk the zone in pageblock_nr_pages steps.  If a page block spans
+	 * a zone boundary, it will be double counted between zones. This does
+	 * not matter as the mixed block count will still be correct
+	 *
+	 * NOTE(review): no mixed-block counter is maintained in this
+	 * function; the sentence above looks inherited from other
+	 * pageblock-walking code - confirm.
+	 */
+	for (; pfn < end_pfn; ) {
+		if (!pfn_valid(pfn)) {
+			/* jump to the next MAX_ORDER_NR_PAGES boundary */
+			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+			continue;
+		}
+
+		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+		block_end_pfn = min(block_end_pfn, end_pfn);
+
+		page = pfn_to_page(pfn);
+
+		for (; pfn < block_end_pfn; pfn++) {
+			if (!pfn_valid_within(pfn))
+				continue;
+
+			page = pfn_to_page(pfn);
+
+			/*
+			 * We are safe to check buddy flag and order, because
+			 * this is init stage and only single thread runs.
+			 */
+			if (PageBuddy(page)) {
+				/* skip the whole free block in one step */
+				pfn += (1UL << page_order(page)) - 1;
+				continue;
+			}
+
+			if (PageReserved(page))
+				continue;
+
+			page_ext = lookup_page_ext(page);
+
+			/* Maybe overlapping zone: page was already tagged */
+			if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+				continue;
+
+			/* Found early allocated page */
+			set_page_owner(page, 0, 0);
+			count++;
+		}
+	}
+
+	pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
+		pgdat->node_id, zone->name, count);
+}
+/*
+ * init_zones_in_node - run init_pages_in_zone() on each populated zone
+ * @pgdat: node whose zones are scanned
+ *
+ * zone->lock is held with interrupts disabled for the whole scan of a zone.
+ */
+static void init_zones_in_node(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	struct zone *node_zones = pgdat->node_zones;
+	unsigned long flags;
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+		if (!populated_zone(zone))
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		init_pages_in_zone(pgdat, zone);
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+}
+/*
+ * init_early_allocated_pages - scan all online nodes for untagged pages
+ *
+ * Calls drain_all_pages(NULL) first; presumably this returns pages parked
+ * on per-cpu lists to the buddy allocator so that they are seen as free by
+ * the scan - TODO confirm.
+ */
+static void init_early_allocated_pages(void)
+{
+	pg_data_t *pgdat;
+
+	drain_all_pages(NULL);
+	for_each_online_pgdat(pgdat)
+		init_zones_in_node(pgdat);
+}
+
+static const struct file_operations proc_page_owner_operations = {
+	.read		= read_page_owner,	/* one record per read() call */
+};
+/*
+ * pageowner_init - create the "page_owner" debugfs file
+ *
+ * Does nothing (and returns 0) when page_owner_inited is false.  Otherwise
+ * creates a file readable by its owner only (S_IRUSR) with a NULL parent
+ * dentry and returns the PTR_ERR() of the result if it is an error pointer.
+ */
+static int __init pageowner_init(void)
+{
+	struct dentry *dentry;
+
+	if (!page_owner_inited) {
+		pr_info("page_owner is disabled\n");
+		return 0;
+	}
+
+	dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
+			NULL, &proc_page_owner_operations);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	return 0;
+}
+module_init(pageowner_init)
diff --git a/kernel/mm/pagewalk.c b/kernel/mm/pagewalk.c
new file mode 100644
index 000000000..29f2f8b85
--- /dev/null
+++ b/kernel/mm/pagewalk.c
@@ -0,0 +1,304 @@
+#include
+#include
+#include
+#include
+/*
+ * walk_pte_range - call ->pte_entry() for every PTE covering [@addr, @end)
+ * @pmd: page middle directory entry the PTEs hang off
+ * @addr: first address to visit
+ * @end: address at which to stop (exclusive)
+ * @walk: walk state; walk->pte_entry is called unconditionally here
+ *
+ * Advances one PAGE_SIZE at a time and stops at the first non-zero return
+ * value of the callback, which is passed back to the caller.
+ */
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	pte_t *pte;
+	int err = 0;
+
+	pte = pte_offset_map(pmd, addr);
+	for (;;) {
+		err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
+		if (err)
+		       break;
+		addr += PAGE_SIZE;
+		if (addr == end)
+			break;
+		pte++;
+	}
+
+	pte_unmap(pte);
+	return err;
+}
+/*
+ * walk_pmd_range - walk the PMD entries covering [@addr, @end) under @pud
+ *
+ * For an empty PMD, or when there is no current vma, ->pte_hole() is called
+ * if set.  Otherwise ->pmd_entry() is called if set, and, when a
+ * ->pte_entry() callback exists, the range is walked at PTE level after
+ * split_huge_page_pmd_mm().  The first non-zero callback result ends the
+ * walk and is returned.
+ */
+static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	int err = 0;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+again:
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(*pmd) || !walk->vma) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, walk);
+			if (err)
+				break;
+			continue;	/* still advances via the while clause */
+		}
+		/*
+		 * This implies that each ->pmd_entry() handler
+		 * needs to know about pmd_trans_huge() pmds
+		 */
+		if (walk->pmd_entry)
+			err = walk->pmd_entry(pmd, addr, next, walk);
+		if (err)
+			break;
+
+		/*
+		 * Check this here so we only break down trans_huge
+		 * pages when we _need_ to
+		 */
+		if (!walk->pte_entry)
+			continue;
+
+		split_huge_page_pmd_mm(walk->mm, addr, pmd);
+		if (pmd_trans_unstable(pmd))
+			goto again;	/* re-examine the same pmd, no advance */
+		err = walk_pte_range(pmd, addr, next, walk);
+		if (err)
+			break;
+	} while (pmd++, addr = next, addr != end);
+
+	return err;
+}
+/*
+ * walk_pud_range - walk the PUD entries covering [@addr, @end) under @pgd
+ *
+ * Empty or bad PUDs are reported through ->pte_hole() if set; other entries
+ * are descended into only when a ->pmd_entry() or ->pte_entry() callback
+ * exists.  Returns the first non-zero result, or 0.
+ */
+static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	pud_t *pud;
+	unsigned long next;
+	int err = 0;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, walk);
+			if (err)
+				break;
+			continue;
+		}
+		if (walk->pmd_entry || walk->pte_entry)
+			err = walk_pmd_range(pud, addr, next, walk);
+		if (err)
+			break;
+	} while (pud++, addr = next, addr != end);
+
+	return err;
+}
+/*
+ * walk_pgd_range - walk the PGD entries of walk->mm covering [@addr, @end)
+ *
+ * Top level of the page table walk; same hole/descend rules as
+ * walk_pud_range().  Returns the first non-zero result, or 0.
+ */
+static int walk_pgd_range(unsigned long addr, unsigned long end,
+			  struct mm_walk *walk)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	int err = 0;
+
+	pgd = pgd_offset(walk->mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd)) {
+			if (walk->pte_hole)
+				err = walk->pte_hole(addr, next, walk);
+			if (err)
+				break;
+			continue;
+		}
+		if (walk->pmd_entry || walk->pte_entry)
+			err = walk_pud_range(pgd, addr, next, walk);
+		if (err)
+			break;
+	} while (pgd++, addr =
next, addr != end);
+
+	return err;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
+				       unsigned long end)
+{
+	/* first huge page boundary above @addr, clamped to @end below */
+	unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
+	return boundary < end ? boundary : end;
+}
+/*
+ * walk_hugetlb_range - walk [@addr, @end) of a hugetlb vma
+ *
+ * Steps one huge page at a time and calls ->hugetlb_entry() for every huge
+ * page that has a page table entry.  Uses walk->vma, which must be set.
+ * Returns the first non-zero callback result, or 0.
+ */
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
+			      struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct hstate *h = hstate_vma(vma);
+	unsigned long next;
+	unsigned long hmask = huge_page_mask(h);
+	pte_t *pte;
+	int err = 0;
+
+	do {
+		next = hugetlb_entry_end(h, addr, end);
+		pte = huge_pte_offset(walk->mm, addr & hmask);
+		if (pte && walk->hugetlb_entry)
+			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
+		if (err)
+			break;
+	} while (addr = next, addr != end);
+
+	return err;
+}
+
+#else /* CONFIG_HUGETLB_PAGE */
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
+			      struct mm_walk *walk)
+{
+	return 0;	/* stub used when CONFIG_HUGETLB_PAGE is not set */
+}
+
+#endif /* CONFIG_HUGETLB_PAGE */
+
+/*
+ * Decide whether we really walk over the current vma on [@start, @end)
+ * or skip it via the returned value. Return 0 if we do walk over the
+ * current vma, and return 1 if we skip the vma. Negative values mean
+ * error, where we abort the current walk.
+ *
+ * If the caller supplied ->test_walk(), its return value is used as is.
+ */
+static int walk_page_test(unsigned long start, unsigned long end,
+			struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+
+	if (walk->test_walk)
+		return walk->test_walk(start, end, walk);
+
+	/*
+	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
+	 * range, so we don't walk over it as we do for normal vmas. However,
+	 * Some callers are interested in handling hole range and they don't
+	 * want to just ignore any single address range. Such users certainly
+	 * define their ->pte_hole() callbacks, so let's delegate them to handle
+	 * vma(VM_PFNMAP).
+	 */
+	if (vma->vm_flags & VM_PFNMAP) {
+		int err = 1;
+		if (walk->pte_hole)
+			err = walk->pte_hole(start, end, walk);
+		return err ? err : 1;	/* 0 from ->pte_hole() still means skip */
+	}
+	return 0;
+}
+/*
+ * __walk_page_range - walk [@start, @end) with walk->vma already set up
+ *
+ * Hugetlb vmas go through walk_hugetlb_range() (only when a
+ * ->hugetlb_entry() callback exists); everything else, including the
+ * no-vma case, goes through the regular page table walk.
+ */
+static int __walk_page_range(unsigned long start, unsigned long end,
+			struct mm_walk *walk)
+{
+	int err = 0;
+	struct vm_area_struct *vma = walk->vma;
+
+	if (vma && is_vm_hugetlb_page(vma)) {
+		if (walk->hugetlb_entry)
+			err = walk_hugetlb_range(start, end, walk);
+	} else
+		err = walk_pgd_range(start, end, walk);
+
+	return err;
+}
+
+/**
+ * walk_page_range - walk page table with caller specific callbacks
+ *
+ * Recursively walk the page table tree of the process represented by @walk->mm
+ * within the virtual address range [@start, @end). During walking, we can do
+ * some caller-specific works for each entry, by setting up pmd_entry(),
+ * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
+ * callbacks, the associated entries/pages are just ignored.
+ * The return values of these callbacks are commonly defined like below:
+ *  - 0  : succeeded to handle the current entry, and if you don't reach the
+ *         end address yet, continue to walk.
+ *  - >0 : succeeded to handle the current entry, and return to the caller
+ *         with caller specific value.
+ *  - <0 : failed to handle the current entry, and return to the caller
+ *         with error code.
+ *
+ * Before starting to walk page table, some callers want to check whether
+ * they really want to walk over the current vma, typically by checking
+ * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
+ * purpose.
+ *
+ * struct mm_walk keeps current values of some common data like vma and pmd,
+ * which are useful for the access from callbacks. If you want to pass some
+ * caller-specific data to callbacks, @walk->private should be helpful.
+ *
+ * Locking:
+ *   Callers of walk_page_range() and walk_page_vma() should hold
+ *   @walk->mm->mmap_sem, because these functions traverse the vma list
+ *   and/or access the vma's data.
+ *
+ * Return:
+ *   -EINVAL if @start >= @end or @walk->mm is NULL; otherwise the first
+ *   non-zero value returned by a callback, or 0 if the whole range was
+ *   walked.  A positive result of walk_page_test() only skips the vma and
+ *   is never returned.
+ */
+int walk_page_range(unsigned long start, unsigned long end,
+		    struct mm_walk *walk)
+{
+	int err = 0;
+	unsigned long next;
+	struct vm_area_struct *vma;
+
+	if (start >= end)
+		return -EINVAL;
+
+	if (!walk->mm)
+		return -EINVAL;
+
+	VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
+
+	vma = find_vma(walk->mm, start);
+	do {
+		if (!vma) { /* after the last vma */
+			walk->vma = NULL;
+			next = end;
+		} else if (start < vma->vm_start) { /* outside vma */
+			walk->vma = NULL;
+			next = min(end, vma->vm_start);
+		} else { /* inside vma */
+			walk->vma = vma;
+			next = min(end, vma->vm_end);
+			vma = vma->vm_next;
+
+			err = walk_page_test(start, next, walk);
+			if (err > 0) {
+				/*
+				 * positive return values are purely for
+				 * controlling the pagewalk, so should never
+				 * be passed to the callers.
+				 */
+				err = 0;
+				continue;
+			}
+			if (err < 0)
+				break;
+		}
+		/* holes are only walked when someone wants to hear about them */
+		if (walk->vma || walk->pte_hole)
+			err = __walk_page_range(start, next, walk);
+		if (err)
+			break;
+	} while (start = next, start < end);
+	return err;
+}
+/*
+ * walk_page_vma - walk the whole address range of a single vma
+ * @vma: vma to walk, must not be NULL
+ * @walk: walk state and callbacks; walk->mm must be set
+ *
+ * Returns -EINVAL if walk->mm is NULL, 0 if walk_page_test() asks to skip
+ * the vma, a negative walk_page_test() result as is, and otherwise the
+ * result of walking [vma->vm_start, vma->vm_end).
+ */
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+{
+	int err;
+
+	if (!walk->mm)
+		return -EINVAL;
+
+	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+	VM_BUG_ON(!vma);
+	walk->vma = vma;
+	err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+	if (err > 0)
+		return 0;
+	if (err < 0)
+		return err;
+	return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+}
diff --git a/kernel/mm/percpu-km.c b/kernel/mm/percpu-km.c
new file mode 100644
index 000000000..10e3d0b8a
--- /dev/null
+++ b/kernel/mm/percpu-km.c
@@ -0,0 +1,110 @@
+/*
+ * mm/percpu-km.c - kernel memory based chunk allocation
+ *
+ * Copyright (C) 2010		SUSE Linux Products GmbH
+ * Copyright (C) 2010		Tejun Heo
+ *
+ * This file is released under the GPLv2.
+ *
+ * Chunks are allocated as a contiguous kernel memory using gfp
+ * allocation.  This is to be used on nommu architectures.
+ *
+ * To use percpu-km,
+ *
+ * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig.
+ *
+ * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined.  It's
+ *   not compatible with PER_CPU_KM.  EMBED_FIRST_CHUNK should work
+ *   fine.
+ *
+ * - NUMA is not supported.  When setting up the first chunk,
+ *   @cpu_distance_fn should be NULL or report all CPUs to be nearer
+ *   than or at LOCAL_DISTANCE.
+ *
+ * - It's best if the chunk size is power of two multiple of
+ *   PAGE_SIZE.  Because each chunk is allocated as a contiguous
+ *   kernel memory block using alloc_pages(), memory will be wasted if
+ *   chunk size is not aligned.  percpu-km code will whine about it.
+ */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
+#error "contiguous percpu allocation is incompatible with paged first chunk"
+#endif
+
+#include
+/*
+ * pcpu_populate_chunk - nothing to do for kernel-memory chunks
+ *
+ * pcpu_create_chunk() below allocates all pages up front and marks the
+ * whole chunk populated, so this always succeeds.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+			       int page_start, int page_end)
+{
+	return 0;
+}
+/* pcpu_depopulate_chunk - intentionally empty counterpart of the above */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+				  int page_start, int page_end)
+{
+	/* nada */
+}
+/*
+ * pcpu_create_chunk - allocate a chunk backed by one contiguous page block
+ *
+ * The block is obtained with alloc_pages() at order
+ * order_base_2(pcpu_group_sizes[0] >> PAGE_SHIFT).  Each page is linked
+ * back to the chunk, chunk->data keeps the first struct page, and the chunk
+ * is marked fully populated under pcpu_lock.
+ *
+ * Returns the new chunk, or NULL if either allocation fails.
+ */
+static struct pcpu_chunk *pcpu_create_chunk(void)
+{
+	const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
+	struct pcpu_chunk *chunk;
+	struct page *pages;
+	int i;
+
+	chunk = pcpu_alloc_chunk();
+	if (!chunk)
+		return NULL;
+
+	pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+	if (!pages) {
+		pcpu_free_chunk(chunk);
+		return NULL;
+	}
+
+	for (i = 0; i < nr_pages; i++)
+		pcpu_set_page_chunk(nth_page(pages, i), chunk);
+
+	chunk->data = pages;
+	chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
+
+	spin_lock_irq(&pcpu_lock);
+	pcpu_chunk_populated(chunk, 0, nr_pages);
+	spin_unlock_irq(&pcpu_lock);
+
+	return chunk;
+}
+/*
+ * pcpu_destroy_chunk - free the page block (if any) and the chunk itself
+ *
+ * The order is recomputed the same way as in pcpu_create_chunk().
+ */
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+	const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
+
+	if (chunk &&
chunk->data)
+		__free_pages(chunk->data, order_base_2(nr_pages));
+	pcpu_free_chunk(chunk);
+}
+/* pcpu_addr_to_page - map a percpu address to its page via virt_to_page() */
+static struct page *pcpu_addr_to_page(void *addr)
+{
+	return virt_to_page(addr);
+}
+/*
+ * pcpu_verify_alloc_info - check that @ai can be served by percpu-km
+ *
+ * Returns -EINVAL unless all units are in exactly one group.  Also warns
+ * when the unit area is not a power-of-two number of pages, since the
+ * difference to the next power of two is wasted in every chunk.
+ */
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+	size_t nr_pages, alloc_pages;
+
+	/* all units must be in a single group */
+	if (ai->nr_groups != 1) {
+		printk(KERN_CRIT "percpu: can't handle more than one groups\n");
+		return -EINVAL;
+	}
+
+	nr_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT;
+	alloc_pages = roundup_pow_of_two(nr_pages);
+
+	if (alloc_pages > nr_pages)
+		printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n",
+		       alloc_pages - nr_pages);
+
+	return 0;
+}
diff --git a/kernel/mm/percpu-vm.c b/kernel/mm/percpu-vm.c
new file mode 100644
index 000000000..538998a13
--- /dev/null
+++ b/kernel/mm/percpu-vm.c
@@ -0,0 +1,366 @@
+/*
+ * mm/percpu-vm.c - vmalloc area based chunk allocation
+ *
+ * Copyright (C) 2010		SUSE Linux Products GmbH
+ * Copyright (C) 2010		Tejun Heo
+ *
+ * This file is released under the GPLv2.
+ *
+ * Chunks are mapped into vmalloc areas and populated page by page.
+ * This is the default chunk allocator.
+ */
+/*
+ * pcpu_chunk_page - look up the page backing (@cpu, @page_idx) of @chunk
+ *
+ * Resolves the chunk address through vmalloc_to_page(); warns if called on
+ * an immutable chunk.
+ */
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+				    unsigned int cpu, int page_idx)
+{
+	/* must not be used on pre-mapped chunk */
+	WARN_ON(chunk->immutable);
+
+	return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
+}
+
+/**
+ * pcpu_get_pages - get temp pages array
+ * @chunk_alloc: chunk of interest (not used by the current implementation)
+ *
+ * Returns pointer to array of pointers to struct page which can be indexed
+ * with pcpu_page_idx().  Note that there is only one array and accesses
+ * should be serialized by pcpu_alloc_mutex.  The array is allocated on
+ * first use and sized for pcpu_nr_units * pcpu_unit_pages entries.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success, NULL if the first-use allocation
+ * fails.
+ */
+static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc)
+{
+	static struct page **pages;
+	size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+
+	lockdep_assert_held(&pcpu_alloc_mutex);
+
+	if (!pages)
+		pages = pcpu_mem_zalloc(pages_size);
+	return pages;
+}
+
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * Free pages [@page_start,@page_end) in @pages for all units.
+ * The pages were allocated for @chunk.  NULL entries are skipped.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+			    struct page **pages, int page_start, int page_end)
+{
+	unsigned int cpu;
+	int i;
+
+	for_each_possible_cpu(cpu) {
+		for (i = page_start; i < page_end; i++) {
+			struct page *page = pages[pcpu_page_idx(cpu, i)];
+
+			if (page)
+				__free_page(page);
+		}
+	}
+}
+
+/**
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk.  Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
+ */
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+			    struct page **pages, int page_start, int page_end)
+{
+	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+	unsigned int cpu, tcpu;
+	int i;
+
+	for_each_possible_cpu(cpu) {
+		for (i = page_start; i < page_end; i++) {
+			struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+
+			*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
+			if (!*pagep)
+				goto err;
+		}
+	}
+	return 0;
+
+err:
+	/* undo the partial allocation of the cpu that failed ... */
+	while (--i >= page_start)
+		__free_page(pages[pcpu_page_idx(cpu, i)]);
+
+	/* ... and the complete ranges of every cpu visited before it */
+	for_each_possible_cpu(tcpu) {
+		if (tcpu == cpu)
+			break;
+		for (i = page_start; i < page_end; i++)
+			__free_page(pages[pcpu_page_idx(tcpu, i)]);
+	}
+	return -ENOMEM;
+}
+
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped.  Flush cache.  As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu.  This could be an overkill but is more
+ * scalable.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+				 int page_start, int page_end)
+{
+	flush_cache_vunmap(
+		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
+}
+/* __pcpu_unmap_pages - unmap @nr_pages pages at @addr without any flushing */
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+	unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+}
+
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array which can be used to pass information to free
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * Corresponding elements in @pages were cleared by the caller and can
+ * be used to carry information to pcpu_free_pages() which will be
+ * called after all unmaps are finished.  The caller should call
+ * proper pre/post flush functions.
+ */
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+			     struct page **pages, int page_start, int page_end)
+{
+	unsigned int cpu;
+	int i;
+
+	for_each_possible_cpu(cpu) {
+		for (i = page_start; i < page_end; i++) {
+			struct page *page;
+
+			/* record the page before its mapping goes away */
+			page = pcpu_chunk_page(chunk, cpu, i);
+			WARN_ON(!page);
+			pages[pcpu_page_idx(cpu, i)] = page;
+		}
+		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+				   page_end - page_start);
+	}
+}
+
+/**
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
+ * TLB for the regions.  This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+				      int page_start, int page_end)
+{
+	flush_tlb_kernel_range(
+		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
+}
+/*
+ * __pcpu_map_pages - map @nr_pages pages from @pages at @addr, no flushing
+ *
+ * Thin wrapper around map_kernel_range_noflush() with PAGE_KERNEL
+ * protection; its return value is passed through.
+ */
+static int __pcpu_map_pages(unsigned long addr, struct page **pages,
+			    int nr_pages)
+{
+	return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
+					PAGE_KERNEL, pages);
+}
+
+/**
+ * pcpu_map_pages - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * For each cpu, map pages [@page_start,@page_end) into @chunk.
The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting up whatever is necessary for
+ * reverse lookup (addr -> chunk).
+ */
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+			  struct page **pages, int page_start, int page_end)
+{
+	unsigned int cpu, tcpu;
+	int i, err;
+
+	for_each_possible_cpu(cpu) {
+		err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+				       &pages[pcpu_page_idx(cpu, page_start)],
+				       page_end - page_start);
+		if (err < 0)
+			goto err;
+
+		/* reverse lookup: let every mapped page point at its chunk */
+		for (i = page_start; i < page_end; i++)
+			pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+					    chunk);
+	}
+	return 0;
+err:
+	/* unmap what was mapped for the cpus visited before the failure */
+	for_each_possible_cpu(tcpu) {
+		if (tcpu == cpu)
+			break;
+		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
+				   page_end - page_start);
+	}
+	pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
+	return err;
+}
+
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
+ * cache.
+ *
+ * As with pcpu_pre_unmap_flush(), the cache flush is done at once
+ * for the whole region.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+				int page_start, int page_end)
+{
+	flush_cache_vmap(
+		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
+		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
+}
+
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * For each cpu, populate and map pages [@page_start,@page_end) into
+ * @chunk.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+			       int page_start, int page_end)
+{
+	struct page **pages;
+
+	pages = pcpu_get_pages(chunk);
+	if (!pages)
+		return -ENOMEM;
+
+	/* on failure pcpu_alloc_pages() has already freed what it got */
+	if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
+		return -ENOMEM;
+
+	/* on failure pcpu_map_pages() has unmapped; only the pages remain */
+	if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
+		pcpu_free_pages(chunk, pages, page_start, page_end);
+		return -ENOMEM;
+	}
+	pcpu_post_map_flush(chunk, page_start, page_end);
+
+	return 0;
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+				  int page_start, int page_end)
+{
+	struct page **pages;
+
+	/*
+	 * If control reaches here, there must have been at least one
+	 * successful population attempt so the temp pages array must
+	 * be available now.
+	 */
+	pages = pcpu_get_pages(chunk);
+	BUG_ON(!pages);
+
+	/* unmap and free */
+	pcpu_pre_unmap_flush(chunk, page_start, page_end);
+
+	pcpu_unmap_pages(chunk, pages, page_start, page_end);
+
+	/* no need to flush tlb, vmalloc will handle it lazily */
+
+	pcpu_free_pages(chunk, pages, page_start, page_end);
+}
+/*
+ * pcpu_create_chunk - allocate a chunk and reserve its vm areas
+ *
+ * Address space is obtained with pcpu_get_vm_areas() from the group
+ * offsets and sizes; no pages are allocated here.  chunk->data keeps the
+ * vm_struct array and base_addr is derived from the first area.
+ *
+ * Returns the new chunk, or NULL if either step fails.
+ */
+static struct pcpu_chunk *pcpu_create_chunk(void)
+{
+	struct pcpu_chunk *chunk;
+	struct vm_struct **vms;
+
+	chunk = pcpu_alloc_chunk();
+	if (!chunk)
+		return NULL;
+
+	vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
+				pcpu_nr_groups, pcpu_atom_size);
+	if (!vms) {
+		pcpu_free_chunk(chunk);
+		return NULL;
+	}
+
+	chunk->data = vms;
+	chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
+	return chunk;
+}
+/* pcpu_destroy_chunk - release the vm areas (if any) and the chunk itself */
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+	if (chunk && chunk->data)
+		pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
+	pcpu_free_chunk(chunk);
+}
+/* pcpu_addr_to_page - map a percpu address to its page via vmalloc_to_page() */
+static struct page *pcpu_addr_to_page(void *addr)
+{
+	return vmalloc_to_page(addr);
+}
+/* pcpu_verify_alloc_info - the vmalloc based allocator accepts any @ai */
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+	/* no extra restriction */
+	return 0;
+}
diff --git a/kernel/mm/percpu.c b/kernel/mm/percpu.c
new file mode 100644
index 000000000..2dd74487a
--- /dev/null
+++ b/kernel/mm/percpu.c
@@ -0,0 +1,2295 @@
+/*
+ * mm/percpu.c - percpu memory allocator
+ *
+ * Copyright (C) 2009		SUSE Linux Products GmbH
+ * Copyright (C) 2009		Tejun Heo
+ *
+ * This file is released under the GPLv2.
+ *
+ * This is percpu allocator which can handle both static and dynamic
+ * areas.  Percpu areas are allocated in chunks.  Each chunk is
+ * consisted of boot-time determined number of units and the first
+ * chunk is used for static percpu variables in the kernel image
+ * (special boot time alloc/init handling necessary as these areas
+ * need to be brought up before allocation services are running).
+ * Unit grows as necessary and all units grow or shrink in unison.
+ * When a chunk is filled up, another chunk is allocated.
+ *
+ *  c0                           c1                         c2
+ *  -------------------          -------------------        ------------
+ * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
+ *  -------------------  ......  -------------------  ....  ------------
+ *
+ * Allocation is done in offset-size areas of single unit space.  Ie,
+ * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
+ * c1:u1, c1:u2 and c1:u3.  On UMA, units correspond directly to
+ * cpus.  On NUMA, the mapping can be non-linear and even sparse.
+ * Percpu access can be done by configuring percpu base registers
+ * according to cpu to unit mapping and pcpu_unit_size.
+ *
+ * There are usually many small percpu allocations, many of them being
+ * as small as 4 bytes.  The allocator organizes chunks into lists
+ * according to free size and tries to allocate from the fullest one.
+ * Each chunk keeps the maximum contiguous area size hint which is
+ * guaranteed to be equal to or larger than the maximum contiguous
+ * area in the chunk.  This helps the allocator not to iterate the
+ * chunk maps unnecessarily.
+ *
+ * Allocation state in each chunk is kept using an array of integers
+ * on chunk->map.  Each entry records the byte offset at which an area
+ * starts, with bit 0 used as a flag that the area is in use
+ * (pcpu_count_occupied_pages() masks the flag off with ~1 and treats
+ * a clear bit as free).  Allocation inside a chunk is done
+ * by scanning this map sequentially and serving the first matching
+ * entry.  This is mostly copied from the percpu_modalloc() allocator.
+ * Chunks can be determined from the address using the index field
+ * in the page struct. The index field contains a pointer to the chunk.
+ *
+ * To use this allocator, arch code should do the following:
+ *
+ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
+ *   regular address to percpu pointer and back if they need to be
+ *   different from the default
+ *
+ * - use pcpu_setup_first_chunk() during percpu area initialization to
+ *   setup the first chunk containing the kernel static percpu area
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
+#define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
+#define PCPU_ATOMIC_MAP_MARGIN_LOW	32
+#define PCPU_ATOMIC_MAP_MARGIN_HIGH	64
+#define PCPU_EMPTY_POP_PAGES_LOW	2
+#define PCPU_EMPTY_POP_PAGES_HIGH	4
+
+#ifdef CONFIG_SMP
+/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
+#ifndef __addr_to_pcpu_ptr
+#define __addr_to_pcpu_ptr(addr)					\
+	(void __percpu *)((unsigned long)(addr) -			\
+			  (unsigned long)pcpu_base_addr	+		\
+			  (unsigned long)__per_cpu_start)
+#endif
+#ifndef __pcpu_ptr_to_addr
+#define __pcpu_ptr_to_addr(ptr)						\
+	(void __force *)((unsigned long)(ptr) +				\
+			 (unsigned long)pcpu_base_addr -		\
+			 (unsigned long)__per_cpu_start)
+#endif
+#else	/* CONFIG_SMP */
+/* on UP, it's always identity mapped */
+#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
+#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
+#endif	/* CONFIG_SMP */
+/*
+ * struct pcpu_chunk - bookkeeping for one percpu chunk
+ *
+ * The trailing @populated bitmap is a flexible array member, so the
+ * structure has to be allocated with room for it.
+ */
+struct pcpu_chunk {
+	struct list_head	list;		/* linked to pcpu_slot lists */
+	int			free_size;	/* free bytes in the chunk */
+	int			contig_hint;	/* max contiguous size hint */
+	void			*base_addr;	/* base address of this chunk */
+
+	int			map_used;	/* # of map entries used before the sentry */
+	int			map_alloc;	/* # of map entries allocated */
+	int			*map;		/* allocation map: area offsets, bit 0 is a flag */
+	struct work_struct	map_extend_work;/* async ->map[] extension */
+
+	void			*data;		/* chunk data, set by pcpu_create_chunk() */
+	int			first_free;	/* no free below this */
+	bool			immutable;	/* no [de]population allowed */
+	int			nr_populated;	/* # of populated pages */
+	unsigned long		populated[];	/* populated bitmap */
+};
+
+static int pcpu_unit_pages __read_mostly;	/* pages per unit, see pcpu_page_idx() */
+static int pcpu_unit_size __read_mostly;
+static int pcpu_nr_units __read_mostly;
+static int pcpu_atom_size __read_mostly;
+static int pcpu_nr_slots __read_mostly;		/* last slot is pcpu_nr_slots - 1 */
+static size_t pcpu_chunk_struct_size __read_mostly;
+
+/* cpus with the lowest and highest unit addresses */
+static unsigned int pcpu_low_unit_cpu __read_mostly;
+static unsigned int pcpu_high_unit_cpu __read_mostly;
+
+/* the address of the first chunk which starts with the kernel static area */
+void *pcpu_base_addr __read_mostly;
+EXPORT_SYMBOL_GPL(pcpu_base_addr);
+
+static const int *pcpu_unit_map __read_mostly;		/* cpu -> unit */
+const unsigned long *pcpu_unit_offsets __read_mostly;	/* cpu -> unit offset */
+
+/* group information, used for vm allocation */
+static int pcpu_nr_groups __read_mostly;
+static const unsigned long *pcpu_group_offsets __read_mostly;
+static const size_t *pcpu_group_sizes __read_mostly;
+
+/*
+ * The first chunk which always exists.  Note that unlike other
+ * chunks, this one can be allocated and mapped in several different
+ * ways and thus often doesn't live in the vmalloc area.
+ */
+static struct pcpu_chunk *pcpu_first_chunk;
+
+/*
+ * Optional reserved chunk.  This chunk reserves part of the first
+ * chunk and serves it for reserved allocations.  The amount of
+ * reserved offset is in pcpu_reserved_chunk_limit.  When reserved
+ * area doesn't exist, the following variables contain NULL and 0
+ * respectively.
+ */
+static struct pcpu_chunk *pcpu_reserved_chunk;
+static int pcpu_reserved_chunk_limit;
+
+static DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
+static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop */
+
+static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+
+/*
+ * The number of empty populated pages, protected by pcpu_lock.  The
+ * reserved chunk doesn't contribute to the count.
+ */
+static int pcpu_nr_empty_pop_pages;
+
+/*
+ * Balance work is used to populate or destroy chunks asynchronously.  We
+ * try to keep the number of populated free pages between
+ * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
+ * empty chunk.
+ */
+static void pcpu_balance_workfn(struct work_struct *work);
+static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
+static bool pcpu_async_enabled __read_mostly;
+static bool pcpu_atomic_alloc_failed;
+/* Queue the balance work, but only once async operation has been enabled. */
+static void pcpu_schedule_balance_work(void)
+{
+	if (pcpu_async_enabled)
+		schedule_work(&pcpu_balance_work);
+}
+/* True if @addr lies in the first unit-sized span of the first chunk. */
+static bool pcpu_addr_in_first_chunk(void *addr)
+{
+	void *first_start = pcpu_first_chunk->base_addr;
+
+	return addr >= first_start && addr < first_start + pcpu_unit_size;
+}
+/*
+ * True if @addr lies within pcpu_reserved_chunk_limit bytes of the start of
+ * the first chunk.
+ */
+static bool pcpu_addr_in_reserved_chunk(void *addr)
+{
+	void *first_start = pcpu_first_chunk->base_addr;
+
+	return addr >= first_start &&
+		addr < first_start + pcpu_reserved_chunk_limit;
+}
+/*
+ * Map a size in bytes to a slot index: fls(size) - PCPU_SLOT_BASE_SHIFT + 2,
+ * but never less than 1.
+ */
+static int __pcpu_size_to_slot(int size)
+{
+	int highbit = fls(size);	/* size is in bytes */
+	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
+}
+/*
+ * Like __pcpu_size_to_slot(), except that a size of exactly one unit always
+ * maps to the last slot.
+ */
+static int pcpu_size_to_slot(int size)
+{
+	if (size == pcpu_unit_size)
+		return pcpu_nr_slots - 1;
+	return __pcpu_size_to_slot(size);
+}
+/*
+ * Slot @chunk belongs on according to its free size.  Chunks that cannot
+ * hold even one int (by free size or by contiguous hint) go to slot 0.
+ */
+static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
+{
+	if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
+		return 0;
+
+	return pcpu_size_to_slot(chunk->free_size);
+}
+
+/* set the pointer to a chunk in a page struct */
+static void
pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) +{ + page->index = (unsigned long)pcpu; +} + +/* obtain pointer to a chunk from a page struct */ +static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) +{ + return (struct pcpu_chunk *)page->index; +} + +static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) +{ + return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; +} + +static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + + (page_idx << PAGE_SHIFT); +} + +static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk, + int *rs, int *re, int end) +{ + *rs = find_next_zero_bit(chunk->populated, end, *rs); + *re = find_next_bit(chunk->populated, end, *rs + 1); +} + +static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, + int *rs, int *re, int end) +{ + *rs = find_next_bit(chunk->populated, end, *rs); + *re = find_next_zero_bit(chunk->populated, end, *rs + 1); +} + +/* + * (Un)populated page region iterators. Iterate over (un)populated + * page regions between @start and @end in @chunk. @rs and @re should + * be integer variables and will be set to start and end page index of + * the current region. + */ +#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) + +#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) + +/** + * pcpu_mem_zalloc - allocate memory + * @size: bytes to allocate + * + * Allocate @size bytes. If @size is smaller than PAGE_SIZE, + * kzalloc() is used; otherwise, vzalloc() is used. The returned + * memory is always zeroed. 
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void *pcpu_mem_zalloc(size_t size)
+{
+	/* warn once and bail out if the slab allocator isn't available yet */
+	if (WARN_ON_ONCE(!slab_is_available()))
+		return NULL;
+
+	return size > PAGE_SIZE ? vzalloc(size) : kzalloc(size, GFP_KERNEL);
+}
+
+/**
+ * pcpu_mem_free - release memory obtained from pcpu_mem_zalloc()
+ * @ptr: memory to free
+ * @size: size the area was allocated with
+ *
+ * @size selects between kfree() and vfree() using the same threshold
+ * pcpu_mem_zalloc() used when allocating.
+ */
+static void pcpu_mem_free(void *ptr, size_t size)
+{
+	if (size > PAGE_SIZE) {
+		vfree(ptr);
+		return;
+	}
+
+	kfree(ptr);
+}
+
+/**
+ * pcpu_count_occupied_pages - count the number of pages an area occupies
+ * @chunk: chunk of interest
+ * @i: index of the area in question
+ *
+ * Count the pages covered by @chunk's @i'th area.  A page which is only
+ * partially covered at either end is counted iff the neighbouring area
+ * is free and covers the remainder of that page.
+ */
+static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
+{
+	const int *map = chunk->map;
+	int first = map[i] & ~1;
+	int last = map[i + 1] & ~1;
+
+	/* extend down to the page start if the free predecessor covers it */
+	if (i > 0 && !PAGE_ALIGNED(first)) {
+		int before = map[i - 1];
+		int page_floor = round_down(first, PAGE_SIZE);
+
+		if (!(before & 1) && before <= page_floor)
+			first = page_floor;
+	}
+
+	/* extend up to the page end if the free successor covers it */
+	if (i + 1 < chunk->map_used && !PAGE_ALIGNED(last)) {
+		int after = map[i + 1];
+		int after_end = map[i + 2] & ~1;
+		int page_ceil = round_up(last, PAGE_SIZE);
+
+		if (!(after & 1) && after_end >= page_ceil)
+			last = page_ceil;
+	}
+
+	return max_t(int, PFN_DOWN(last) - PFN_UP(first), 0);
+}
+
+/**
+ * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
+ * @chunk: chunk of interest
+ * @oslot: the previous slot it was on
+ *
+ * This function is called after an allocation or free changed @chunk.
+ * New slot according to the changed state is determined and @chunk is
+ * moved to the slot.  Note that the reserved chunk is never put on
+ * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
+{
+	int nslot = pcpu_chunk_slot(chunk);
+	struct list_head *head;
+
+	/* the reserved chunk stays off the slots; same slot needs no move */
+	if (chunk == pcpu_reserved_chunk || oslot == nslot)
+		return;
+
+	head = &pcpu_slot[nslot];
+	if (oslot < nslot)
+		list_move(&chunk->list, head);
+	else
+		list_move_tail(&chunk->list, head);
+}
+
+/**
+ * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * @chunk: chunk of interest
+ * @is_atomic: the allocation context
+ *
+ * Decide whether the area map of @chunk has to grow.  For an atomic
+ * caller only the room needed for one new allocation is required, but
+ * the chunk's map extension work is scheduled when the spare room is
+ * running low.  A non-atomic caller aims for a larger margin.  Together
+ * this makes it likely that the map has enough room to accommodate
+ * atomic allocations, which can't extend maps directly.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ *
+ * RETURNS:
+ * New target map allocation length if extension is necessary, 0
+ * otherwise.
+ */
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
+{
+	int margin = is_atomic ? 3 : PCPU_ATOMIC_MAP_MARGIN_HIGH;
+	int needed, new_alloc;
+
+	if (is_atomic && pcpu_async_enabled &&
+	    chunk->map_alloc < chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW)
+		schedule_work(&chunk->map_extend_work);
+
+	needed = chunk->map_used + margin;
+	if (chunk->map_alloc >= needed)
+		return 0;
+
+	/* double from the default length until the target fits */
+	for (new_alloc = PCPU_DFL_MAP_ALLOC; new_alloc < needed; new_alloc *= 2)
+		;
+
+	return new_alloc;
+}
+
+/**
+ * pcpu_extend_area_map - extend area map of a chunk
+ * @chunk: chunk of interest
+ * @new_alloc: new target allocation length of the area map
+ *
+ * Extend area map of @chunk to have @new_alloc entries.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+{
+	int *stale = NULL, *fresh;
+	size_t stale_size = 0, fresh_size = new_alloc * sizeof(fresh[0]);
+	unsigned long flags;
+
+	/* allocate outside of pcpu_lock */
+	fresh = pcpu_mem_zalloc(fresh_size);
+	if (!fresh)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	/* switch to the new map unless the map is already large enough */
+	if (new_alloc > chunk->map_alloc) {
+		stale = chunk->map;
+		stale_size = chunk->map_alloc * sizeof(chunk->map[0]);
+
+		memcpy(fresh, stale, stale_size);
+
+		chunk->map = fresh;
+		chunk->map_alloc = new_alloc;
+		fresh = NULL;
+	}
+
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	/*
+	 * pcpu_mem_free() might end up calling vfree() which uses
+	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+	 * Whichever of the two buffers ended up unused is released here.
+	 */
+	pcpu_mem_free(stale, stale_size);
+	pcpu_mem_free(fresh, fresh_size);
+
+	return 0;
+}
+
+/* work item: extend the area map of the chunk embedding @work if needed */
+static void pcpu_map_extend_workfn(struct work_struct *work)
+{
+	struct pcpu_chunk *chunk;
+	int target;
+
+	chunk = container_of(work, struct pcpu_chunk, map_extend_work);
+
+	spin_lock_irq(&pcpu_lock);
+	target = pcpu_need_to_extend(chunk, false);
+	spin_unlock_irq(&pcpu_lock);
+
+	if (!target)
+		return;
+
+	pcpu_extend_area_map(chunk, target);
+}
+
+/**
+ * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
+ * @chunk: chunk the candidate area belongs to
+ * @off: the offset to the start of the candidate area
+ * @this_size: the size of the candidate area
+ * @size: the size of the target allocation
+ * @align: the alignment of the target allocation
+ * @pop_only: only allocate from already populated region
+ *
+ * We're trying to allocate @size bytes aligned at @align.  @chunk's area
+ * at @off sized @this_size is a candidate.  This function determines
+ * whether the target allocation fits in the candidate area and returns the
+ * number of bytes to pad after @off.  If the target area doesn't fit, -1
+ * is returned.
+ *
+ * If @pop_only is %true, this function only considers the already
+ * populated part of the candidate area.
+ */
+static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
+			    int size, int align, bool pop_only)
+{
+	int area_end_pfn = PFN_UP(off + this_size);
+	int try_off = off;
+
+	for (;;) {
+		int pad = ALIGN(try_off, align) - off;
+		int alloc_end_pfn, unpop_start, unpop_end;
+
+		/* padding plus the allocation must fit in the candidate */
+		if (pad + size > this_size)
+			return -1;
+
+		if (!pop_only)
+			return pad;
+
+		/*
+		 * Look for the first unpopulated page at or after the
+		 * start of the allocation.  If it lies beyond the end of
+		 * the allocation, every page is populated and we're done;
+		 * otherwise, retry from the end of that unpopulated run.
+		 */
+		alloc_end_pfn = PFN_UP(pad + off + size);
+		unpop_start = PFN_DOWN(pad + off);
+
+		pcpu_next_unpop(chunk, &unpop_start, &unpop_end, area_end_pfn);
+		if (unpop_start >= alloc_end_pfn)
+			return pad;
+
+		try_off = unpop_end * PAGE_SIZE;
+	}
+}
+
+/**
+ * pcpu_alloc_area - allocate area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @size: wanted size in bytes
+ * @align: wanted align
+ * @pop_only: allocate only from the populated area
+ * @occ_pages_p: out param for the number of pages the area occupies
+ *
+ * Try to allocate @size bytes area aligned at @align from @chunk.
+ * Note that this function only allocates the offset.  It doesn't
+ * populate or map the area.
+ *
+ * @chunk->map must have at least two free slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ *
+ * RETURNS:
+ * Allocated offset in @chunk on success, -1 if no matching area is
+ * found.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
+			   bool pop_only, int *occ_pages_p)
+{
+	int oslot = pcpu_chunk_slot(chunk);	/* slot before any change */
+	int max_contig = 0;	/* largest free area seen while scanning */
+	int i, off;
+	bool seen_free = false;	/* first_free already updated in this scan */
+	int *p;
+
+	for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
+		int head, tail;
+		int this_size;
+
+		off = *p;
+		if (off & 1)	/* low bit set: area is in use */
+			continue;
+
+		this_size = (p[1] & ~1) - off;	/* up to the next entry's offset */
+
+		head = pcpu_fit_in_area(chunk, off, this_size, size, align,
+					pop_only);
+		if (head < 0) {
+			/* doesn't fit here; remember it as a free area */
+			if (!seen_free) {
+				chunk->first_free = i;
+				seen_free = true;
+			}
+			max_contig = max(this_size, max_contig);
+			continue;
+		}
+
+		/*
+		 * If head is small or the previous block is free,
+		 * merge'em.  Note that 'small' is defined as smaller
+		 * than sizeof(int), which is very small but isn't too
+		 * uncommon for percpu allocations.
+		 *
+		 * Merging moves this entry's start forward by head, which
+		 * hands those bytes to the previous entry.  If that entry
+		 * is in use the bytes stop being free, hence the
+		 * free_size adjustment.
+		 */
+		if (head && (head < sizeof(int) || !(p[-1] & 1))) {
+			*p = off += head;
+			if (p[-1] & 1)
+				chunk->free_size -= head;
+			else
+				max_contig = max(*p - p[-1], max_contig);
+			this_size -= head;
+			head = 0;
+		}
+
+		/* if tail is small, just keep it around */
+		tail = this_size - head - size;
+		if (tail < sizeof(int)) {
+			tail = 0;
+			size = this_size - head;
+		}
+
+		/* split if warranted */
+		if (head || tail) {
+			int nr_extra = !!head + !!tail;	/* map entries to insert */
+
+			/* insert new subblocks */
+			memmove(p + nr_extra + 1, p + 1,
+				sizeof(chunk->map[0]) * (chunk->map_used - i));
+			chunk->map_used += nr_extra;
+
+			if (head) {
+				/* the head stays free and precedes the allocation */
+				if (!seen_free) {
+					chunk->first_free = i;
+					seen_free = true;
+				}
+				*++p = off += head;
+				++i;
+				max_contig = max(head, max_contig);
+			}
+			if (tail) {
+				p[1] = off + size;	/* free tail starts after the allocation */
+				max_contig = max(tail, max_contig);
+			}
+		}
+
+		if (!seen_free)
+			chunk->first_free = i + 1;
+
+		/* update hint and mark allocated */
+		if (i + 1 == chunk->map_used)
+			chunk->contig_hint = max_contig; /* fully scanned */
+		else
+			chunk->contig_hint = max(chunk->contig_hint,
+						 max_contig);
+
+		chunk->free_size -= size;
+		*p |= 1;
+
+		*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
+		pcpu_chunk_relocate(chunk, oslot);
+		return off;
+	}
+
+	chunk->contig_hint = max_contig;	/* fully scanned */
+	pcpu_chunk_relocate(chunk, oslot);
+
+	/* tell the upper layer that this chunk has no matching area */
+	return -1;
+}
+
+/**
+ * pcpu_free_area - free area to a pcpu_chunk
+ * @chunk: chunk of interest
+ * @freeme: offset of area to free
+ * @occ_pages_p: out param for the number of pages the area occupies
+ *
+ * Free area starting from @freeme to @chunk.  Note that this function
+ * only modifies the allocation map.  It doesn't depopulate or unmap
+ * the area.
+ *
+ * The map entry for @freeme is located by binary search and must exist
+ * with its in-use bit set; anything else triggers BUG_ON().  The freed
+ * area is merged with a free neighbour on either side.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
+			   int *occ_pages_p)
+{
+	int oslot = pcpu_chunk_slot(chunk);
+	int off = 0;
+	unsigned i, j;
+	int to_free = 0;	/* number of map entries removed by merging */
+	int *p;
+
+	freeme |= 1;	/* we are searching for the <offset, in-use bit set> entry */
+
+	i = 0;
+	j = chunk->map_used;
+	while (i != j) {	/* binary search over map[0..map_used) */
+		unsigned k = (i + j) / 2;
+		off = chunk->map[k];
+		if (off < freeme)
+			i = k + 1;
+		else if (off > freeme)
+			j = k;
+		else
+			i = j = k;
+	}
+	BUG_ON(off != freeme);
+
+	if (i < chunk->first_free)
+		chunk->first_free = i;
+
+	p = chunk->map + i;
+	*p = off &= ~1;	/* clear the in-use bit */
+	chunk->free_size += (p[1] & ~1) - off;
+
+	*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
+
+	/* merge with next? */
+	if (!(p[1] & 1))
+		to_free++;
+	/* merge with previous? */
+	if (i > 0 && !(p[-1] & 1)) {
+		to_free++;
+		i--;
+		p--;
+	}
+	if (to_free) {
+		chunk->map_used -= to_free;
+		memmove(p + 1, p + 1 + to_free,
+			(chunk->map_used - i) * sizeof(chunk->map[0]));
+	}
+
+	chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint);
+	pcpu_chunk_relocate(chunk, oslot);
+}
+
+static struct pcpu_chunk *pcpu_alloc_chunk(void)
+{
+	struct pcpu_chunk *chunk;
+
+	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
+	if (!chunk)
+		return NULL;
+
+	chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
+						sizeof(chunk->map[0]));
+	if (!chunk->map) {
+		pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+		return NULL;
+	}
+
+	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+	chunk->map[0] = 0;	/* one free area starting at offset 0 */
+	chunk->map[1] = pcpu_unit_size | 1;	/* end entry, in-use bit set */
+	chunk->map_used = 1;
+
+	INIT_LIST_HEAD(&chunk->list);
+	INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
+	chunk->free_size = pcpu_unit_size;	/* the whole unit is free */
+	chunk->contig_hint = pcpu_unit_size;
+
+	return chunk;
+}
+
+static void pcpu_free_chunk(struct pcpu_chunk *chunk)
+{
+	if (!chunk)	/* NULL is accepted and ignored */
+		return;
+	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
+	pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+}
+
+/**
+ * pcpu_chunk_populated - post-population bookkeeping
+ * @chunk: pcpu_chunk which got populated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
+ * the bookkeeping information accordingly.  Must be called after each
+ * successful population.
+ */
+static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
+				 int page_start, int page_end)
+{
+	int count = page_end - page_start;
+
+	lockdep_assert_held(&pcpu_lock);
+
+	/* the newly populated pages are accounted as empty populated pages */
+	pcpu_nr_empty_pop_pages += count;
+	chunk->nr_populated += count;
+	bitmap_set(chunk->populated, page_start, count);
+}
+
+/**
+ * pcpu_chunk_depopulated - post-depopulation bookkeeping
+ * @chunk: pcpu_chunk which got depopulated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) are no longer populated in @chunk.
+ * Clear them in the populated bitmap and subtract them from both the
+ * chunk's and the global populated page counts.  Must be called after
+ * each successful depopulation, with pcpu_lock held.
+ */
+static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
+				   int page_start, int page_end)
+{
+	int count = page_end - page_start;
+
+	lockdep_assert_held(&pcpu_lock);
+
+	pcpu_nr_empty_pop_pages -= count;
+	chunk->nr_populated -= count;
+	bitmap_clear(chunk->populated, page_start, count);
+}
+
+/*
+ * Chunk management implementation.
+ *
+ * To allow different implementations, chunk alloc/free and
+ * [de]population are implemented in a separate file which is pulled
+ * into this file and compiled together.  The following functions
+ * should be implemented.
+ *
+ * pcpu_populate_chunk		- populate the specified range of a chunk
+ * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
+ * pcpu_create_chunk		- create a new chunk
+ * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
+ * pcpu_addr_to_page		- translate address to its struct page
+ * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static struct pcpu_chunk *pcpu_create_chunk(void);
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
+static struct page *pcpu_addr_to_page(void *addr);
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
+
+#ifdef CONFIG_NEED_PER_CPU_KM
+#include "percpu-km.c"
+#else
+#include "percpu-vm.c"
+#endif
+
+/**
+ * pcpu_chunk_addr_search - determine chunk containing specified address
+ * @addr: address for which the chunk needs to be determined.
+ *
+ * RETURNS:
+ * The address of the found chunk.
+ */
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
+{
+	/* is it in the first chunk? */
+	if (pcpu_addr_in_first_chunk(addr)) {
+		/* is it in the reserved area? */
+		if (pcpu_addr_in_reserved_chunk(addr))
+			return pcpu_reserved_chunk;
+		return pcpu_first_chunk;
+	}
+
+	/*
+	 * The address is relative to unit0 which might be unused and
+	 * thus unmapped.  Offset the address to the unit space of the
+	 * current processor before looking it up in the vmalloc
+	 * space.  Note that any possible cpu id can be used here, so
+	 * there's no need to worry about preemption or cpu hotplug.
+	 */
+	addr += pcpu_unit_offsets[raw_smp_processor_id()];
+	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
+}
+
+/**
+ * pcpu_alloc - the percpu allocator
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ * @reserved: allocate from the reserved chunk if available
+ * @gfp: allocation flags
+ *
+ * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
+ * contain %GFP_KERNEL, the allocation is atomic: area maps are not
+ * extended, no chunk is created and only already populated pages are
+ * used.  The returned area is zeroed for every possible cpu.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
+				 gfp_t gfp)
+{
+	static int warn_limit = 10;
+	struct pcpu_chunk *chunk;
+	const char *err;
+	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
+	int occ_pages = 0;
+	int slot, off, new_alloc, cpu, ret;
+	unsigned long flags;
+	void __percpu *ptr;
+
+	/*
+	 * We want the lowest bit of offset available for in-use/free
+	 * indicator, so force >= 16bit alignment and make size even.
+	 */
+	if (unlikely(align < 2))
+		align = 2;
+
+	size = ALIGN(size, 2);
+
+	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
+		WARN(true, "illegal size (%zu) or align (%zu) for "
+		     "percpu allocation\n", size, align);
+		return NULL;
+	}
+
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	/* serve reserved allocations from the reserved chunk if available */
+	if (reserved && pcpu_reserved_chunk) {
+		chunk = pcpu_reserved_chunk;
+
+		if (size > chunk->contig_hint) {
+			err = "alloc from reserved chunk failed";
+			goto fail_unlock;
+		}
+
+		while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
+			spin_unlock_irqrestore(&pcpu_lock, flags);
+			if (is_atomic ||
+			    pcpu_extend_area_map(chunk, new_alloc) < 0) {
+				err = "failed to extend area map of reserved chunk";
+				goto fail;
+			}
+			spin_lock_irqsave(&pcpu_lock, flags);
+		}
+
+		off = pcpu_alloc_area(chunk, size, align, is_atomic,
+				      &occ_pages);
+		if (off >= 0)
+			goto area_found;
+
+		err = "alloc from reserved chunk failed";
+		goto fail_unlock;
+	}
+
+restart:
+	/* search through normal chunks */
+	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			if (size > chunk->contig_hint)
+				continue;
+
+			new_alloc = pcpu_need_to_extend(chunk, is_atomic);
+			if (new_alloc) {
+				/* atomic callers can't extend, try the next chunk */
+				if (is_atomic)
+					continue;
+				spin_unlock_irqrestore(&pcpu_lock, flags);
+				if (pcpu_extend_area_map(chunk,
+							 new_alloc) < 0) {
+					err = "failed to extend area map";
+					goto fail;
+				}
+				spin_lock_irqsave(&pcpu_lock, flags);
+				/*
+				 * pcpu_lock has been dropped, need to
+				 * restart pcpu_slot list walking.
+				 */
+				goto restart;
+			}
+
+			off = pcpu_alloc_area(chunk, size, align, is_atomic,
+					      &occ_pages);
+			if (off >= 0)
+				goto area_found;
+		}
+	}
+
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	/*
+	 * No space left.  Create a new chunk.  We don't want multiple
+	 * tasks to create chunks simultaneously.  Serialize and create iff
+	 * there's still no empty chunk after grabbing the mutex.
+	 */
+	if (is_atomic)
+		goto fail;
+
+	mutex_lock(&pcpu_alloc_mutex);
+
+	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
+		chunk = pcpu_create_chunk();
+		if (!chunk) {
+			mutex_unlock(&pcpu_alloc_mutex);
+			err = "failed to allocate new chunk";
+			goto fail;
+		}
+
+		spin_lock_irqsave(&pcpu_lock, flags);
+		pcpu_chunk_relocate(chunk, -1);
+	} else {
+		spin_lock_irqsave(&pcpu_lock, flags);
+	}
+
+	mutex_unlock(&pcpu_alloc_mutex);
+	goto restart;
+
+area_found:
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	/* populate if not all pages are already there */
+	if (!is_atomic) {
+		int page_start, page_end, rs, re;
+
+		mutex_lock(&pcpu_alloc_mutex);
+
+		page_start = PFN_DOWN(off);
+		page_end = PFN_UP(off + size);
+
+		pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+			WARN_ON(chunk->immutable);
+
+			ret = pcpu_populate_chunk(chunk, rs, re);
+
+			spin_lock_irqsave(&pcpu_lock, flags);
+			if (ret) {
+				/* give the area back; fail_unlock drops pcpu_lock */
+				mutex_unlock(&pcpu_alloc_mutex);
+				pcpu_free_area(chunk, off, &occ_pages);
+				err = "failed to populate";
+				goto fail_unlock;
+			}
+			pcpu_chunk_populated(chunk, rs, re);
+			spin_unlock_irqrestore(&pcpu_lock, flags);
+		}
+
+		mutex_unlock(&pcpu_alloc_mutex);
+	}
+
+	/*
+	 * pcpu_nr_empty_pop_pages is protected by pcpu_lock, which was
+	 * dropped at area_found.  Retake it for the update so that it
+	 * can't race with free_percpu() or the population bookkeeping.
+	 */
+	if (chunk != pcpu_reserved_chunk) {
+		spin_lock_irqsave(&pcpu_lock, flags);
+		pcpu_nr_empty_pop_pages -= occ_pages;
+		spin_unlock_irqrestore(&pcpu_lock, flags);
+	}
+
+	/* unlocked read; only decides whether to kick the balance work */
+	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
+		pcpu_schedule_balance_work();
+
+	/* clear the areas and return address relative to base address */
+	for_each_possible_cpu(cpu)
+		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
+
+	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
+	kmemleak_alloc_percpu(ptr, size, gfp);
+	return ptr;
+
+fail_unlock:
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+fail:
+	/* err is only consumed here, and only for non-atomic failures */
+	if (!is_atomic && warn_limit) {
+		pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+			   size, align, is_atomic, err);
+		dump_stack();
+		if (!--warn_limit)
+			pr_info("PERCPU: limit reached, disable warning\n");
+	}
+	if (is_atomic) {
+		/* see the flag handling in pcpu_balance_workfn() */
+		pcpu_atomic_alloc_failed = true;
+		pcpu_schedule_balance_work();
+	}
+	return NULL;
+}
+
+/**
+ * __alloc_percpu_gfp - allocate dynamic percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ * @gfp: allocation flags
+ *
+ * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
+ * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
+ * be called from any context but is a lot more likely to fail.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
+{
+	return pcpu_alloc(size, align, false, gfp);
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
+
+/**
+ * __alloc_percpu - allocate dynamic percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
+ */
+void __percpu *__alloc_percpu(size_t size, size_t align)
+{
+	return pcpu_alloc(size, align, false, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu);
+
+/**
+ * __alloc_reserved_percpu - allocate reserved percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate zero-filled percpu area of @size bytes aligned at @align
+ * from reserved percpu area if arch has set it up; otherwise,
+ * allocation is served from the same dynamic area.  Might sleep.
+ * Might trigger writeouts.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
+{
+	return pcpu_alloc(size, align, true, GFP_KERNEL);
+}
+
+/**
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
+ * @work: unused
+ *
+ * Reclaim all fully free chunks except for the first one, then populate
+ * pages until the number of empty populated pages reaches
+ * PCPU_EMPTY_POP_PAGES_HIGH, creating a new chunk when the existing ones
+ * run out of unpopulated pages.  The whole function runs under
+ * pcpu_alloc_mutex; pcpu_lock is only taken around list walks and
+ * bookkeeping updates.
+ */
+static void pcpu_balance_workfn(struct work_struct *work)
+{
+	LIST_HEAD(to_free);
+	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
+	struct pcpu_chunk *chunk, *next;
+	int slot, nr_to_pop, ret;
+
+	/*
+	 * There's no reason to keep around multiple unused chunks and VM
+	 * areas can be scarce.  Destroy all free chunks except for one.
+	 */
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
+
+	list_for_each_entry_safe(chunk, next, free_head, list) {
+		WARN_ON(chunk->immutable);
+
+		/* spare the first one */
+		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
+			continue;
+
+		list_move(&chunk->list, &to_free);
+	}
+
+	spin_unlock_irq(&pcpu_lock);
+
+	/* depopulate and destroy the collected chunks outside of pcpu_lock */
+	list_for_each_entry_safe(chunk, next, &to_free, list) {
+		int rs, re;
+
+		pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+			pcpu_depopulate_chunk(chunk, rs, re);
+			spin_lock_irq(&pcpu_lock);
+			pcpu_chunk_depopulated(chunk, rs, re);
+			spin_unlock_irq(&pcpu_lock);
+		}
+		pcpu_destroy_chunk(chunk);
+	}
+
+	/*
+	 * Ensure there are certain number of free populated pages for
+	 * atomic allocs.  Fill up from the most packed so that atomic
+	 * allocs don't increase fragmentation.  If atomic allocation
+	 * failed previously, always populate the maximum amount.  This
+	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
+	 * failing indefinitely; however, large atomic allocs are not
+	 * something we support properly and can be highly unreliable and
+	 * inefficient.
+	 */
+retry_pop:
+	if (pcpu_atomic_alloc_failed) {
+		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+		/* best effort anyway, don't worry about synchronization */
+		pcpu_atomic_alloc_failed = false;
+	} else {
+		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
+				  pcpu_nr_empty_pop_pages,
+				  0, PCPU_EMPTY_POP_PAGES_HIGH);
+	}
+
+	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+		int nr_unpop = 0, rs, re;
+
+		if (!nr_to_pop)
+			break;
+
+		spin_lock_irq(&pcpu_lock);
+		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			nr_unpop = pcpu_unit_pages - chunk->nr_populated;
+			if (nr_unpop)	/* first chunk in this slot with unpopulated pages */
+				break;
+		}
+		spin_unlock_irq(&pcpu_lock);
+
+		if (!nr_unpop)
+			continue;
+
+		/* @chunk can't go away while pcpu_alloc_mutex is held */
+		pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+			int nr = min(re - rs, nr_to_pop);
+
+			ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+			if (!ret) {
+				nr_to_pop -= nr;
+				spin_lock_irq(&pcpu_lock);
+				pcpu_chunk_populated(chunk, rs, rs + nr);
+				spin_unlock_irq(&pcpu_lock);
+			} else {
+				nr_to_pop = 0;	/* population failed, stop trying */
+			}
+
+			if (!nr_to_pop)
+				break;
+		}
+	}
+
+	if (nr_to_pop) {
+		/* ran out of chunks to populate, create a new one and retry */
+		chunk = pcpu_create_chunk();
+		if (chunk) {
+			spin_lock_irq(&pcpu_lock);
+			pcpu_chunk_relocate(chunk, -1);
+			spin_unlock_irq(&pcpu_lock);
+			goto retry_pop;
+		}
+	}
+
+	mutex_unlock(&pcpu_alloc_mutex);
+}
+
+/**
+ * free_percpu - free percpu area
+ * @ptr: pointer to area to free
+ *
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
+ */
+void free_percpu(void __percpu *ptr)
+{
+	struct pcpu_chunk *chunk, *other;
+	unsigned long flags;
+	int occ_pages;
+	void *addr;
+
+	if (!ptr)
+		return;
+
+	kmemleak_free_percpu(ptr);
+	addr = __pcpu_ptr_to_addr(ptr);
+
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	chunk = pcpu_chunk_addr_search(addr);
+	pcpu_free_area(chunk, addr - chunk->base_addr, &occ_pages);
+
+	/* pages of the reserved chunk are not part of the global count */
+	if (chunk != pcpu_reserved_chunk)
+		pcpu_nr_empty_pop_pages += occ_pages;
+
+	if (chunk->free_size != pcpu_unit_size)
+		goto out_unlock;
+
+	/*
+	 * @chunk is now fully free.  If the last slot holds any other
+	 * chunk there is more than one fully free chunk, so wake up the
+	 * grim reaper.
+	 */
+	list_for_each_entry(other, &pcpu_slot[pcpu_nr_slots - 1], list) {
+		if (other == chunk)
+			continue;
+		pcpu_schedule_balance_work();
+		break;
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+}
+EXPORT_SYMBOL_GPL(free_percpu);
+
+/**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Check @addr against the in-kernel static percpu area of every possible
+ * cpu.  Module static percpu areas are not considered.  For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+#ifdef CONFIG_SMP
+	const size_t static_size = __per_cpu_end - __per_cpu_start;
+	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+	void *va = (void *)addr;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		void *lo = per_cpu_ptr(base, cpu);
+		void *hi = lo + static_size;
+
+		if (va < lo || va >= hi)
+			continue;
+		return true;
+	}
+#endif
+	/* on UP, can't distinguish from other static vars, always false */
+	return false;
+}
+
+/**
+ * per_cpu_ptr_to_phys - convert translated percpu address to physical address
+ * @addr: the address to be converted to physical address
+ *
+ * Given @addr which is dereferenceable address obtained via one of
+ * percpu access macros, this function translates it into its physical
+ * address.
The caller is responsible for ensuring @addr stays valid
+ * until this function finishes.
+ *
+ * percpu allocator has special setup for the first chunk, which currently
+ * supports either embedding in linear address space or vmalloc mapping,
+ * and, from the second one, the backing allocator (currently either vm or
+ * km) provides translation.
+ *
+ * The addr can be translated simply without checking if it falls into the
+ * first chunk. But the current code reflects better how percpu allocator
+ * actually works, and the verification can discover both bugs in percpu
+ * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
+ * code.
+ *
+ * RETURNS:
+ * The physical address for @addr.
+ */
+phys_addr_t per_cpu_ptr_to_phys(void *addr)
+{
+	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+	unsigned long uaddr = (unsigned long)addr;
+	unsigned long first_low, first_high;
+	bool in_first_chunk = false;
+	unsigned int cpu;
+
+	/*
+	 * The range check against the lowest and highest unit addresses
+	 * isn't strictly necessary but speeds up lookups of addresses
+	 * which aren't in the first chunk.
+	 */
+	first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
+	first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
+				     pcpu_unit_pages);
+	if (uaddr < first_low || uaddr >= first_high)
+		goto translate;
+
+	for_each_possible_cpu(cpu) {
+		void *start = per_cpu_ptr(base, cpu);
+
+		if (addr < start || addr >= start + pcpu_unit_size)
+			continue;
+		in_first_chunk = true;
+		break;
+	}
+
+translate:
+	/* beyond the first chunk the backing allocator does the lookup */
+	if (!in_first_chunk)
+		return page_to_phys(pcpu_addr_to_page(addr)) +
+		       offset_in_page(addr);
+
+	/* the first chunk is either vmalloc mapped or directly translatable */
+	if (is_vmalloc_addr(addr))
+		return page_to_phys(vmalloc_to_page(addr)) +
+		       offset_in_page(addr);
+
+	return __pa(addr);
+}
+
+/**
+ * pcpu_alloc_alloc_info - allocate percpu allocation info
+ * @nr_groups: the number of groups
+ * @nr_units: the number of units
+ *
+ * Allocate ai which is large enough for @nr_groups groups containing
+ * @nr_units units.
The returned ai's groups[0].cpu_map points to the
+ * cpu_map array which is long enough for @nr_units and filled with
+ * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
+ * pointer of other groups.
+ *
+ * RETURNS:
+ * Pointer to the allocated pcpu_alloc_info on success, NULL on
+ * failure.
+ */
+struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
+						      int nr_units)
+{
+	struct pcpu_alloc_info *ai;
+	size_t hdr_size, base_size, ai_size;
+	unsigned long alloc_size;
+	void *mem;
+	int unit = 0;
+
+	/* header and group array first, then the aligned cpu_map array */
+	hdr_size = sizeof(*ai) + nr_groups * sizeof(ai->groups[0]);
+	base_size = ALIGN(hdr_size, __alignof__(ai->groups[0].cpu_map[0]));
+	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
+	alloc_size = PFN_ALIGN(ai_size);
+
+	mem = memblock_virt_alloc_nopanic(alloc_size, 0);
+	if (!mem)
+		return NULL;
+
+	ai = mem;
+	ai->groups[0].cpu_map = mem + base_size;
+
+	/* NR_CPUS marks a unit which has no cpu assigned */
+	while (unit < nr_units)
+		ai->groups[0].cpu_map[unit++] = NR_CPUS;
+
+	ai->nr_groups = nr_groups;
+	ai->__ai_size = alloc_size;
+
+	return ai;
+}
+
+/**
+ * pcpu_free_alloc_info - free percpu allocation info
+ * @ai: pcpu_alloc_info to free
+ *
+ * Release @ai, which must come from pcpu_alloc_alloc_info().  The size
+ * recorded in @ai->__ai_size at allocation time is handed back.
+ */
+void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
+{
+	size_t bytes = ai->__ai_size;
+
+	memblock_free_early(__pa(ai), bytes);
+}
+
+/**
+ * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
+ * @lvl: loglevel
+ * @ai: allocation info to dump
+ *
+ * Print out information about @ai using loglevel @lvl.
+ */
+static void pcpu_dump_alloc_info(const char *lvl,
+				 const struct pcpu_alloc_info *ai)
+{
+	int group_width = 1, cpu_width = 1, width;
+	char empty_str[] = "--------";
+	int alloc = 0, alloc_end = 0;
+	int group, v;
+	int upa, apl;	/* units per alloc, allocs per line */
+
+	/* number of decimal digits needed to print a group index */
+	v = ai->nr_groups;
+	while (v /= 10)
+		group_width++;
+
+	/* number of decimal digits needed to print a cpu number */
+	v = num_possible_cpus();
+	while (v /= 10)
+		cpu_width++;
+	/* trim the placeholder for unused units to the cpu column width */
+	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
+
+	upa = ai->alloc_size / ai->unit_size;
+	/* printed width of one alloc: "[group] " plus upa cpu columns */
+	width = upa * (cpu_width + 1) + group_width + 3;
+	/* fit roughly 60 characters per line, at least one alloc */
+	apl = rounddown_pow_of_two(max(60 / width, 1));
+
+	/* header line; left unterminated, the loop below emits the "\n" */
+	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
+	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
+	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
+
+	for (group = 0; group < ai->nr_groups; group++) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
+		int unit = 0, unit_end = 0;
+
+		/* every group must consist of whole allocs */
+		BUG_ON(gi->nr_units % upa);
+		for (alloc_end += gi->nr_units / upa;
+		     alloc < alloc_end; alloc++) {
+			/* start a new output line every apl allocs */
+			if (!(alloc % apl)) {
+				printk(KERN_CONT "\n");
+				printk("%spcpu-alloc: ", lvl);
+			}
+			printk(KERN_CONT "[%0*d] ", group_width, group);
+
+			/* one column per unit: cpu number or placeholder */
+			for (unit_end += upa; unit < unit_end; unit++)
+				if (gi->cpu_map[unit] != NR_CPUS)
+					printk(KERN_CONT "%0*d ", cpu_width,
+					       gi->cpu_map[unit]);
+				else
+					printk(KERN_CONT "%s ", empty_str);
+		}
+	}
+	printk(KERN_CONT "\n");
+}
+
+/**
+ * pcpu_setup_first_chunk - initialize the first percpu chunk
+ * @ai: pcpu_alloc_info describing how the percpu area is shaped
+ * @base_addr: mapped address
+ *
+ * Initialize the first percpu chunk which contains the kernel static
+ * percpu area.  This function is to be called from arch percpu area
+ * setup path.
+ *
+ * @ai contains all information necessary to initialize the first
+ * chunk and prime the dynamic percpu allocator.
+ *
+ * @ai->static_size is the size of static percpu area.
+ *
+ * @ai->reserved_size, if non-zero, specifies the amount of bytes to
+ * reserve after the static area in the first chunk.  This reserves
+ * the first chunk such that it's available only through reserved
+ * percpu allocation.  This is primarily used to serve module percpu
+ * static areas on architectures where the addressing model has
+ * limited offset range for symbol relocations to guarantee module
+ * percpu symbols fall inside the relocatable range.
+ *
+ * @ai->dyn_size determines the number of bytes available for dynamic
+ * allocation in the first chunk.  The area between @ai->static_size +
+ * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
+ *
+ * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
+ * and equal to or larger than @ai->static_size + @ai->reserved_size +
+ * @ai->dyn_size.
+ *
+ * @ai->atom_size is the allocation atom size and used as alignment
+ * for vm areas.
+ *
+ * @ai->alloc_size is the allocation size and always multiple of
+ * @ai->atom_size.  This is larger than @ai->atom_size if
+ * @ai->unit_size is larger than @ai->atom_size.
+ *
+ * @ai->nr_groups and @ai->groups describe virtual memory layout of
+ * percpu areas.  Units which should be colocated are put into the
+ * same group.  Dynamic VM areas will be allocated according to these
+ * groupings.  @ai->nr_groups must be at least one; a non-positive
+ * value is rejected by the sanity checks.
+ *
+ * The caller should have mapped the first chunk at @base_addr and
+ * copied static data to each unit.
+ *
+ * If the first chunk ends up with both reserved and dynamic areas, it
+ * is served by two chunks - one to serve the core static and reserved
+ * areas and the other for the dynamic area.  They share the same vm
+ * and page map but use different area allocation maps to stay away
+ * from each other.  The latter chunk is circulated in the chunk slots
+ * and available for dynamic allocation like any other chunk.
+ *
+ * RETURNS:
+ * 0 on success.  A malformed @ai does not produce an error return;
+ * it is reported and then treated as fatal via BUG().
+ */
+int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+				  void *base_addr)
+{
+	/* temporary allocation maps, replaced later by percpu_init_late() */
+	static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
+	static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
+	size_t dyn_size = ai->dyn_size;
+	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
+	struct pcpu_chunk *schunk, *dchunk = NULL;
+	unsigned long *group_offsets;
+	size_t *group_sizes;
+	unsigned long *unit_off;
+	unsigned int cpu;
+	int *unit_map;
+	int group, unit, i;
+
+/*
+ * Report the failed condition, the possible-cpu mask and the whole
+ * alloc_info before dying.  Each message is newline-terminated so the
+ * failed condition is not run together with the following line.
+ */
+#define PCPU_SETUP_BUG_ON(cond)	do {					\
+	if (unlikely(cond)) {						\
+		pr_emerg("PERCPU: failed to initialize, %s\n", #cond);	\
+		pr_emerg("PERCPU: cpu_possible_mask=%*pb\n",		\
+			 cpumask_pr_args(cpu_possible_mask));		\
+		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
+		BUG();							\
+	}								\
+} while (0)
+
+	/* sanity checks */
+	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
+#ifdef CONFIG_SMP
+	PCPU_SETUP_BUG_ON(!ai->static_size);
+	PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
+#endif
+	PCPU_SETUP_BUG_ON(!base_addr);
+	PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
+	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
+	PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
+	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
+	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
+
+	/* process group information and build config tables accordingly */
+	group_offsets = memblock_virt_alloc(ai->nr_groups *
+					     sizeof(group_offsets[0]), 0);
+	group_sizes = memblock_virt_alloc(ai->nr_groups *
+					   sizeof(group_sizes[0]), 0);
+	unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
+	unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
+
+	/* UINT_MAX marks "no unit assigned yet" */
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+		unit_map[cpu] = UINT_MAX;
+
+	/* NR_CPUS marks "not determined yet" */
+	pcpu_low_unit_cpu = NR_CPUS;
+	pcpu_high_unit_cpu = NR_CPUS;
+
+	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
+		const struct pcpu_group_info *gi = &ai->groups[group];
+
+		group_offsets[group] = gi->base_offset;
+		group_sizes[group] = gi->nr_units * ai->unit_size;
+
+		for (i = 0; i < gi->nr_units; i++) {
+			cpu = gi->cpu_map[i];
+			/* unit not backed by any cpu */
+			if (cpu == NR_CPUS)
+				continue;
+
+			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
+			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
+			/* a cpu must not be mapped to more than one unit */
+			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
+
+			unit_map[cpu] = unit + i;
+			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
+
+			/* determine low/high unit_cpu */
+			if (pcpu_low_unit_cpu == NR_CPUS ||
+			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
+				pcpu_low_unit_cpu = cpu;
+			if (pcpu_high_unit_cpu == NR_CPUS ||
+			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
+				pcpu_high_unit_cpu = cpu;
+		}
+	}
+	pcpu_nr_units = unit;
+
+	/* every possible cpu must have ended up with a unit */
+	for_each_possible_cpu(cpu)
+		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
+
+	/* we're done parsing the input, undefine BUG macro and dump config */
+#undef PCPU_SETUP_BUG_ON
+	pcpu_dump_alloc_info(KERN_DEBUG, ai);
+
+	pcpu_nr_groups = ai->nr_groups;
+	pcpu_group_offsets = group_offsets;
+	pcpu_group_sizes = group_sizes;
+	pcpu_unit_map = unit_map;
+	pcpu_unit_offsets = unit_off;
+
+	/* determine basic parameters */
+	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
+	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
+	pcpu_atom_size = ai->atom_size;
+	/* struct pcpu_chunk is followed by one populated bit per unit page */
+	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
+		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
+
+	/*
+	 * Allocate chunk slots.  The additional last slot is for
+	 * empty chunks.
+	 */
+	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
+	pcpu_slot = memblock_virt_alloc(
+			pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
+	for (i = 0; i < pcpu_nr_slots; i++)
+		INIT_LIST_HEAD(&pcpu_slot[i]);
+
+	/*
+	 * Initialize static chunk.  If reserved_size is zero, the
+	 * static chunk covers static area + dynamic allocation area
+	 * in the first chunk.  If reserved_size is not zero, it
+	 * covers static area + reserved area (mostly used for module
+	 * static percpu allocation).
+	 */
+	schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
+	INIT_LIST_HEAD(&schunk->list);
+	INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
+	schunk->base_addr = base_addr;
+	schunk->map = smap;
+	schunk->map_alloc = ARRAY_SIZE(smap);
+	schunk->immutable = true;
+	bitmap_fill(schunk->populated, pcpu_unit_pages);
+	schunk->nr_populated = pcpu_unit_pages;
+
+	if (ai->reserved_size) {
+		schunk->free_size = ai->reserved_size;
+		pcpu_reserved_chunk = schunk;
+		pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
+	} else {
+		schunk->free_size = dyn_size;
+		dyn_size = 0;	/* dynamic area covered */
+	}
+	schunk->contig_hint = schunk->free_size;
+
+	/*
+	 * Seed the area map: entries are offsets with bit 0 set on the
+	 * ones written as "1 | offset" below.
+	 */
+	schunk->map[0] = 1;
+	schunk->map[1] = ai->static_size;
+	schunk->map_used = 1;
+	if (schunk->free_size)
+		schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size);
+	else
+		schunk->map[1] |= 1;
+
+	/* init dynamic chunk if necessary */
+	if (dyn_size) {
+		dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
+		INIT_LIST_HEAD(&dchunk->list);
+		INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
+		dchunk->base_addr = base_addr;
+		dchunk->map = dmap;
+		dchunk->map_alloc = ARRAY_SIZE(dmap);
+		dchunk->immutable = true;
+		bitmap_fill(dchunk->populated, pcpu_unit_pages);
+		dchunk->nr_populated = pcpu_unit_pages;
+
+		dchunk->contig_hint = dchunk->free_size = dyn_size;
+		dchunk->map[0] = 1;
+		dchunk->map[1] = pcpu_reserved_chunk_limit;
+		dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
+		dchunk->map_used = 2;
+	}
+
+	/* link the first chunk in */
+	pcpu_first_chunk = dchunk ?: schunk;
+	pcpu_nr_empty_pop_pages +=
+		pcpu_count_occupied_pages(pcpu_first_chunk, 1);
+	pcpu_chunk_relocate(pcpu_first_chunk, -1);
+
+	/* we're done */
+	pcpu_base_addr = base_addr;
+	return 0;
+}
+
+#ifdef CONFIG_SMP
+
+/* names of the first chunk allocators, indexed by enum pcpu_fc */
+const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
+	[PCPU_FC_AUTO]	= "auto",
+	[PCPU_FC_EMBED]	= "embed",
+	[PCPU_FC_PAGE]	= "page",
+};
+
+/* allocator selected with "percpu_alloc=", PCPU_FC_AUTO unless overridden */
+enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
+
+/*
+ * Parse the "percpu_alloc=" early parameter.  Only the allocators the
+ * arch config builds are accepted; anything else is warned about and
+ * pcpu_chosen_fc is left untouched.  Returns -EINVAL for a missing
+ * value, 0 otherwise.
+ */
+static int __init percpu_alloc_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	/* dummy head so that each #ifdef'd branch can start with "else if" */
+	if (0)
+		/* nada */;
+#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
+	else if (!strcmp(str, "embed"))
+		pcpu_chosen_fc = PCPU_FC_EMBED;
+#endif
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+	else if (!strcmp(str, "page"))
+		pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
+	else
+		pr_warning("PERCPU: unknown allocator %s specified\n", str);
+
+	return 0;
+}
+early_param("percpu_alloc", percpu_alloc_setup);
+
+/*
+ * pcpu_embed_first_chunk() is used by the generic percpu setup.
+ * Build it if needed by the arch config or the generic setup is going
+ * to be used.
+ */
+#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
+	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
+#define BUILD_EMBED_FIRST_CHUNK
+#endif
+
+/* build pcpu_page_first_chunk() iff needed by the arch config */
+#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
+#define BUILD_PAGE_FIRST_CHUNK
+#endif
+
+/* pcpu_build_alloc_info() is used by both embed and page first chunk */
+#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
+/**
+ * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: minimum free size for dynamic allocation in bytes
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ *
+ * This function determines grouping of units, their mappings to cpus
+ * and other parameters considering needed percpu size, allocation
+ * atom size and distances between CPUs.
+ *
+ * Groups are always multiples of atom size and CPUs which are of
+ * LOCAL_DISTANCE both ways are grouped together and share space for
+ * units in the same group.
The returned configuration is guaranteed
+ * to have CPUs on different nodes on different groups and >=75% usage
+ * of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, pointer to the new allocation_info is returned.  On
+ * failure, ERR_PTR value is returned.
+ */
+static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+				size_t reserved_size, size_t dyn_size,
+				size_t atom_size,
+				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+	/* group index of each cpu and number of cpus in each group */
+	static int group_map[NR_CPUS] __initdata;
+	static int group_cnt[NR_CPUS] __initdata;
+	const size_t static_size = __per_cpu_end - __per_cpu_start;
+	int nr_groups = 1, nr_units = 0;
+	size_t size_sum, min_unit_size, alloc_size;
+	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
+	int last_allocs, group, unit;
+	unsigned int cpu, tcpu;
+	struct pcpu_alloc_info *ai;
+	unsigned int *cpu_map;
+
+	/* this function may be called multiple times */
+	memset(group_map, 0, sizeof(group_map));
+	memset(group_cnt, 0, sizeof(group_cnt));
+
+	/* calculate size_sum and ensure dyn_size is enough for early alloc */
+	size_sum = PFN_ALIGN(static_size + reserved_size +
+			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
+	dyn_size = size_sum - static_size - reserved_size;
+
+	/*
+	 * Determine min_unit_size, alloc_size and max_upa such that
+	 * alloc_size is multiple of atom_size and is the smallest
+	 * which can accommodate 4k aligned segments which are equal to
+	 * or larger than min_unit_size.
+	 */
+	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+	alloc_size = roundup(min_unit_size, atom_size);
+	upa = alloc_size / min_unit_size;
+	/* largest upa that divides alloc_size into page-aligned units */
+	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+		upa--;
+	max_upa = upa;
+
+	/* group cpus according to their proximity */
+	for_each_possible_cpu(cpu) {
+		group = 0;
+	next_group:
+		/* only cpus visited before @cpu have a group assigned */
+		for_each_possible_cpu(tcpu) {
+			if (cpu == tcpu)
+				break;
+			if (group_map[tcpu] == group && cpu_distance_fn &&
+			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+				/* a member is not local, try the next group */
+				group++;
+				nr_groups = max(nr_groups, group + 1);
+				goto next_group;
+			}
+		}
+		group_map[cpu] = group;
+		group_cnt[group]++;
+	}
+
+	/*
+	 * Expand unit size until address space usage goes over 75%
+	 * and then as much as possible without using more address
+	 * space.
+	 */
+	last_allocs = INT_MAX;
+	for (upa = max_upa; upa; upa--) {
+		int allocs = 0, wasted = 0;
+
+		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+			continue;
+
+		for (group = 0; group < nr_groups; group++) {
+			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+			allocs += this_allocs;
+			wasted += this_allocs * upa - group_cnt[group];
+		}
+
+		/*
+		 * Don't accept if wastage is over 1/3.  The
+		 * greater-than comparison ensures upa==1 always
+		 * passes the following check.
+		 */
+		if (wasted > num_possible_cpus() / 3)
+			continue;
+
+		/* and then don't consume more memory */
+		if (allocs > last_allocs)
+			break;
+		last_allocs = allocs;
+		best_upa = upa;
+	}
+	upa = best_upa;
+
+	/* allocate and fill alloc_info */
+	for (group = 0; group < nr_groups; group++)
+		nr_units += roundup(group_cnt[group], upa);
+
+	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
+	if (!ai)
+		return ERR_PTR(-ENOMEM);
+	cpu_map = ai->groups[0].cpu_map;
+
+	/* carve the single cpu_map array into per-group slices */
+	for (group = 0; group < nr_groups; group++) {
+		ai->groups[group].cpu_map = cpu_map;
+		cpu_map += roundup(group_cnt[group], upa);
+	}
+
+	ai->static_size = static_size;
+	ai->reserved_size = reserved_size;
+	ai->dyn_size = dyn_size;
+	ai->unit_size = alloc_size / upa;
+	ai->atom_size = atom_size;
+	ai->alloc_size = alloc_size;
+
+	/*
+	 * Bound the walk by nr_groups like the loops above instead of
+	 * stopping at the first empty group_cnt[] slot: with NR_CPUS
+	 * populated groups there is no empty slot and the old condition
+	 * would read past the end of group_cnt[].
+	 */
+	for (group = 0, unit = 0; group < nr_groups; group++) {
+		struct pcpu_group_info *gi = &ai->groups[group];
+
+		/*
+		 * Initialize base_offset as if all groups are located
+		 * back-to-back.  The caller should update this to
+		 * reflect actual allocation.
+		 */
+		gi->base_offset = unit * ai->unit_size;
+
+		for_each_possible_cpu(cpu)
+			if (group_map[cpu] == group)
+				gi->cpu_map[gi->nr_units++] = cpu;
+		/* pad the group to whole allocs; padding stays NR_CPUS */
+		gi->nr_units = roundup(gi->nr_units, upa);
+		unit += gi->nr_units;
+	}
+	BUG_ON(unit != nr_units);
+
+	return ai;
+}
+#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
+
+#if defined(BUILD_EMBED_FIRST_CHUNK)
+/**
+ * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: minimum free size for dynamic allocation in bytes
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ * @alloc_fn: function to allocate percpu page
+ * @free_fn: function to free percpu page
+ *
+ * This is a helper to ease setting up embedded first percpu chunk and
+ * can be called where pcpu_setup_first_chunk() is expected.
+ *
+ * If this function is used to setup the first chunk, it is allocated
+ * by calling @alloc_fn and used as-is without being mapped into
+ * vmalloc area.  Allocations are always whole multiples of @atom_size
+ * aligned to @atom_size.
+ *
+ * This enables the first chunk to piggy back on the linear physical
+ * mapping which often uses larger page size.  Please note that this
+ * can result in very sparse cpu->unit mapping on NUMA machines thus
+ * requiring large vmalloc address space.  Don't use this allocator if
+ * vmalloc space is not orders of magnitude larger than distances
+ * between node memory addresses (ie. 32bit NUMA machines).
+ *
+ * @dyn_size specifies the minimum dynamic area size.
+ *
+ * If the needed size is smaller than the minimum or specified unit
+ * size, the leftover is returned using @free_fn.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
+				  size_t atom_size,
+				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
+				  pcpu_fc_alloc_fn_t alloc_fn,
+				  pcpu_fc_free_fn_t free_fn)
+{
+	void *base = (void *)ULONG_MAX;
+	void **areas = NULL;
+	struct pcpu_alloc_info *ai;
+	size_t size_sum, areas_size, max_distance;
+	int group, i, rc;
+
+	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
+				   cpu_distance_fn);
+	if (IS_ERR(ai))
+		return PTR_ERR(ai);
+
+	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
+	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
+
+	/* one pointer per group, remembering where the group was allocated */
+	areas = memblock_virt_alloc_nopanic(areas_size, 0);
+	if (!areas) {
+		rc = -ENOMEM;
+		goto out_free;
+	}
+
+	/* allocate and determine base address */
+	for (group = 0; group < ai->nr_groups; group++) {
+		struct pcpu_group_info *gi = &ai->groups[group];
+		unsigned int cpu = NR_CPUS;
+		void *ptr;
+
+		/* the first mapped cpu of the group is passed to @alloc_fn */
+		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
+			cpu = gi->cpu_map[i];
+		BUG_ON(cpu == NR_CPUS);
+
+		/* allocate space for the whole group */
+		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
+		if (!ptr) {
+			rc = -ENOMEM;
+			goto out_free_areas;
+		}
+		/* kmemleak tracks the percpu allocations separately */
+		kmemleak_free(ptr);
+		areas[group] = ptr;
+
+		base = min(ptr, base);
+	}
+
+	/* base address is now known, determine group base offsets */
+	max_distance = 0;
+	for (group = 0; group < ai->nr_groups; group++) {
+		ai->groups[group].base_offset = areas[group] - base;
+		max_distance = max_t(size_t, max_distance,
+				     ai->groups[group].base_offset);
+	}
+	max_distance += ai->unit_size;
+
+	/*
+	 * Warn if maximum distance is further than 75% of vmalloc space.
+	 * This has to be decided while every group area is still whole:
+	 * the failure path hands the areas back through out_free_areas,
+	 * which frees complete groups and so must run before the unused
+	 * parts are released below.  Bailing out through out_free here
+	 * would leak all of them.
+	 */
+	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
+		pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
+			   "space 0x%lx\n", max_distance,
+			   VMALLOC_TOTAL);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+		/* and fail if we have fallback */
+		rc = -EINVAL;
+		goto out_free_areas;
+#endif
+	}
+
+	/*
+	 * Copy data and free unused parts.  This should happen after all
+	 * allocations are complete; otherwise, we may end up with
+	 * overlapping groups.
+	 */
+	for (group = 0; group < ai->nr_groups; group++) {
+		struct pcpu_group_info *gi = &ai->groups[group];
+		void *ptr = areas[group];
+
+		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
+			if (gi->cpu_map[i] == NR_CPUS) {
+				/* unused unit, free whole */
+				free_fn(ptr, ai->unit_size);
+				continue;
+			}
+			/* copy and return the unused part */
+			memcpy(ptr, __per_cpu_load, ai->static_size);
+			free_fn(ptr + size_sum, ai->unit_size - size_sum);
+		}
+	}
+
+	pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
+		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
+		ai->dyn_size, ai->unit_size);
+
+	rc = pcpu_setup_first_chunk(ai, base);
+	goto out_free;
+
+out_free_areas:
+	/* only reached while each allocated group area is still whole */
+	for (group = 0; group < ai->nr_groups; group++)
+		if (areas[group])
+			free_fn(areas[group],
+				ai->groups[group].nr_units * ai->unit_size);
+out_free:
+	pcpu_free_alloc_info(ai);
+	if (areas)
+		memblock_free_early(__pa(areas), areas_size);
+	return rc;
+}
+#endif /* BUILD_EMBED_FIRST_CHUNK */
+
+#ifdef BUILD_PAGE_FIRST_CHUNK
+/**
+ * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
+ * @free_fn: function to free percpu page, always called with PAGE_SIZE
+ * @populate_pte_fn: function to populate pte
+ *
+ * This is a helper to ease setting up page-remapped first percpu
+ * chunk and can be called where pcpu_setup_first_chunk() is expected.
+ *
+ * This is the basic allocator.  Static percpu area is allocated
+ * page-by-page into vmalloc area.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init pcpu_page_first_chunk(size_t reserved_size,
+				 pcpu_fc_alloc_fn_t alloc_fn,
+				 pcpu_fc_free_fn_t free_fn,
+				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
+{
+	static struct vm_struct vm;
+	struct pcpu_alloc_info *ai;
+	char psize_str[16];
+	int unit_pages;
+	size_t pages_size;
+	struct page **pages;
+	int unit, i, j, rc;
+
+	/* page size in KiB, used in the messages below */
+	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
+
+	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
+	if (IS_ERR(ai))
+		return PTR_ERR(ai);
+	/* without a distance callback a single group holds every cpu */
+	BUG_ON(ai->nr_groups != 1);
+	BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
+
+	unit_pages = ai->unit_size >> PAGE_SHIFT;
+
+	/* unaligned allocations can't be freed, round up to page size */
+	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
+			       sizeof(pages[0]));
+	pages = memblock_virt_alloc(pages_size, 0);
+
+	/* allocate pages */
+	j = 0;
+	for (unit = 0; unit < num_possible_cpus(); unit++)
+		for (i = 0; i < unit_pages; i++) {
+			unsigned int cpu = ai->groups[0].cpu_map[unit];
+			void *ptr;
+
+			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
+			if (!ptr) {
+				pr_warning("PERCPU: failed to allocate %s page "
+					   "for cpu%u\n", psize_str, cpu);
+				goto enomem;
+			}
+			/* kmemleak tracks the percpu allocations separately */
+			kmemleak_free(ptr);
+			pages[j++] = virt_to_page(ptr);
+		}
+
+	/* allocate vm area, map the pages and copy static data */
+	vm.flags = VM_ALLOC;
+	vm.size = num_possible_cpus() * ai->unit_size;
+	vm_area_register_early(&vm, PAGE_SIZE);
+
+	for (unit = 0; unit < num_possible_cpus(); unit++) {
+		unsigned long unit_addr =
+			(unsigned long)vm.addr + unit * ai->unit_size;
+
+		for (i = 0; i < unit_pages; i++)
+			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
+
+		/* pte already populated, the following shouldn't fail */
+		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
+				      unit_pages);
+		if (rc < 0)
+			panic("failed to map percpu area, err=%d\n", rc);
+
+		/*
+		 * FIXME: Archs with virtual cache should flush local
+		 * cache for the linear mapping here - something
+		 * equivalent to flush_cache_vmap() on the local cpu.
+		 * flush_cache_vmap() can't be used as most supporting
+		 * data structures are not set up yet.
+		 */
+
+		/* copy static data */
+		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
+	}
+
+	/* we're ready, commit */
+	pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
+		unit_pages, psize_str, vm.addr, ai->static_size,
+		ai->reserved_size, ai->dyn_size);
+
+	rc = pcpu_setup_first_chunk(ai, vm.addr);
+	goto out_free_ar;
+
+enomem:
+	/* give back the pages obtained so far, newest first */
+	while (--j >= 0)
+		free_fn(page_address(pages[j]), PAGE_SIZE);
+	rc = -ENOMEM;
+out_free_ar:
+	memblock_free_early(__pa(pages), pages_size);
+	pcpu_free_alloc_info(ai);
+	return rc;
+}
+#endif /* BUILD_PAGE_FIRST_CHUNK */
+
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
+/*
+ * Generic SMP percpu area setup.
+ *
+ * The embedding helper is used because its behavior closely resembles
+ * the original non-dynamic generic percpu area setup.  This is
+ * important because many archs have addressing restrictions and might
+ * fail if the percpu area is located far away from the previous
+ * location.
As an added bonus, in non-NUMA cases, embedding is
+ * generally a good idea TLB-wise because percpu area can piggy back
+ * on the physical linear memory mapping which uses large page
+ * mappings on applicable archs.
+ */
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+
+/*
+ * Default first chunk allocation callback: memblock memory starting the
+ * search at MAX_DMA_ADDRESS.  @cpu is not used.  May return NULL.
+ */
+static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
+				       size_t align)
+{
+	void *area;
+
+	area = memblock_virt_alloc_from_nopanic(size, align,
+						__pa(MAX_DMA_ADDRESS));
+	return area;
+}
+
+/* Default first chunk free callback, counterpart of pcpu_dfl_fc_alloc(). */
+static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
+{
+	phys_addr_t phys = __pa(ptr);
+
+	memblock_free_early(phys, size);
+}
+
+/*
+ * Embed the first chunk with the default callbacks and derive each
+ * possible cpu's __per_cpu_offset[] entry from the resulting layout.
+ * Failure to set up the first chunk is fatal.
+ */
+void __init setup_per_cpu_areas(void)
+{
+	unsigned long base_delta;
+	unsigned int cpu;
+
+	/*
+	 * A module reserve is always requested, matching what the
+	 * legacy allocator did.
+	 */
+	if (pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
+				   PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
+				   pcpu_dfl_fc_alloc, pcpu_dfl_fc_free) < 0)
+		panic("Failed to initialize percpu areas.");
+
+	/* distance from the static percpu section to the first chunk */
+	base_delta = (unsigned long)pcpu_base_addr -
+		     (unsigned long)__per_cpu_start;
+	for_each_possible_cpu(cpu)
+		__per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
+}
+#endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+
+#else	/* CONFIG_SMP */
+
+/*
+ * UP percpu area setup.
+ *
+ * UP always uses km-based percpu allocator with identity mapping.
+ * Static percpu variables are indistinguishable from the usual static
+ * variables and don't require any special preparation.
+ */
+void __init setup_per_cpu_areas(void)
+{
+	const size_t unit_size =
+		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
+					 PERCPU_DYNAMIC_RESERVE));
+	struct pcpu_alloc_info *ai = pcpu_alloc_alloc_info(1, 1);
+	void *first_chunk = memblock_virt_alloc_from_nopanic(unit_size,
+							PAGE_SIZE,
+							__pa(MAX_DMA_ADDRESS));
+	struct pcpu_group_info *gi;
+
+	if (!ai || !first_chunk)
+		panic("Failed to allocate memory for percpu areas.");
+	/* kmemleak tracks the percpu allocations separately */
+	kmemleak_free(first_chunk);
+
+	/* a single unit which is dynamic area in its entirety */
+	ai->dyn_size = ai->unit_size = unit_size;
+	ai->atom_size = ai->alloc_size = unit_size;
+
+	/* one group holding one unit, owned by cpu 0 */
+	gi = &ai->groups[0];
+	gi->nr_units = 1;
+	gi->cpu_map[0] = 0;
+
+	if (pcpu_setup_first_chunk(ai, first_chunk) < 0)
+		panic("Failed to initialize percpu areas.");
+}
+
+#endif	/* CONFIG_SMP */
+
+/*
+ * The first and reserved chunks start out with temporary allocation
+ * maps living in initdata so that they are usable before slab is
+ * online.  Once slab is up this function swaps each of them for a
+ * properly allocated copy.
+ */
+void __init percpu_init_late(void)
+{
+	struct pcpu_chunk *early_chunks[] =
+		{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };
+	struct pcpu_chunk **pos;
+	unsigned long flags;
+
+	/* the array is NULL terminated; a NULL entry ends the walk */
+	for (pos = early_chunks; *pos; pos++) {
+		struct pcpu_chunk *chunk = *pos;
+		int *new_map;
+		const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(new_map[0]);
+
+		BUILD_BUG_ON(size > PAGE_SIZE);
+
+		new_map = pcpu_mem_zalloc(size);
+		BUG_ON(!new_map);
+
+		/* copy and switch over under pcpu_lock */
+		spin_lock_irqsave(&pcpu_lock, flags);
+		memcpy(new_map, chunk->map, size);
+		chunk->map = new_map;
+		spin_unlock_irqrestore(&pcpu_lock, flags);
+	}
+}
+
+/*
+ * Percpu allocator is initialized early during boot when neither slab or
+ * workqueue is available.  Plug async management until everything is up
+ * and running.
+ */
+static int __init percpu_enable_async(void)
+{
+	/* set from a subsys initcall, see the registration below */
+	pcpu_async_enabled = true;
+	return 0;
+}
+subsys_initcall(percpu_enable_async);
diff --git a/kernel/mm/pgtable-generic.c b/kernel/mm/pgtable-generic.c
new file mode 100644
index 000000000..c25f94b33
--- /dev/null
+++ b/kernel/mm/pgtable-generic.c
@@ -0,0 +1,200 @@
+/*
+ *  mm/pgtable-generic.c
+ *
+ *  Generic pgtable methods declared in asm-generic/pgtable.h
+ *
+ *  Copyright (C) 2010  Linus Torvalds
+ */
+
+#include
+#include
+#include
+
+/*
+ * If a p?d_bad entry is found while walking page tables, report
+ * the error, before resetting entry to p?d_none.  Usually (but
+ * very seldom) called out from the p?d_none_or_clear_bad macros.
+ */
+
+/* Report the bad entry at @pgd via pgd_ERROR(), then clear it. */
+void pgd_clear_bad(pgd_t *pgd)
+{
+	pgd_ERROR(*pgd);
+	pgd_clear(pgd);
+}
+
+/* Report the bad entry at @pud via pud_ERROR(), then clear it. */
+void pud_clear_bad(pud_t *pud)
+{
+	pud_ERROR(*pud);
+	pud_clear(pud);
+}
+
+/* Report the bad entry at @pmd via pmd_ERROR(), then clear it. */
+void pmd_clear_bad(pmd_t *pmd)
+{
+	pmd_ERROR(*pmd);
+	pmd_clear(pmd);
+}
+
+#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+/*
+ * Only sets the access flags (dirty, accessed), as well as write
+ * permission. Furthermore, we know it always gets set to a "more
+ * permissive" setting, which allows most architectures to optimize
+ * this. We return whether the PTE actually changed, which in turn
+ * instructs the caller to do things like update_mmu_cache.
This + * used to be done in the caller, but sparc needs minor faults to + * force that call on sun4c so we changed this macro slightly + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + if (changed) { + set_pte_at(vma->vm_mm, address, ptep, entry); + flush_tlb_fix_spurious_fault(vma, address); + } + return changed; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int changed = !pmd_same(*pmdp, entry); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (changed) { + set_pmd_at(vma->vm_mm, address, pmdp, entry); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + return changed; +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ + BUG(); + return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#else + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + struct mm_struct *mm = (vma)->vm_mm; + pte_t pte; + pte = ptep_get_and_clear(mm, address, ptep); + if (pte_accessible(mm, pte)) + 
flush_tlb_page(vma, address); + return pte; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = pmd_mksplitting(*pmdp); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + /* tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(&pgtable->lru); + else + list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); + pmd_huge_pte(mm, pmdp) = pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* no "address" argument so destroys page coloring of some arch */ +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + if (list_empty(&pgtable->lru)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef 
__HAVE_ARCH_PMDP_INVALIDATE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+		     pmd_t *pmdp)
+{
+	pmd_t entry = *pmdp;
+	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));	/* same entry, marked not-present */
+	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
diff --git a/kernel/mm/process_vm_access.c b/kernel/mm/process_vm_access.c
new file mode 100644
index 000000000..e88d07164
--- /dev/null
+++ b/kernel/mm/process_vm_access.c
@@ -0,0 +1,365 @@
+/*
+ * linux/mm/process_vm_access.c
+ *
+ * Copyright (C) 2010-2011 Christopher Yeoh , IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include	/* NOTE(review): header names are missing from the #include lines in this copy */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef CONFIG_COMPAT
+#include
+#endif
+
+/**
+ * process_vm_rw_pages - read/write pages from task specified
+ * @pages: array of pointers to pages we want to copy
+ * @offset: offset in the first page to start copying from/to
+ * @len: number of bytes to copy
+ * @iter: where to copy to/from locally
+ * @vm_write: 0 means copy from, 1 means copy to
+ * Returns 0 on success, -EFAULT if a page copy comes up short
+ */
+static int process_vm_rw_pages(struct page **pages,
+			       unsigned offset,
+			       size_t len,
+			       struct iov_iter *iter,
+			       int vm_write)
+{
+	/* Do the copy for each page */
+	while (len && iov_iter_count(iter)) {
+		struct page *page = *pages++;
+		size_t copy = PAGE_SIZE - offset;	/* bytes left in this page */
+		size_t copied;
+
+		if (copy > len)
+			copy = len;
+
+		if (vm_write) {
+			copied = copy_page_from_iter(page, offset, copy, iter);
+			set_page_dirty_lock(page);
+		} else {
+			copied = copy_page_to_iter(page, offset, copy, iter);
+		}
+		len -= copied;
+		if (copied < copy && iov_iter_count(iter))	/* short copy although iter has room left */
+			return -EFAULT;
+		offset = 0;	/* every page after the first is used from its start */
+	}
+	return 0;
+}
+
+/* Maximum size in bytes (two pages' worth) of the kmalloc'd struct page pointer array */
+#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
+
+/**
+ * process_vm_rw_single_vec - read/write pages from task specified
+ * @addr: start memory address of target process
+ * @len: size of area to copy to/from
+ * @iter: where to copy to/from locally
+ * @process_pages: array of struct page pointers used as scratch space; at
+ *  most PVM_MAX_KMALLOC_PAGES / sizeof(pointer) entries are used per pass
+ * @mm: mm for task
+ * @task: task to read/write from
+ * @vm_write: 0 means copy from, 1 means copy to
+ * Returns 0 on success, or an error code on failure
+ */
+static int process_vm_rw_single_vec(unsigned long addr,
+				    unsigned long len,
+				    struct iov_iter *iter,
+				    struct page **process_pages,
+				    struct mm_struct *mm,
+				    struct task_struct *task,
+				    int vm_write)
+{
+	unsigned long pa = addr & PAGE_MASK;
+	unsigned long start_offset = addr - pa;
+	unsigned long nr_pages;
+	ssize_t rc = 0;
+	unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
+		/ sizeof(struct pages *);	/* NOTE(review): "struct pages" looks like a typo for "struct page"; pointer size either way */
+
+	/* Work out address and page range required */
+	if (len == 0)
+		return 0;
+	nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
+
+	while (!rc && nr_pages && iov_iter_count(iter)) {
+		int pages = min(nr_pages, max_pages_per_loop);
+		size_t bytes;
+
+		/* Get the pages we're interested in */
+		pages = get_user_pages_unlocked(task, mm, pa, pages,
+						vm_write, 0, process_pages);
+		if (pages <= 0)
+			return -EFAULT;
+
+		bytes = pages * PAGE_SIZE - start_offset;
+		if (bytes > len)
+			bytes = len;
+
+		rc = process_vm_rw_pages(process_pages,
+					 start_offset, bytes, iter,
+					 vm_write);
+		len -= bytes;
+		start_offset = 0;	/* only the first pass starts mid-page */
+		nr_pages -= pages;
+		pa += pages * PAGE_SIZE;
+		while (pages)	/* drop every page reference obtained above */
+			put_page(process_pages[--pages]);
+	}
+
+	return rc;
+}
+
+/* Maximum number of entries for process pages array
+   which lives on stack */
+#define PVM_MAX_PP_ARRAY_COUNT 16
+
+/**
+ * process_vm_rw_core - core of reading/writing pages from task specified
+ * @pid: PID of process to
read/write from/to
+ * @iter: where to copy to/from locally
+ * @rvec: iovec array specifying where to copy to/from in the other process
+ * @riovcnt: size of rvec array
+ * @flags: currently unused
+ * @vm_write: 0 if reading from other process, 1 if writing to other process
+ * Returns the number of bytes read/written or error code. May
+ * return less bytes than expected if an error occurs during the copying
+ * process.
+ */
+static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
+				  const struct iovec *rvec,
+				  unsigned long riovcnt,
+				  unsigned long flags, int vm_write)
+{
+	struct task_struct *task;
+	struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
+	struct page **process_pages = pp_stack;
+	struct mm_struct *mm;
+	unsigned long i;
+	ssize_t rc = 0;
+	unsigned long nr_pages = 0;
+	unsigned long nr_pages_iov;
+	ssize_t iov_len;
+	size_t total_len = iov_iter_count(iter);
+
+	/*
+	 * Work out how many pages of struct pages we're going to need
+	 * when eventually calling get_user_pages
+	 */
+	for (i = 0; i < riovcnt; i++) {
+		iov_len = rvec[i].iov_len;
+		if (iov_len > 0) {
+			nr_pages_iov = ((unsigned long)rvec[i].iov_base
+					+ iov_len)
+				/ PAGE_SIZE - (unsigned long)rvec[i].iov_base
+				/ PAGE_SIZE + 1;
+			nr_pages = max(nr_pages, nr_pages_iov);	/* sized for the largest single iovec */
+		}
+	}
+
+	if (nr_pages == 0)
+		return 0;
+
+	if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
+		/* For reliability don't try to kmalloc more than
+		   2 pages worth */
+		process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
+					      sizeof(struct pages *)*nr_pages),
+					GFP_KERNEL);
+
+		if (!process_pages)
+			return -ENOMEM;
+	}
+
+	/* Get process information */
+	rcu_read_lock();
+	task = find_task_by_vpid(pid);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+	if (!task) {
+		rc = -ESRCH;
+		goto free_proc_pages;
+	}
+
+	mm = mm_access(task, PTRACE_MODE_ATTACH);
+	if (!mm || IS_ERR(mm)) {
+		rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+		/*
+		 * Explicitly map EACCES to EPERM as EPERM is a more
+		 * appropriate error code for process_vm_readv/writev
+		 */
+		if (rc == -EACCES)
+			rc = -EPERM;
+		goto put_task_struct;
+	}
+
+	for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
+		rc = process_vm_rw_single_vec(
+			(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
+			iter, process_pages, mm, task, vm_write);
+
+	/* copied = space before - space after */
+	total_len -= iov_iter_count(iter);
+
+	/* If we have managed to copy any data at all then
+	   we return the number of bytes copied. Otherwise
+	   we return the error code */
+	if (total_len)
+		rc = total_len;
+
+	mmput(mm);
+
+put_task_struct:
+	put_task_struct(task);
+
+free_proc_pages:
+	if (process_pages != pp_stack)	/* free only the kmalloc'd array, never the on-stack one */
+		kfree(process_pages);
+	return rc;
+}
+
+/**
+ * process_vm_rw - check iovecs before calling core routine
+ * @pid: PID of process to read/write from/to
+ * @lvec: iovec array specifying where to copy to/from locally
+ * @liovcnt: size of lvec array
+ * @rvec: iovec array specifying where to copy to/from in the other process
+ * @riovcnt: size of rvec array
+ * @flags: must be 0, otherwise -EINVAL is returned
+ * @vm_write: 0 if reading from other process, 1 if writing to other process
+ * Returns the number of bytes read/written or error code. May
+ * return fewer bytes than expected if an error occurs during the copying
+ * process.
+ */
+static ssize_t process_vm_rw(pid_t pid,
+			     const struct iovec __user *lvec,
+			     unsigned long liovcnt,
+			     const struct iovec __user *rvec,
+			     unsigned long riovcnt,
+			     unsigned long flags, int vm_write)
+{
+	struct iovec iovstack_l[UIO_FASTIOV];
+	struct iovec iovstack_r[UIO_FASTIOV];
+	struct iovec *iov_l = iovstack_l;
+	struct iovec *iov_r = iovstack_r;
+	struct iov_iter iter;
+	ssize_t rc;
+	int dir = vm_write ? WRITE : READ;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	/* Check iovecs */
+	rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
+	if (rc < 0)
+		return rc;
+	if (!iov_iter_count(&iter))	/* nothing to copy locally */
+		goto free_iovecs;
+
+	rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
+				   iovstack_r, &iov_r);
+	if (rc <= 0)
+		goto free_iovecs;
+
+	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
+
+free_iovecs:
+	if (iov_r != iovstack_r)
+		kfree(iov_r);
+	kfree(iov_l);
+
+	return rc;
+}
+
+SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
+		unsigned long, liovcnt, const struct iovec __user *, rvec,
+		unsigned long, riovcnt, unsigned long, flags)
+{
+	return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);	/* vm_write = 0: read */
+}
+
+SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
+		const struct iovec __user *, lvec,
+		unsigned long, liovcnt, const struct iovec __user *, rvec,
+		unsigned long, riovcnt, unsigned long, flags)
+{
+	return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);	/* vm_write = 1: write */
+}
+
+#ifdef CONFIG_COMPAT
+
+static ssize_t
+compat_process_vm_rw(compat_pid_t pid,
+		     const struct compat_iovec __user *lvec,
+		     unsigned long liovcnt,
+		     const struct compat_iovec __user *rvec,
+		     unsigned long riovcnt,
+		     unsigned long flags, int vm_write)
+{
+	struct iovec iovstack_l[UIO_FASTIOV];
+	struct iovec iovstack_r[UIO_FASTIOV];
+	struct iovec *iov_l = iovstack_l;
+	struct iovec *iov_r = iovstack_r;
+	struct iov_iter iter;
+	ssize_t rc = -EFAULT;
+	int dir = vm_write ? WRITE : READ;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
+	if (rc < 0)
+		return rc;
+	if (!iov_iter_count(&iter))
+		goto free_iovecs;
+	rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
+					  UIO_FASTIOV, iovstack_r,
+					  &iov_r);
+	if (rc <= 0)
+		goto free_iovecs;
+
+	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);	/* same core as the native path */
+
+free_iovecs:
+	if (iov_r != iovstack_r)
+		kfree(iov_r);
+	kfree(iov_l);
+	return rc;
+}
+
+COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid,
+		       const struct compat_iovec __user *, lvec,
+		       compat_ulong_t, liovcnt,
+		       const struct compat_iovec __user *, rvec,
+		       compat_ulong_t, riovcnt,
+		       compat_ulong_t, flags)
+{
+	return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
+				    riovcnt, flags, 0);
+}
+
+COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid,
+		       const struct compat_iovec __user *, lvec,
+		       compat_ulong_t, liovcnt,
+		       const struct compat_iovec __user *, rvec,
+		       compat_ulong_t, riovcnt,
+		       compat_ulong_t, flags)
+{
+	return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
+				    riovcnt, flags, 1);
+}
+
+#endif
diff --git a/kernel/mm/quicklist.c b/kernel/mm/quicklist.c
new file mode 100644
index 000000000..942212970
--- /dev/null
+++ b/kernel/mm/quicklist.c
@@ -0,0 +1,102 @@
+/*
+ * Quicklist support.
+ *
+ * Quicklists are light weight lists of pages that have a defined state
+ * on alloc and free. Pages must be in the quicklist specific defined state
+ * (zero by default) when the page is freed. It seems that the initial idea
+ * for such lists first came from Dave Miller and then various other people
+ * improved on it.
+ *
+ * Copyright (C) 2007 SGI,
+ * Christoph Lameter
+ * Generalized, added support for multiple lists and
+ * constructors / destructors.
+ */
+#include	/* NOTE(review): header names are missing from the #include lines in this copy */
+
+#include
+#include
+#include
+#include
+
+DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
+
+#define FRACTION_OF_NODE_MEM	16	/* divisor applied to the node's free page count in max_pages() */
+
+static unsigned long max_pages(unsigned long min_pages)
+{
+	unsigned long node_free_pages, max;
+	int node = numa_node_id();
+	struct zone *zones = NODE_DATA(node)->node_zones;
+	int num_cpus_on_node;
+
+	node_free_pages =
+#ifdef CONFIG_ZONE_DMA
+		zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) +
+#endif
+#ifdef CONFIG_ZONE_DMA32
+		zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) +
+#endif
+		zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
+
+	max = node_free_pages / FRACTION_OF_NODE_MEM;
+
+	num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
+	max /= num_cpus_on_node;	/* split the node's share across its CPUs */
+
+	return max(max, min_pages);	/* never less than the caller's minimum */
+}
+
+static long min_pages_to_free(struct quicklist *q,
+	unsigned long min_pages, long max_free)
+{
+	long pages_to_free;
+
+	pages_to_free = q->nr_pages - max_pages(min_pages);	/* may be negative: nothing to free */
+
+	return min(pages_to_free, max_free);
+}
+
+/*
+ * Trim down the number of pages in the quicklist
+ */
+void quicklist_trim(int nr, void (*dtor)(void *),
+	unsigned long min_pages, unsigned long max_free)
+{
+	long pages_to_free;
+	struct quicklist *q;
+
+	q = &get_cpu_var(quicklist)[nr];
+	if (q->nr_pages > min_pages) {
+		pages_to_free = min_pages_to_free(q, min_pages, max_free);
+
+		while (pages_to_free > 0) {
+			/*
+			 * We pass a gfp_t of 0 to quicklist_alloc here
+			 * because we will never call into the page allocator.
+			 */
+			void *p = quicklist_alloc(nr, 0, NULL);
+
+			if (dtor)	/* the destructor is optional */
+				dtor(p);
+			free_page((unsigned long)p);
+			pages_to_free--;
+		}
+	}
+	put_cpu_var(quicklist);	/* pairs with get_cpu_var() above */
+}
+
+unsigned long quicklist_total_size(void)
+{
+	unsigned long count = 0;
+	int cpu;
+	struct quicklist *ql, *q;
+
+	for_each_online_cpu(cpu) {
+		ql = per_cpu(quicklist, cpu);
+		for (q = ql; q < ql + CONFIG_NR_QUICK; q++)	/* sum every quicklist of this CPU */
+			count += q->nr_pages;
+	}
+	return count;
+}
+
diff --git a/kernel/mm/readahead.c b/kernel/mm/readahead.c
new file mode 100644
index 000000000..935675844
--- /dev/null
+++ b/kernel/mm/readahead.c
@@ -0,0 +1,580 @@
+/*
+ * mm/readahead.c - address_space-level file readahead.
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 09Apr2002	Andrew Morton
+ *		Initial version.
+ */
+
+#include	/* NOTE(review): header names are missing from the #include lines in this copy */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "internal.h"
+
+/*
+ * Initialise a struct file's readahead state.  Assumes that the caller has
+ * memset *ra to zero.
+ */
+void
+file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
+{
+	ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
+	ra->prev_pos = -1;
+}
+EXPORT_SYMBOL_GPL(file_ra_state_init);
+
+#define list_to_page(head) (list_entry((head)->prev, struct page, lru))	/* page at the tail of the list */
+
+/*
+ * see if a page needs releasing upon read_cache_pages() failure
+ * - the caller of read_cache_pages() may have set PG_private or PG_fscache
+ *   before calling, such as the NFS fs marking pages that are cached locally
+ *   on disk, thus we need to give the fs a chance to clean up in the event of
+ *   an error
+ */
+static void read_cache_pages_invalidate_page(struct address_space *mapping,
+					     struct page *page)
+{
+	if (page_has_private(page)) {
+		if (!trylock_page(page))
+			BUG();
+		page->mapping = mapping;	/* set only for the duration of the invalidate call */
+		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		page->mapping = NULL;
+		unlock_page(page);
+	}
+	page_cache_release(page);
+}
+
+/*
+ * release a list of pages, invalidating them first if need be
+ */
+static void read_cache_pages_invalidate_pages(struct address_space *mapping,
+					      struct list_head *pages)
+{
+	struct page *victim;
+
+	while (!list_empty(pages)) {
+		victim = list_to_page(pages);
+		list_del(&victim->lru);
+		read_cache_pages_invalidate_page(mapping, victim);
+	}
+}
+
+/**
+ * read_cache_pages - populate an address space with some pages & start reads against them
+ * @mapping: the address_space
+ * @pages: The address of a list_head which contains the target pages.  These
+ *   pages have their ->index populated and are otherwise uninitialised.
+ * @filler: callback routine for filling a single page.
+ * @data: private data for the callback routine.
+ *
+ * Hides the details of the LRU cache etc from the filesystems.
+ */
+int read_cache_pages(struct address_space *mapping, struct list_head *pages,
+			int (*filler)(void *, struct page *), void *data)
+{
+	struct page *page;
+	int ret = 0;
+
+	while (!list_empty(pages)) {
+		page = list_to_page(pages);
+		list_del(&page->lru);
+		if (add_to_page_cache_lru(page, mapping,
+					page->index, GFP_KERNEL)) {
+			read_cache_pages_invalidate_page(mapping, page);	/* could not insert: release and move on */
+			continue;
+		}
+		page_cache_release(page);
+
+		ret = filler(data, page);
+		if (unlikely(ret)) {
+			read_cache_pages_invalidate_pages(mapping, pages);	/* release everything still on the list */
+			break;
+		}
+		task_io_account_read(PAGE_CACHE_SIZE);
+	}
+	return ret;	/* 0, or the first non-zero @filler result */
+}
+
+EXPORT_SYMBOL(read_cache_pages);
+
+static int read_pages(struct address_space *mapping, struct file *filp,
+		struct list_head *pages, unsigned nr_pages)
+{
+	struct blk_plug plug;
+	unsigned page_idx;
+	int ret;
+
+	blk_start_plug(&plug);
+
+	if (mapping->a_ops->readpages) {
+		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+		/* Clean up the remaining pages */
+		put_pages_list(pages);
+		goto out;
+	}
+
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {	/* no ->readpages: one ->readpage call per page */
+		struct page *page = list_to_page(pages);
+		list_del(&page->lru);
+		if (!add_to_page_cache_lru(page, mapping,
+					page->index, GFP_KERNEL)) {
+			mapping->a_ops->readpage(filp, page);
+		}
+		page_cache_release(page);
+	}
+	ret = 0;
+
+out:
+	blk_finish_plug(&plug);
+
+	return ret;
+}
+
+/*
+ * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates all
+ * the pages first, then submits them all for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ *
+ * Returns the number of pages allocated and handed to read_pages().
+ */
+int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			pgoff_t offset, unsigned long nr_to_read,
+			unsigned long lookahead_size)
+{
+	struct inode *inode = mapping->host;
+	struct page *page;
+	unsigned long end_index;	/* The last page we want to read */
+	LIST_HEAD(page_pool);
+	int page_idx;
+	int ret = 0;
+	loff_t isize = i_size_read(inode);
+
+	if (isize == 0)
+		goto out;
+
+	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+
+	/*
+	 * Preallocate as many pages as we will need.
+	 */
+	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
+		pgoff_t page_offset = offset + page_idx;
+
+		if (page_offset > end_index)	/* stop at the last page of the file */
+			break;
+
+		rcu_read_lock();
+		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+		rcu_read_unlock();
+		if (page && !radix_tree_exceptional_entry(page))	/* a real page is already cached here */
+			continue;
+
+		page = page_cache_alloc_readahead(mapping);
+		if (!page)
+			break;
+		page->index = page_offset;
+		list_add(&page->lru, &page_pool);
+		if (page_idx == nr_to_read - lookahead_size)	/* first page of the lookahead part */
+			SetPageReadahead(page);
+		ret++;
+	}
+
+	/*
+	 * Now start the IO.  We ignore I/O errors - if the page is not
+	 * uptodate then the caller will launch readpage again, and
+	 * will then handle the error.
+	 */
+	if (ret)
+		read_pages(mapping, filp, &page_pool, ret);
+	BUG_ON(!list_empty(&page_pool));
+out:
+	return ret;
+}
+
+/*
+ * Chunk the readahead into 2 megabyte units, so that we don't pin too much
+ * memory at once.
+ */
+int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
+		pgoff_t offset, unsigned long nr_to_read)
+{
+	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
+		return -EINVAL;
+
+	nr_to_read = max_sane_readahead(nr_to_read);	/* clamp the request first */
+	while (nr_to_read) {
+		int err;
+
+		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
+
+		if (this_chunk > nr_to_read)
+			this_chunk = nr_to_read;
+		err = __do_page_cache_readahead(mapping, filp,
+						offset, this_chunk, 0);
+		if (err < 0)
+			return err;
+
+		offset += this_chunk;
+		nr_to_read -= this_chunk;
+	}
+	return 0;
+}
+
+#define MAX_READAHEAD   ((512*4096)/PAGE_CACHE_SIZE)	/* 512 * 4096 bytes = 2 MiB, in PAGE_CACHE_SIZE units */
+/*
+ * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
+ * sensible upper limit.
+ */
+unsigned long max_sane_readahead(unsigned long nr)
+{
+	return min(nr, MAX_READAHEAD);
+}
+
+/*
+ * Set the initial window size: round the request up to the next power of 2,
+ * then x 4 if the result is at most max/32, x 2 if it is at most max/4,
+ * and use max itself otherwise.
+ * E.g. with max = 32 pages: 1 page -> 4, 2-8 pages -> 4-16, > 8 pages -> 32
+ */
+static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
+{
+	unsigned long newsize = roundup_pow_of_two(size);
+
+	if (newsize <= max / 32)
+		newsize = newsize * 4;
+	else if (newsize <= max / 4)
+		newsize = newsize * 2;
+	else
+		newsize = max;
+
+	return newsize;
+}
+
+/*
+ *  Get the previous window size, ramp it up (x 4 below max/16, else x 2),
+ *  and return it as the new window size, capped at max.
+ */
+static unsigned long get_next_ra_size(struct file_ra_state *ra,
+						unsigned long max)
+{
+	unsigned long cur = ra->size;
+	unsigned long newsize;
+
+	if (cur < max / 16)
+		newsize = 4 * cur;
+	else
+		newsize = 2 * cur;
+
+	return min(newsize, max);
+}
+
+/*
+ * On-demand readahead design.
+ *
+ * The fields in struct file_ra_state represent the most-recently-executed
+ * readahead attempt:
+ *
+ *                        |<----- async_size ---------|
+ *     |------------------- size -------------------->|
+ *     |==================#===========================|
+ *     ^start             ^page marked with PG_readahead
+ *
+ * To overlap application thinking time and disk I/O time, we do
+ * `readahead pipelining': Do not wait until the application consumed all
+ * readahead pages and stalled on the missing page at readahead_index;
+ * Instead, submit an asynchronous readahead I/O as soon as there are
+ * only async_size pages left in the readahead window. Normally async_size
+ * will be equal to size, for maximum pipelining.
+ *
+ * In interleaved sequential reads, concurrent streams on the same fd can
+ * be invalidating each other's readahead state. So we flag the new readahead
+ * page at (start+size-async_size) with PG_readahead, and use it as readahead
+ * indicator. The flag won't be set on already cached pages, to avoid the
+ * readahead-for-nothing fuss, saving pointless page cache lookups.
+ *
+ * prev_pos tracks the last visited byte in the _previous_ read request.
+ * It should be maintained by the caller, and will be used for detecting
+ * small random reads. Note that the readahead algorithm checks loosely
+ * for sequential patterns. Hence interleaved reads might be served as
+ * sequential ones.
+ *
+ * There is a special-case: if the first page which the application tries to
+ * read happens to be the first page of the file, it is assumed that a linear
+ * read is about to happen and the window is immediately set to the initial size
+ * based on I/O request size and the max_readahead.
+ *
+ * The code ramps up the readahead size aggressively at first, but slows down as
+ * it approaches max_readahead.
+ */
+
+/*
+ * Count contiguously cached pages from @offset-1 to @offset-@max,
+ * this count is a conservative estimation of
+ * 	- length of the sequential read sequence, or
+ * 	- thrashing threshold in memory tight systems
+ */
+static pgoff_t count_history_pages(struct address_space *mapping,
+				   pgoff_t offset, unsigned long max)
+{
+	pgoff_t head;
+
+	rcu_read_lock();
+	head = page_cache_prev_hole(mapping, offset - 1, max);
+	rcu_read_unlock();
+
+	return offset - 1 - head;
+}
+
+/*
+ * page cache context based read-ahead
+ */
+static int try_context_readahead(struct address_space *mapping,
+				 struct file_ra_state *ra,
+				 pgoff_t offset,
+				 unsigned long req_size,
+				 unsigned long max)
+{
+	pgoff_t size;
+
+	size = count_history_pages(mapping, offset, max);
+
+	/*
+	 * not enough history pages:
+	 * it could be a random read
+	 */
+	if (size <= req_size)
+		return 0;
+
+	/*
+	 * starts from beginning of file:
+	 * it is a strong indication of long-run stream (or whole-file-read)
+	 */
+	if (size >= offset)
+		size *= 2;
+
+	ra->start = offset;
+	ra->size = min(size + req_size, max);
+	ra->async_size = 1;
+
+	return 1;	/* readahead state was set up; 0 above means it was left untouched */
+}
+
+/*
+ * A minimal readahead algorithm for trivial sequential/random reads.
+ */
+static unsigned long
+ondemand_readahead(struct address_space *mapping,
+		   struct file_ra_state *ra, struct file *filp,
+		   bool hit_readahead_marker, pgoff_t offset,
+		   unsigned long req_size)
+{
+	unsigned long max = max_sane_readahead(ra->ra_pages);
+	pgoff_t prev_offset;
+
+	/*
+	 * start of file
+	 */
+	if (!offset)
+		goto initial_readahead;
+
+	/*
+	 * It's the expected callback offset, assume sequential access.
+	 * Ramp up sizes, and push forward the readahead window.
+	 */
+	if ((offset == (ra->start + ra->size - ra->async_size) ||
+	     offset == (ra->start + ra->size))) {
+		ra->start += ra->size;
+		ra->size = get_next_ra_size(ra, max);
+		ra->async_size = ra->size;
+		goto readit;
+	}
+
+	/*
+	 * Hit a marked page without valid readahead state.
+	 * E.g. interleaved reads.
+	 * Query the pagecache for async_size, which normally equals the
+	 * readahead size. Ramp it up and use it as the new readahead size.
+	 */
+	if (hit_readahead_marker) {
+		pgoff_t start;
+
+		rcu_read_lock();
+		start = page_cache_next_hole(mapping, offset + 1, max);
+		rcu_read_unlock();
+
+		if (!start || start - offset > max)
+			return 0;
+
+		ra->start = start;
+		ra->size = start - offset;	/* old async_size */
+		ra->size += req_size;
+		ra->size = get_next_ra_size(ra, max);
+		ra->async_size = ra->size;
+		goto readit;
+	}
+
+	/*
+	 * oversize read
+	 */
+	if (req_size > max)
+		goto initial_readahead;
+
+	/*
+	 * sequential cache miss
+	 * trivial case: (offset - prev_offset) == 1
+	 * unaligned reads: (offset - prev_offset) == 0
+	 */
+	prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
+	if (offset - prev_offset <= 1UL)
+		goto initial_readahead;
+
+	/*
+	 * Query the page cache and look for the traces (cached history pages)
+	 * that a sequential stream would leave behind.
+	 */
+	if (try_context_readahead(mapping, ra, offset, req_size, max))
+		goto readit;
+
+	/*
+	 * standalone, small random read
+	 * Read as is, and do not pollute the readahead state.
+	 */
+	return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
+
+initial_readahead:
+	ra->start = offset;
+	ra->size = get_init_ra_size(req_size, max);
+	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
+
+readit:
+	/*
+	 * Will this read hit the readahead marker made by itself?
+	 * If so, trigger the readahead marker hit now, and merge
+	 * the resulting next readahead window into the current one.
+	 */
+	if (offset == ra->start && ra->size == ra->async_size) {
+		ra->async_size = get_next_ra_size(ra, max);
+		ra->size += ra->async_size;
+	}
+
+	return ra_submit(ra, mapping, filp);
+}
+
+/**
+ * page_cache_sync_readahead - generic file readahead
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @offset: start offset into @mapping, in pagecache page-sized units
+ * @req_size: hint: total size of the read which the caller is performing in
+ *            pagecache pages
+ *
+ * page_cache_sync_readahead() should be called when a cache miss happened:
+ * it will submit the read.  The readahead logic may decide to piggyback more
+ * pages onto the read request if access patterns suggest it will improve
+ * performance.
+ */
+void page_cache_sync_readahead(struct address_space *mapping,
+			       struct file_ra_state *ra, struct file *filp,
+			       pgoff_t offset, unsigned long req_size)
+{
+	/* no read-ahead */
+	if (!ra->ra_pages)
+		return;
+
+	/* FMODE_RANDOM: read exactly what was asked for, without the on-demand logic */
+	if (filp && (filp->f_mode & FMODE_RANDOM)) {
+		force_page_cache_readahead(mapping, filp, offset, req_size);
+		return;
+	}
+
+	/* do read-ahead */
+	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
+}
+EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
+
+/**
+ * page_cache_async_readahead - file readahead for marked pages
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @filp: passed on to ->readpage() and ->readpages()
+ * @page: the page at @offset which has the PG_readahead flag set
+ * @offset: start offset into @mapping, in pagecache page-sized units
+ * @req_size: hint: total size of the read which the caller is performing in
+ *            pagecache pages
+ *
+ * page_cache_async_readahead() should be called when a page is used which
+ * has the PG_readahead flag; this is a marker to suggest that the application
+ *
has used up enough of the readahead window that we should start pulling in
+ * more pages.
+ */
+void
+page_cache_async_readahead(struct address_space *mapping,
+			   struct file_ra_state *ra, struct file *filp,
+			   struct page *page, pgoff_t offset,
+			   unsigned long req_size)
+{
+	/* no read-ahead */
+	if (!ra->ra_pages)
+		return;
+
+	/*
+	 * PG_readahead shares its bit with PG_reclaim: skip pages under writeback.
+	 */
+	if (PageWriteback(page))
+		return;
+
+	ClearPageReadahead(page);
+
+	/*
+	 * Defer asynchronous read-ahead on IO congestion.
+	 */
+	if (bdi_read_congested(inode_to_bdi(mapping->host)))
+		return;
+
+	/* do read-ahead */
+	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
+}
+EXPORT_SYMBOL_GPL(page_cache_async_readahead);
+
+static ssize_t
+do_readahead(struct address_space *mapping, struct file *filp,
+	     pgoff_t index, unsigned long nr)
+{
+	if (!mapping || !mapping->a_ops)
+		return -EINVAL;
+
+	return force_page_cache_readahead(mapping, filp, index, nr);
+}
+
+SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
+{
+	ssize_t ret;
+	struct fd f;
+
+	ret = -EBADF;	/* returned for a bad fd or one not opened for reading */
+	f = fdget(fd);
+	if (f.file) {
+		if (f.file->f_mode & FMODE_READ) {
+			struct address_space *mapping = f.file->f_mapping;
+			pgoff_t start = offset >> PAGE_CACHE_SHIFT;	/* first page index */
+			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;	/* last page index */
+			unsigned long len = end - start + 1;
+			ret = do_readahead(mapping, f.file, start, len);
+		}
+		fdput(f);
+	}
+	return ret;
+}
diff --git a/kernel/mm/rmap.c b/kernel/mm/rmap.c
new file mode 100644
index 000000000..24dd3f9fe
--- /dev/null
+++ b/kernel/mm/rmap.c
@@ -0,0 +1,1599 @@
+/*
+ * mm/rmap.c - physical to virtual reverse mappings
+ *
+ * Copyright 2001, Rik van Riel
+ * Released under the General Public License (GPL).
+ *
+ * Simple, low overhead reverse mapping scheme.
+ * Please try to keep this thing as modular as possible.
+ *
+ * Provides methods for unmapping each kind of mapped page:
+ * the anon methods track anonymous pages, and
+ * the file methods track pages belonging to an inode.
+ *
+ * Original design by Rik van Riel 2001
+ * File methods by Dave McCracken 2003, 2004
+ * Anonymous methods by Andrea Arcangeli 2004
+ * Contributions by Hugh Dickins 2003, 2004
+ */
+
+/*
+ * Lock ordering in mm (NOTE(review): any nesting indentation appears flattened in this copy):
+ *
+ * inode->i_mutex (while writing or truncating, not reading or faulting)
+ * mm->mmap_sem
+ * page->flags PG_locked (lock_page)
+ * mapping->i_mmap_rwsem
+ * anon_vma->rwsem
+ * mm->page_table_lock or pte_lock
+ * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ * swap_lock (in swap_duplicate, swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
+ * mapping->private_lock (in __set_page_dirty_buffers)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ * sb_lock (within inode_lock in fs/fs-writeback.c)
+ * mapping->tree_lock (widely used, in set_page_dirty,
+ * in arch-dependent flush_dcache_mmap_lock,
+ * within bdi.wb->list_lock in __sync_single_inode)
+ *
+ * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * ->tasklist_lock
+ * pte map lock
+ */
+
+#include	/* NOTE(review): header names are missing from the #include lines in this copy */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "internal.h"
+
+static struct kmem_cache *anon_vma_cachep;
+static struct kmem_cache *anon_vma_chain_cachep;
+
+static inline struct anon_vma *anon_vma_alloc(void)
+{
+	struct anon_vma *anon_vma;
+
+	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+	if (anon_vma) {
+		atomic_set(&anon_vma->refcount, 1);
+		anon_vma->degree = 1;	/* Reference for first vma */
+		anon_vma->parent = anon_vma;
+		/*
+		 * Initialise the anon_vma root to point to itself. If called
+		 * from fork, the root will be reset to the parent's anon_vma.
+		 */
+		anon_vma->root = anon_vma;
+	}
+
+	return anon_vma;	/* NULL when the slab allocation failed */
+}
+
+static inline void anon_vma_free(struct anon_vma *anon_vma)
+{
+	VM_BUG_ON(atomic_read(&anon_vma->refcount));	/* refcount must already be zero */
+
+	/*
+	 * Synchronize against page_lock_anon_vma_read() such that
+	 * we can safely hold the lock without the anon_vma getting
+	 * freed.
+	 *
+	 * Relies on the full mb implied by the atomic_dec_and_test() from
+	 * put_anon_vma() against the acquire barrier implied by
+	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
+	 *
+	 * page_lock_anon_vma_read()	VS	put_anon_vma()
+	 *   down_read_trylock()		  atomic_dec_and_test()
+	 *   LOCK				  MB
+	 *   atomic_read()			  rwsem_is_locked()
+	 *
+	 * LOCK should suffice since the actual taking of the lock must
+	 * happen _before_ what follows.
+	 */
+	might_sleep();
+	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
+		anon_vma_lock_write(anon_vma);	/* lock and unlock purely to wait out current holders */
+		anon_vma_unlock_write(anon_vma);
+	}
+
+	kmem_cache_free(anon_vma_cachep, anon_vma);
+}
+
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
+{
+	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
+}
+
+static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
+{
+	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
+}
+
+static void anon_vma_chain_link(struct vm_area_struct *vma,
+				struct anon_vma_chain *avc,
+				struct anon_vma *anon_vma)
+{
+	avc->vma = vma;
+	avc->anon_vma = anon_vma;
+	list_add(&avc->same_vma, &vma->anon_vma_chain);	/* onto the vma's chain list */
+	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);	/* and into the anon_vma's interval tree */
+}
+
+/**
+ * anon_vma_prepare - attach an anon_vma to a memory region
+ * @vma: the memory region in question
+ *
+ * This makes sure the memory mapping described by 'vma' has
+ * an 'anon_vma' attached to it, so that we can associate the
+ * anonymous pages mapped into it with that anon_vma.
+ *
+ * The common case will be that we already have one, but if
+ * not we either need to find an adjacent mapping that we
+ * can re-use the anon_vma from (very common when the only
+ * reason for splitting a vma has been mprotect()), or we
+ * allocate a new one.
+ *
+ * Anon-vma allocations are very subtle, because we may have
+ * optimistically looked up an anon_vma in page_lock_anon_vma_read()
+ * and that may actually touch the spinlock even in the newly
+ * allocated vma (it depends on RCU to make sure that the
+ * anon_vma isn't actually destroyed).
+ *
+ * As a result, we need to do proper anon_vma locking even
+ * for the new allocation. At the same time, we do not want
+ * to do any locking for the common case of already having
+ * an anon_vma.
+ *
+ * This must be called with the mmap_sem held for reading.
+ */
+int anon_vma_prepare(struct vm_area_struct *vma)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	struct anon_vma_chain *avc;
+
+	might_sleep();
+	if (unlikely(!anon_vma)) {	/* slow path: nothing attached yet */
+		struct mm_struct *mm = vma->vm_mm;
+		struct anon_vma *allocated;
+
+		avc = anon_vma_chain_alloc(GFP_KERNEL);
+		if (!avc)
+			goto out_enomem;
+
+		anon_vma = find_mergeable_anon_vma(vma);
+		allocated = NULL;	/* stays NULL when an existing anon_vma is reused */
+		if (!anon_vma) {
+			anon_vma = anon_vma_alloc();
+			if (unlikely(!anon_vma))
+				goto out_enomem_free_avc;
+			allocated = anon_vma;
+		}
+
+		anon_vma_lock_write(anon_vma);
+		/*
+		 * page_table_lock to protect against threads: vma->anon_vma
+		 * is re-checked under it, and only the caller that still
+		 * finds it NULL installs its anon_vma and chain link.
+		 */
+		spin_lock(&mm->page_table_lock);
+		if (likely(!vma->anon_vma)) {
+			vma->anon_vma = anon_vma;
+			anon_vma_chain_link(vma, avc, anon_vma);
+			/* vma reference or self-parent link for new root */
+			anon_vma->degree++;
+			allocated = NULL;	/* now owned by the vma, must not be dropped below */
+			avc = NULL;
+		}
+		spin_unlock(&mm->page_table_lock);
+		anon_vma_unlock_write(anon_vma);
+
+		if (unlikely(allocated))	/* not installed: drop the unused anon_vma */
+			put_anon_vma(allocated);
+		if (unlikely(avc))	/* not installed: free the unused chain entry */
+			anon_vma_chain_free(avc);
+	}
+	return 0;	/* also the result when an anon_vma was already attached */
+
+ out_enomem_free_avc:
+	anon_vma_chain_free(avc);
+ out_enomem:
+	return -ENOMEM;
+}
+
+/*
+ * This is a useful
helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single mutex_lock for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
+{
+	struct anon_vma *new_root = anon_vma->root;
+	if (new_root != root) {
+		if (WARN_ON_ONCE(root))	/* a different root was already held: warn, then release it */
+			up_write(&root->rwsem);
+		root = new_root;
+		down_write(&root->rwsem);	/* the root lock is an rwsem, taken for write */
+	}
+	return root;	/* root whose rwsem is now write-held by the caller */
+}
+
+static inline void unlock_anon_vma_root(struct anon_vma *root)
+{
+	if (root)	/* NULL means nothing was locked */
+		up_write(&root->rwsem);
+}
+
+/*
+ * Attach the anon_vmas from src to dst.
+ * Returns 0 on success, -ENOMEM on failure.
+ *
+ * If dst->anon_vma is NULL this function tries to find and reuse existing
+ * anon_vma which has no vmas and only one child anon_vma. This prevents
+ * degradation of anon_vma hierarchy to endless linear chain in case of
+ * constantly forking task. On the other hand, an anon_vma with more than one
+ * child isn't reused even if there was no alive vma, thus rmap walker has a
+ * good chance of avoiding scanning the whole hierarchy when it searches where
+ * page is mapped.
+ *
+ * On failure the links already added to dst are torn down again with
+ * unlink_anon_vmas() and dst->anon_vma is left NULL.
+ */
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+{
+	struct anon_vma_chain *avc, *pavc;
+	struct anon_vma *root = NULL;
+
+	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
+		struct anon_vma *anon_vma;
+
+		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);	/* first try while the root lock may be held */
+		if (unlikely(!avc)) {
+			unlock_anon_vma_root(root);	/* release the lock before the GFP_KERNEL retry */
+			root = NULL;
+			avc = anon_vma_chain_alloc(GFP_KERNEL);
+			if (!avc)
+				goto enomem_failure;
+		}
+		anon_vma = pavc->anon_vma;
+		root = lock_anon_vma_root(root, anon_vma);
+		anon_vma_chain_link(dst, avc, anon_vma);
+
+		/*
+		 * Reuse existing anon_vma if its degree lower than two,
+		 * that means it has no vma and only one anon_vma child.
+		 *
+		 * Do not chose parent anon_vma, otherwise first child
+		 * will always reuse it. Root anon_vma is never reused:
+		 * it has self-parent reference and at least one child.
+		 */
+		if (!dst->anon_vma && anon_vma != src->anon_vma &&
+				anon_vma->degree < 2)
+			dst->anon_vma = anon_vma;
+	}
+	if (dst->anon_vma)
+		dst->anon_vma->degree++;	/* account dst as a vma of the anon_vma it uses */
+	unlock_anon_vma_root(root);
+	return 0;
+
+ enomem_failure:
+	/*
+	 * dst->anon_vma is dropped here otherwise its degree can be incorrectly
+	 * decremented in unlink_anon_vmas().
+	 * We can safely do this because callers of anon_vma_clone() don't care
+	 * about dst->anon_vma if anon_vma_clone() failed.
+	 */
+	dst->anon_vma = NULL;
+	unlink_anon_vmas(dst);
+	return -ENOMEM;
+}
+
+/*
+ * Attach vma to its own anon_vma, as well as to the anon_vmas that
+ * the corresponding VMA in the parent process is attached to.
+ * Returns 0 on success, non-zero on failure.
+ *
+ * The failure values visible here are the error from anon_vma_clone()
+ * and -ENOMEM when allocating the new anon_vma or its chain entry fails.
+ */
+int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
+{
+	struct anon_vma_chain *avc;
+	struct anon_vma *anon_vma;
+	int error;
+
+	/* Don't bother if the parent process has no anon_vma here. */
+	if (!pvma->anon_vma)
+		return 0;
+
+	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
+	vma->anon_vma = NULL;
+
+	/*
+	 * First, attach the new VMA to the parent VMA's anon_vmas,
+	 * so rmap can find non-COWed pages in child processes.
+	 */
+	error = anon_vma_clone(vma, pvma);
+	if (error)
+		return error;
+
+	/* An existing anon_vma has been reused, all done then. */
+	if (vma->anon_vma)
+		return 0;
+
+	/* Then add our own anon_vma. */
+	anon_vma = anon_vma_alloc();
+	if (!anon_vma)
+		goto out_error;
+	avc = anon_vma_chain_alloc(GFP_KERNEL);
+	if (!avc)
+		goto out_error_free_anon_vma;
+
+	/*
+	 * The root anon_vma's rwsem is the lock actually used when we
+	 * lock any of the anon_vmas in this anon_vma tree.
+	 */
+	anon_vma->root = pvma->anon_vma->root;
+	anon_vma->parent = pvma->anon_vma;
+	/*
+	 * With refcounts, an anon_vma can stay around longer than the
+	 * process it belongs to. The root anon_vma needs to be pinned until
+	 * this anon_vma is freed, because the lock lives in the root.
+	 */
+	get_anon_vma(anon_vma->root);
+	/* Mark this anon_vma as the one where our new (COWed) pages go. */
+	vma->anon_vma = anon_vma;
+	anon_vma_lock_write(anon_vma);
+	anon_vma_chain_link(vma, avc, anon_vma);
+	anon_vma->parent->degree++;	/* the parent gained a child anon_vma */
+	anon_vma_unlock_write(anon_vma);
+
+	return 0;
+
+ out_error_free_anon_vma:
+	put_anon_vma(anon_vma);
+ out_error:
+	unlink_anon_vmas(vma);	/* undo the links made by anon_vma_clone() */
+	return -ENOMEM;
+}
+
+void unlink_anon_vmas(struct vm_area_struct *vma)
+{
+	struct anon_vma_chain *avc, *next;
+	struct anon_vma *root = NULL;
+
+	/*
+	 * Unlink each anon_vma chained to the VMA.  This list is ordered
+	 * from newest to oldest, ensuring the root anon_vma gets freed last.
+	 */
+	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+		struct anon_vma *anon_vma = avc->anon_vma;
+
+		root = lock_anon_vma_root(root, anon_vma);
+		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
+
+		/*
+		 * Leave empty anon_vmas on the list - we'll need
+		 * to free them outside the lock.
+		 */
+		if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
+			anon_vma->parent->degree--;
+			continue;
+		}
+
+		list_del(&avc->same_vma);
+		anon_vma_chain_free(avc);
+	}
+	if (vma->anon_vma)
+		vma->anon_vma->degree--;	/* this vma no longer counts against its anon_vma */
+	unlock_anon_vma_root(root);
+
+	/*
+	 * Iterate the list once more, it now only contains empty and unlinked
+	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+	 * needing to write-acquire the anon_vma->root->rwsem.
+	 */
+	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+		struct anon_vma *anon_vma = avc->anon_vma;
+
+		BUG_ON(anon_vma->degree);
+		put_anon_vma(anon_vma);
+
+		list_del(&avc->same_vma);
+		anon_vma_chain_free(avc);
+	}
+}
+
+static void anon_vma_ctor(void *data)
+{
+	struct anon_vma *anon_vma = data;
+
+	init_rwsem(&anon_vma->rwsem);
+	atomic_set(&anon_vma->refcount, 0);	/* anon_vma_alloc() sets it to 1 on allocation */
+	anon_vma->rb_root = RB_ROOT;
+}
+
+void __init anon_vma_init(void)
+{
+	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
+	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
+}
+
+/*
+ * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
+ *
+ * Since there is no serialization what so ever against page_remove_rmap()
+ * the best this function can do is return a locked anon_vma that might
+ * have been relevant to this page.
+ *
+ * The page might have been remapped to a different anon_vma or the anon_vma
+ * returned may already be freed (and even reused).
+ *
+ * In case it was remapped to a different anon_vma, the new anon_vma will be a
+ * child of the old anon_vma, and the anon_vma lifetime rules will therefore
+ * ensure that any anon_vma obtained from the page will still be valid for as
+ * long as we observe page_mapped() [ hence all those page_mapped() tests ].
+ *
+ * All users of this function must be very careful when walking the anon_vma
+ * chain and verify that the page in question is indeed mapped in it
+ * [ something equivalent to page_mapped_in_vma() ].
+ *
+ * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
+ * that the anon_vma pointer from page->mapping is valid if there is a
+ * mapcount, we can dereference the anon_vma after observing those.
+ */
+struct anon_vma *page_get_anon_vma(struct page *page)
+{
+	struct anon_vma *anon_vma = NULL;
+	unsigned long anon_mapping;
+
+	rcu_read_lock();
+	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+		goto out;	/* mapping is not tagged as plain anon: return NULL */
+	if (!page_mapped(page))
+		goto out;
+
+	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);	/* strip the tag to get the pointer */
+	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+		anon_vma = NULL;	/* refcount already zero: do not revive it */
+		goto out;
+	}
+
+	/*
+	 * If this page is still mapped, then its anon_vma cannot have been
+	 * freed.  But if it has been unmapped, we have no security against the
+	 * anon_vma structure being freed and reused (for another anon_vma:
+	 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
+	 * above cannot corrupt).
+	 */
+	if (!page_mapped(page)) {
+		rcu_read_unlock();
+		put_anon_vma(anon_vma);	/* drop the reference taken above */
+		return NULL;
+	}
+out:
+	rcu_read_unlock();
+
+	return anon_vma;	/* NULL, or an anon_vma with its refcount raised */
+}
+
+/*
+ * Similar to page_get_anon_vma() except it locks the anon_vma.
+ *
+ * Its a little more complex as it tries to keep the fast path to a single
+ * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
+ * reference like with page_get_anon_vma() and then block on the rwsem.
+ *
+ * Returns NULL, or the anon_vma with its root rwsem held for read; the
+ * matching release in this file is page_unlock_anon_vma_read().
+ */
+struct anon_vma *page_lock_anon_vma_read(struct page *page)
+{
+	struct anon_vma *anon_vma = NULL;
+	struct anon_vma *root_anon_vma;
+	unsigned long anon_mapping;
+
+	rcu_read_lock();
+	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+		goto out;
+	if (!page_mapped(page))
+		goto out;
+
+	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+	root_anon_vma = READ_ONCE(anon_vma->root);
+	if (down_read_trylock(&root_anon_vma->rwsem)) {
+		/*
+		 * If the page is still mapped, then this anon_vma is still
+		 * its anon_vma, and holding the rwsem ensures that it will
+		 * not go away, see anon_vma_free().
+		 */
+		if (!page_mapped(page)) {
+			up_read(&root_anon_vma->rwsem);
+			anon_vma = NULL;
+		}
+		goto out;
+	}
+
+	/* trylock failed, we got to sleep */
+	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
+		anon_vma = NULL;
+		goto out;
+	}
+
+	if (!page_mapped(page)) {
+		rcu_read_unlock();
+		put_anon_vma(anon_vma);
+		return NULL;
+	}
+
+	/* we pinned the anon_vma, its safe to sleep */
+	rcu_read_unlock();
+	anon_vma_lock_read(anon_vma);
+
+	if (atomic_dec_and_test(&anon_vma->refcount)) {
+		/*
+		 * Oops, we held the last refcount, release the lock
+		 * and bail -- can't simply use put_anon_vma() because
+		 * we'll deadlock on the anon_vma_lock_write() recursion.
+		 */
+		anon_vma_unlock_read(anon_vma);
+		__put_anon_vma(anon_vma);
+		anon_vma = NULL;
+	}
+
+	return anon_vma;
+
+out:
+	rcu_read_unlock();
+	return anon_vma;
+}
+
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
+{
+	anon_vma_unlock_read(anon_vma);	/* counterpart of page_lock_anon_vma_read() */
+}
+
+/*
+ * At what user virtual address is page expected in @vma?
+ *
+ * Pure arithmetic on the page offset; the result is not range-checked here.
+ */
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
+{
+	pgoff_t pgoff = page_to_pgoff(page);
+	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+	unsigned long address = __vma_address(page, vma);
+
+	/* page should be within @vma mapping range */
+	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+
+	return address;
+}
+
+/*
+ * At what user virtual address is page expected in vma?
+ * Caller should check the page is actually part of the vma.
+ *
+ * Returns -EFAULT (as an unsigned long) when the page cannot belong to
+ * this vma or the computed address falls outside [vm_start, vm_end).
+ */
+unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+	unsigned long address;
+	if (PageAnon(page)) {
+		struct anon_vma *page__anon_vma = page_anon_vma(page);
+		/*
+		 * Note: swapoff's unuse_vma() is more efficient with this
+		 * check, and needs it to match anon_vma when KSM is active.
+		 */
+		if (!vma->anon_vma || !page__anon_vma ||
+		    vma->anon_vma->root != page__anon_vma->root)
+			return -EFAULT;
+	} else if (page->mapping) {
+		if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
+			return -EFAULT;
+	} else
+		return -EFAULT;
+	address = __vma_address(page, vma);
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+		return -EFAULT;
+	return address;
+}
+
+pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd = NULL;	/* returned as-is when an upper level is not present */
+	pmd_t pmde;
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	/*
+	 * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+	 * without holding anon_vma lock for write.  So when looking for a
+	 * genuine pmde (in which to find pte), test present and !THP together.
+	 * Both tests are made on the local copy pmde taken just below.
+	 */
+	pmde = *pmd;
+	barrier();
+	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+		pmd = NULL;
+out:
+	return pmd;
+}
+
+/*
+ * Check that @page is mapped at @address into @mm.
+ *
+ * If @sync is false, page_check_address may perform a racy check to avoid
+ * the page table lock when the pte is not present (helpful when reclaiming
+ * highly shared pages).
+ *
+ * On success returns with pte mapped and locked.
+ */
+pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
+			  unsigned long address, spinlock_t **ptlp, int sync)
+{
+	pmd_t *pmd;
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	if (unlikely(PageHuge(page))) {
+		/* when pud is not present, pte will be NULL */
+		pte = huge_pte_offset(mm, address);
+		if (!pte)
+			return NULL;
+
+		ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+		goto check;	/* skips the pmd walk and the unlocked quick check */
+	}
+
+	pmd = mm_find_pmd(mm, address);
+	if (!pmd)
+		return NULL;
+
+	pte = pte_offset_map(pmd, address);
+	/* Make a quick check before getting the lock */
+	if (!sync && !pte_present(*pte)) {
+		pte_unmap(pte);
+		return NULL;
+	}
+
+	ptl = pte_lockptr(mm, pmd);
+check:
+	spin_lock(ptl);
+	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
+		*ptlp = ptl;	/* success: lock stays held, caller must unlock through *ptlp */
+		return pte;
+	}
+	pte_unmap_unlock(pte, ptl);	/* pte absent or maps another pfn; *ptlp is left untouched */
+	return NULL;
+}
+
+/**
+ * page_mapped_in_vma - check whether a page is really mapped in a VMA
+ * @page: the page to test
+ * @vma: the VMA to test
+ *
+ * Returns 1 if the page is mapped into the page tables of the VMA, 0
+ * if the page is not mapped into the page tables of this VMA.  Only
+ * valid for normal file or anonymous VMAs.
+ */
+int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+	unsigned long address;
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	address = __vma_address(page, vma);
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+		return 0;
+	pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
+	if (!pte)			/* the page is not in this mm */
+		return 0;
+	pte_unmap_unlock(pte, ptl);	/* only the lookup result matters here */
+
+	return 1;
+}
+
+struct page_referenced_arg {
+	int mapcount;	/* starts at page_mapcount(); decremented per mapping visited */
+	int referenced;	/* number of mappings found referenced */
+	unsigned long vm_flags;	/* accumulated vm_flags reported back to the caller */
+	struct mem_cgroup *memcg;	/* cgroup filter for the walk, may be NULL */
+};
+/*
+ * arg: page_referenced_arg will be passed
+ *
+ * Returns SWAP_AGAIN to continue the walk; SWAP_FAIL (VM_LOCKED vma) and
+ * SWAP_SUCCESS (mapcount exhausted) are both used to break the loop.
+ */
+static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+			unsigned long address, void *arg)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+	int referenced = 0;
+	struct page_referenced_arg *pra = arg;
+
+	if (unlikely(PageTransHuge(page))) {
+		pmd_t *pmd;
+
+		/*
+		 * rmap might return false positives; we must filter
+		 * these out using page_check_address_pmd().
+		 */
+		pmd = page_check_address_pmd(page, mm, address,
+					     PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+		if (!pmd)
+			return SWAP_AGAIN;
+
+		if (vma->vm_flags & VM_LOCKED) {
+			spin_unlock(ptl);
+			pra->vm_flags |= VM_LOCKED;
+			return SWAP_FAIL; /* To break the loop */
+		}
+
+		/* go ahead even if the pmd is pmd_trans_splitting() */
+		if (pmdp_clear_flush_young_notify(vma, address, pmd))
+			referenced++;
+		spin_unlock(ptl);
+	} else {
+		pte_t *pte;
+
+		/*
+		 * rmap might return false positives; we must filter
+		 * these out using page_check_address().
+		 */
+		pte = page_check_address(page, mm, address, &ptl, 0);
+		if (!pte)
+			return SWAP_AGAIN;
+
+		if (vma->vm_flags & VM_LOCKED) {
+			pte_unmap_unlock(pte, ptl);
+			pra->vm_flags |= VM_LOCKED;
+			return SWAP_FAIL; /* To break the loop */
+		}
+
+		if (ptep_clear_flush_young_notify(vma, address, pte)) {
+			/*
+			 * Don't treat a reference through a sequentially read
+			 * mapping as such.  If the page has been used in
+			 * another mapping, we will catch it; if this other
+			 * mapping is already gone, the unmap path will have
+			 * set PG_referenced or activated the page.
+			 */
+			if (likely(!(vma->vm_flags & VM_SEQ_READ)))
+				referenced++;
+		}
+		pte_unmap_unlock(pte, ptl);
+	}
+
+	if (referenced) {
+		pra->referenced++;
+		pra->vm_flags |= vma->vm_flags;
+	}
+
+	pra->mapcount--;	/* one more mapping of the page accounted for */
+	if (!pra->mapcount)
+		return SWAP_SUCCESS; /* To break the loop */
+
+	return SWAP_AGAIN;
+}
+
+static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
+{
+	struct page_referenced_arg *pra = arg;
+	struct mem_cgroup *memcg = pra->memcg;
+
+	if (!mm_match_cgroup(vma->vm_mm, memcg))
+		return true;	/* vma's mm is outside the target cgroup */
+
+	return false;
+}
+
+/**
+ * page_referenced - test if the page was referenced
+ * @page: the page to test
+ * @is_locked: caller holds lock on the page
+ * @memcg: target memory cgroup
+ * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
+ *
+ * Quick test_and_clear_referenced for all mappings to a page,
+ * returns the number of ptes which referenced the page.
+ */
+int page_referenced(struct page *page,
+		    int is_locked,
+		    struct mem_cgroup *memcg,
+		    unsigned long *vm_flags)
+{
+	int ret;	/* NOTE(review): assigned from rmap_walk() below but never read */
+	int we_locked = 0;
+	struct page_referenced_arg pra = {
+		.mapcount = page_mapcount(page),
+		.memcg = memcg,
+	};
+	struct rmap_walk_control rwc = {
+		.rmap_one = page_referenced_one,
+		.arg = (void *)&pra,
+		.anon_lock = page_lock_anon_vma_read,
+	};
+
+	*vm_flags = 0;
+	if (!page_mapped(page))
+		return 0;
+
+	if (!page_rmapping(page))
+		return 0;
+
+	if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+		we_locked = trylock_page(page);
+		if (!we_locked)
+			return 1;	/* page lock not available: report it as referenced */
+	}
+
+	/*
+	 * If we are reclaiming on behalf of a cgroup, skip
+	 * counting on behalf of references from different
+	 * cgroups
+	 */
+	if (memcg) {
+		rwc.invalid_vma = invalid_page_referenced_vma;
+	}
+
+	ret = rmap_walk(page, &rwc);
+	*vm_flags = pra.vm_flags;
+
+	if (we_locked)
+		unlock_page(page);
+
+	return pra.referenced;
+}
+
+static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
+			    unsigned long address, void *arg)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t *pte;
+	spinlock_t *ptl;
+	int ret = 0;
+	int *cleaned = arg;	/* counter owned by page_mkclean() */
+
+	pte = page_check_address(page, mm, address, &ptl, 1);
+	if (!pte)
+		goto out;
+
+	if (pte_dirty(*pte) || pte_write(*pte)) {
+		pte_t entry;
+
+		flush_cache_page(vma, address, pte_pfn(*pte));
+		entry = ptep_clear_flush(vma, address, pte);
+		entry = pte_wrprotect(entry);
+		entry = pte_mkclean(entry);
+		set_pte_at(mm, address, pte, entry);	/* reinstall write-protected and clean */
+		ret = 1;
+	}
+
+	pte_unmap_unlock(pte, ptl);
+
+	if (ret) {
+		mmu_notifier_invalidate_page(mm, address);
+		(*cleaned)++;
+	}
+out:
+	return SWAP_AGAIN;	/* always continue the walk */
+}
+
+static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
+{
+	if (vma->vm_flags & VM_SHARED)
+		return false;
+
+	return true;	/* vmas without VM_SHARED are skipped by the walk */
+}
+
+int page_mkclean(struct page *page)
+{
+	int cleaned = 0;
+	struct address_space *mapping;
+	struct rmap_walk_control rwc = {
+		.arg = (void *)&cleaned,
+		.rmap_one = page_mkclean_one,
+		.invalid_vma = invalid_mkclean_vma,
+	};
+
+	BUG_ON(!PageLocked(page));
+
+	if (!page_mapped(page))
+		return 0;
+
+	mapping = page_mapping(page);
+	if (!mapping)
+		return 0;
+
+	rmap_walk(page, &rwc);
+
+	return cleaned;	/* number of ptes that were write-protected and cleaned */
+}
+EXPORT_SYMBOL_GPL(page_mkclean);
+
+/**
+ * page_move_anon_rmap - move a page to our anon_vma
+ * @page:	the page to move to our anon_vma
+ * @vma:	the vma the page belongs to
+ * @address:	the user virtual address mapped
+ *
+ * When a page belongs exclusively to one process after a COW event,
+ * that page can be moved into the anon_vma that belongs to just that
+ * process, so the rmap code will not search the parent or sibling
+ * processes.
+ */
+void page_move_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_VMA(!anon_vma, vma);
+	VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
+
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;	/* tag the pointer as an anon mapping */
+	page->mapping = (struct address_space *) anon_vma;
+}
+
+/**
+ * __page_set_anon_rmap - set up new anonymous rmap
+ * @page:	Page to add to rmap
+ * @vma:	VM area to add page to.
+ * @address:	User virtual address of the mapping
+ * @exclusive:	the page is exclusively owned by the current process
+ *
+ * Does nothing if the page is already PageAnon.
+ */
+static void __page_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+
+	BUG_ON(!anon_vma);
+
+	if (PageAnon(page))
+		return;
+
+	/*
+	 * If the page isn't exclusively mapped into this vma,
+	 * we must use the _oldest_ possible anon_vma for the
+	 * page mapping!
+	 */
+	if (!exclusive)
+		anon_vma = anon_vma->root;
+
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+	page->index = linear_page_index(vma, address);
+}
+
+/**
+ * __page_check_anon_rmap - sanity check anonymous rmap addition
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @address:	the user virtual address mapped
+ *
+ * Compiles to an empty function unless CONFIG_DEBUG_VM is set.
+ */
+static void __page_check_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+#ifdef CONFIG_DEBUG_VM
+	/*
+	 * The page's anon-rmap details (mapping and index) are guaranteed to
+	 * be set up correctly at this point.
+	 *
+	 * We have exclusion against page_add_anon_rmap because the caller
+	 * always holds the page locked, except if called from page_dup_rmap,
+	 * in which case the page is already known to be setup.
+	 *
+	 * We have exclusion against page_add_new_anon_rmap because those pages
+	 * are initially only visible via the pagetables, and the pte is locked
+	 * over the call to page_add_new_anon_rmap.
+	 */
+	BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
+	BUG_ON(page->index != linear_page_index(vma, address));
+#endif
+}
+
+/**
+ * page_add_anon_rmap - add pte mapping to an anonymous page
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @address:	the user virtual address mapped
+ *
+ * The caller needs to hold the pte lock, and the page must be locked in
+ * the anon_vma case: to serialize mapping,index checking after setting,
+ * and to ensure that PageAnon is not being upgraded racily to PageKsm
+ * (but PageKsm is never downgraded to PageAnon).
+ */
+void page_add_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	do_page_add_anon_rmap(page, vma, address, 0);	/* 0: not exclusive */
+}
+
+/*
+ * Special version of the above for do_swap_page, which often runs
+ * into pages that are exclusively owned by the current process.
+ * Everybody else should continue to use page_add_anon_rmap above.
+ */
+void do_page_add_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+	int first = atomic_inc_and_test(&page->_mapcount);	/* true when this is the page's first mapping */
+	if (first) {
+		/*
+		 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
+		 * these counters are not modified in interrupt context, and
+		 * pte lock(a spinlock) is held, which implies preemption
+		 * disabled.
+		 */
+		if (PageTransHuge(page))
+			__inc_zone_page_state(page,
+					      NR_ANON_TRANSPARENT_HUGEPAGES);
+		__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+				hpage_nr_pages(page));
+	}
+	if (unlikely(PageKsm(page)))
+		return;	/* KSM pages: mapping/index are neither set nor checked here */
+
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	/* address might be in next vma when migration races vma_adjust */
+	if (first)
+		__page_set_anon_rmap(page, vma, address, exclusive);
+	else
+		__page_check_anon_rmap(page, vma, address);
+}
+
+/**
+ * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @address:	the user virtual address mapped
+ *
+ * Same as page_add_anon_rmap but must only be called on *new* pages.
+ * This means the inc-and-test can be bypassed.
+ * Page does not have to be locked.
+ *
+ * The rmap is always set up as exclusive (last argument 1 below).
+ */
+void page_add_new_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address)
+{
+	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+	SetPageSwapBacked(page);
+	atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
+	if (PageTransHuge(page))
+		__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+	__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+			hpage_nr_pages(page));
+	__page_set_anon_rmap(page, vma, address, 1);
+}
+
+/**
+ * page_add_file_rmap - add pte mapping to a file page
+ * @page: the page to add the mapping to
+ *
+ * The caller needs to hold the pte lock.
+ */ +void page_add_file_rmap(struct page *page) +{ + struct mem_cgroup *memcg; + + memcg = mem_cgroup_begin_page_stat(page); + if (atomic_inc_and_test(&page->_mapcount)) { + __inc_zone_page_state(page, NR_FILE_MAPPED); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + } + mem_cgroup_end_page_stat(memcg); +} + +static void page_remove_file_rmap(struct page *page) +{ + struct mem_cgroup *memcg; + + memcg = mem_cgroup_begin_page_stat(page); + + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) + goto out; + + /* Hugepages are not counted in NR_FILE_MAPPED for now. */ + if (unlikely(PageHuge(page))) + goto out; + + /* + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption disabled. + */ + __dec_zone_page_state(page, NR_FILE_MAPPED); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); +out: + mem_cgroup_end_page_stat(memcg); +} + +/** + * page_remove_rmap - take down pte mapping from a page + * @page: page to remove mapping from + * + * The caller needs to hold the pte lock. + */ +void page_remove_rmap(struct page *page) +{ + if (!PageAnon(page)) { + page_remove_file_rmap(page); + return; + } + + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) + return; + + /* Hugepages are not counted in NR_ANON_PAGES for now. */ + if (unlikely(PageHuge(page))) + return; + + /* + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption disabled. 
+ */ + if (PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, + -hpage_nr_pages(page)); + + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); + + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_hot_cold_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. + */ +} + +/* + * @arg: enum ttu_flags will be passed to this argument + */ +static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + unsigned long address, void *arg) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte; + pte_t pteval; + spinlock_t *ptl; + int ret = SWAP_AGAIN; + enum ttu_flags flags = (enum ttu_flags)arg; + + pte = page_check_address(page, mm, address, &ptl, 0); + if (!pte) + goto out; + + /* + * If the page is mlock()d, we cannot swap it out. + * If it's recently referenced (perhaps page_referenced + * skipped over this mm) then we should reactivate it. + */ + if (!(flags & TTU_IGNORE_MLOCK)) { + if (vma->vm_flags & VM_LOCKED) + goto out_mlock; + + if (flags & TTU_MUNLOCK) + goto out_unmap; + } + if (!(flags & TTU_IGNORE_ACCESS)) { + if (ptep_clear_flush_young_notify(vma, address, pte)) { + ret = SWAP_FAIL; + goto out_unmap; + } + } + + /* Nuke the page table entry. */ + flush_cache_page(vma, address, page_to_pfn(page)); + pteval = ptep_clear_flush(vma, address, pte); + + /* Move the dirty bit to the physical page now the pte is gone. 
*/ + if (pte_dirty(pteval)) + set_page_dirty(page); + + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + + if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (!PageHuge(page)) { + if (PageAnon(page)) + dec_mm_counter(mm, MM_ANONPAGES); + else + dec_mm_counter(mm, MM_FILEPAGES); + } + set_pte_at(mm, address, pte, + swp_entry_to_pte(make_hwpoison_entry(page))); + } else if (pte_unused(pteval)) { + /* + * The guest indicated that the page content is of no + * interest anymore. Simply discard the pte, vmscan + * will take care of the rest. + */ + if (PageAnon(page)) + dec_mm_counter(mm, MM_ANONPAGES); + else + dec_mm_counter(mm, MM_FILEPAGES); + } else if (PageAnon(page)) { + swp_entry_t entry = { .val = page_private(page) }; + pte_t swp_pte; + + if (PageSwapCache(page)) { + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + if (swap_duplicate(entry) < 0) { + set_pte_at(mm, address, pte, pteval); + ret = SWAP_FAIL; + goto out_unmap; + } + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); + } else if (IS_ENABLED(CONFIG_MIGRATION)) { + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. 
+ */ + BUG_ON(!(flags & TTU_MIGRATION)); + entry = make_migration_entry(page, pte_write(pteval)); + } + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); + } else if (IS_ENABLED(CONFIG_MIGRATION) && + (flags & TTU_MIGRATION)) { + /* Establish migration entry for a file page */ + swp_entry_t entry; + entry = make_migration_entry(page, pte_write(pteval)); + set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + } else + dec_mm_counter(mm, MM_FILEPAGES); + + page_remove_rmap(page); + page_cache_release(page); + +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) + mmu_notifier_invalidate_page(mm, address); +out: + return ret; + +out_mlock: + pte_unmap_unlock(pte, ptl); + + + /* + * We need mmap_sem locking, Otherwise VM_LOCKED check makes + * unstable result and race. Plus, We can't wait here because + * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem. + * if trylock failed, the page remain in evictable lru and later + * vmscan could retry to move the page to unevictable lru if the + * page is actually mlocked. 
+ */ + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (vma->vm_flags & VM_LOCKED) { + mlock_vma_page(page); + ret = SWAP_MLOCK; + } + up_read(&vma->vm_mm->mmap_sem); + } + return ret; +} + +bool is_vma_temporary_stack(struct vm_area_struct *vma) +{ + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); + + if (!maybe_stack) + return false; + + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == + VM_STACK_INCOMPLETE_SETUP) + return true; + + return false; +} + +static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) +{ + return is_vma_temporary_stack(vma); +} + +static int page_not_mapped(struct page *page) +{ + return !page_mapped(page); +}; + +/** + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * @flags: action and flags + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold the page lock. + * Return values are: + * + * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a mapping, try again later + * SWAP_FAIL - the page is unswappable + * SWAP_MLOCK - page is mlocked. + */ +int try_to_unmap(struct page *page, enum ttu_flags flags) +{ + int ret; + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)flags, + .done = page_not_mapped, + .anon_lock = page_lock_anon_vma_read, + }; + + VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); + + /* + * During exec, a temporary VMA is setup and later moved. + * The VMA is moved under the anon_vma lock but not the + * page tables leading to a race where migration cannot + * find the migration ptes. Rather than increasing the + * locking requirements of exec(), migration skips + * temporary VMAs until after exec() completes. 
+	 */
+	if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
+		rwc.invalid_vma = invalid_migration_vma;
+
+	ret = rmap_walk(page, &rwc);
+	/* Nothing maps the page any more: report success unless it was mlocked */
+	if (ret != SWAP_MLOCK && !page_mapped(page))
+		ret = SWAP_SUCCESS;
+	return ret;
+}
+
+/**
+ * try_to_munlock - try to munlock a page
+ * @page: the page to be munlocked
+ *
+ * Called from munlock code.  Checks all of the VMAs mapping the page
+ * to make sure nobody else has this page mlocked. The page will be
+ * returned with PG_mlocked cleared if no other vmas have it mlocked.
+ *
+ * The check is an rmap walk that calls try_to_unmap_one() with
+ * TTU_MUNLOCK as its argument; the value returned is whatever the
+ * walk returned.  @page must be locked and must not be on the LRU.
+ *
+ * Return values are:
+ *
+ * SWAP_AGAIN	- no vma is holding page mlocked, or,
+ * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
+ * SWAP_FAIL	- page cannot be located at present
+ * SWAP_MLOCK	- page is now mlocked.
+ */
+int try_to_munlock(struct page *page)
+{
+	int ret;
+	struct rmap_walk_control rwc = {
+		.rmap_one = try_to_unmap_one,
+		.arg = (void *)TTU_MUNLOCK,
+		.done = page_not_mapped,
+		.anon_lock = page_lock_anon_vma_read,
+
+	};
+
+	VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
+
+	ret = rmap_walk(page, &rwc);
+	return ret;
+}
+/*
+ * Free @anon_vma; if it is not its own root, also drop one reference on
+ * the root and free the root when that count reaches zero.
+ */
+void __put_anon_vma(struct anon_vma *anon_vma)
+{
+	struct anon_vma *root = anon_vma->root;
+
+	anon_vma_free(anon_vma);
+	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
+		anon_vma_free(root);
+}
+/*
+ * Lock the anon_vma of @page for an rmap walk: through rwc->anon_lock when
+ * the walker supplied one, otherwise by read-locking page_anon_vma(page).
+ * Returns the locked anon_vma, or NULL when there is none to walk.
+ */
+static struct anon_vma *rmap_walk_anon_lock(struct page *page,
+					struct rmap_walk_control *rwc)
+{
+	struct anon_vma *anon_vma;
+
+	if (rwc->anon_lock)
+		return rwc->anon_lock(page);
+
+	/*
+	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
+	 * because that depends on page_mapped(); but not all its usages
+	 * are holding mmap_sem. Users without mmap_sem are required to
+	 * take a reference count to prevent the anon_vma disappearing
+	 */
+	anon_vma = page_anon_vma(page);
+	if (!anon_vma)
+		return NULL;
+
+	anon_vma_lock_read(anon_vma);
+	return anon_vma;
+}
+
+/*
+ * rmap_walk_anon - do something to anonymous page using the object-based
+ * rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the anon_vma struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write.  So, we won't recheck
+ * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
+ * LOCKED.
+ *
+ * VMAs rejected by rwc->invalid_vma are skipped.  The walk ends at the first
+ * rwc->rmap_one() result other than SWAP_AGAIN, or as soon as rwc->done()
+ * says so.  Returns that first other result, else SWAP_AGAIN (also when no
+ * anon_vma could be locked).
+ */
+static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
+{
+	struct anon_vma *anon_vma;
+	pgoff_t pgoff;
+	struct anon_vma_chain *avc;
+	int ret = SWAP_AGAIN;
+
+	anon_vma = rmap_walk_anon_lock(page, rwc);
+	if (!anon_vma)
+		return ret;
+
+	pgoff = page_to_pgoff(page);
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
+		struct vm_area_struct *vma = avc->vma;
+		unsigned long address = vma_address(page, vma);
+
+		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+			continue;
+
+		ret = rwc->rmap_one(page, vma, address, rwc->arg);
+		if (ret != SWAP_AGAIN)
+			break;
+		if (rwc->done && rwc->done(page))
+			break;
+	}
+	anon_vma_unlock_read(anon_vma);
+	return ret;
+}
+
+/*
+ * rmap_walk_file - do something to file page using the object-based rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write.  So, we won't recheck
+ * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
+ * LOCKED.
+ *
+ * The page must be locked.  The walk runs under i_mmap_lock_read() and follows
+ * the same skip/stop rules as rmap_walk_anon().  Returns SWAP_AGAIN when the
+ * page has no mapping or every rmap_one() call returned SWAP_AGAIN, otherwise
+ * the first other value rmap_one() returned.
+ */
+static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff;
+	struct vm_area_struct *vma;
+	int ret = SWAP_AGAIN;
+
+	/*
+	 * The page lock not only makes sure that page->mapping cannot
+	 * suddenly be NULLified by truncation, it makes sure that the
+	 * structure at mapping cannot be freed and reused yet,
+	 * so we can safely take mapping->i_mmap_rwsem.
+	 */
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+	if (!mapping)
+		return ret;
+
+	pgoff = page_to_pgoff(page);
+	i_mmap_lock_read(mapping);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+		unsigned long address = vma_address(page, vma);
+
+		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+			continue;
+
+		ret = rwc->rmap_one(page, vma, address, rwc->arg);
+		if (ret != SWAP_AGAIN)
+			goto done;
+		if (rwc->done && rwc->done(page))
+			goto done;
+	}
+
+done:
+	i_mmap_unlock_read(mapping);
+	return ret;
+}
+/* Dispatch the walk by page type: KSM, anonymous, or file-backed */
+int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
+{
+	if (unlikely(PageKsm(page)))
+		return rmap_walk_ksm(page, rwc);
+	else if (PageAnon(page))
+		return rmap_walk_anon(page, rwc);
+	else
+		return rmap_walk_file(page, rwc);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+
+	BUG_ON(!anon_vma);
+	/* An already-anonymous page keeps the mapping it has */
+	if (PageAnon(page))
+		return;
+	if (!exclusive)
+		anon_vma = anon_vma->root;	/* non-exclusive: use the root */
+	/* Store the anon_vma pointer, offset by PAGE_MAPPING_ANON, in ->mapping */
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+	page->index = linear_page_index(vma, address);
+}
+/*
+ * Increment _mapcount of a locked hugepage; when the increment brings it to
+ * zero, set up the anon mapping as non-exclusive.
+ */
+void hugepage_add_anon_rmap(struct page *page,
+			    struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	int first;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!anon_vma);
+	/* address might be in next vma when migration races vma_adjust */
+	first = atomic_inc_and_test(&page->_mapcount);
+	if (first)
+		__hugepage_set_anon_rmap(page, vma, address, 0);
+}
+/*
+ * Set _mapcount to 0 and set up an exclusive anon mapping.  @address must
+ * lie inside @vma.
+ */
+void hugepage_add_new_anon_rmap(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	atomic_set(&page->_mapcount, 0);
+	__hugepage_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/kernel/mm/shmem.c b/kernel/mm/shmem.c
new file mode 100644
index 000000000..47d536e59
--- /dev/null
+++ b/kernel/mm/shmem.c
@@ -0,0 +1,3458 @@
+/*
+ * Resizable virtual memory filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *		 2000 Transmeta Corp.
+ *		 2000-2001 Christoph Rohland
+ *		 2000-2001 SAP AG
+ *		 2002 Red Hat Inc.
+ * Copyright (C) 2002-2011 Hugh Dickins.
+ * Copyright (C) 2011 Google Inc.
+ * Copyright (C) 2002-2005 VERITAS Software Corporation.
+ * Copyright (C) 2004 Andi Kleen, SuSE Labs
+ *
+ * Extended attribute support for tmpfs:
+ * Copyright (c) 2004, Luke Kenneth Casson Leighton
+ * Copyright (c) 2004 Red Hat, Inc., James Morris
+ *
+ * tiny-shmem:
+ * Copyright (c) 2004, 2008 Matt Mackall
+ *
+ * This file is released under the GPL.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct vfsmount *shm_mnt; + +#ifdef CONFIG_SHMEM +/* + * This virtual memory filesystem is heavily based on the ramfs. It + * extends ramfs by the ability to use swap and honor resource limits + * which makes it a completely usable filesystem. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) +#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) + +/* Pretend that each entry is of this size in directory's i_size */ +#define BOGO_DIRENT_SIZE 20 + +/* Symlink up to this size is kmalloc'ed instead of using a swappable page */ +#define SHORT_SYMLINK_LEN 128 + +/* + * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * inode->i_private (with i_mutex making sure that it has only one user at + * a time): we would prefer not to enlarge the shmem inode just for that. 
+ */
+struct shmem_falloc {
+	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
+	pgoff_t start;		/* start of range currently being fallocated */
+	pgoff_t next;		/* the next page offset to be fallocated */
+	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
+	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
+};
+
+/* Flag allocation requirements to shmem_getpage */
+enum sgp_type {
+	SGP_READ,	/* don't exceed i_size, don't allocate page */
+	SGP_CACHE,	/* don't exceed i_size, may allocate page */
+	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
+	SGP_WRITE,	/* may exceed i_size, may allocate !Uptodate page */
+	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
+};
+/* Default limits, both derived from the number of RAM pages */
+#ifdef CONFIG_TMPFS
+static unsigned long shmem_default_max_blocks(void)
+{
+	return totalram_pages / 2;	/* half of all RAM pages */
+}
+
+static unsigned long shmem_default_max_inodes(void)
+{
+	/* the smaller of: non-highmem pages, half of all RAM pages */
+	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+}
+#endif
+/* Forward declarations */
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index);
+static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
+	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
+/* shmem_getpage_gfp() with the gfp mask of the inode's own mapping */
+static inline int shmem_getpage(struct inode *inode, pgoff_t index,
+	struct page **pagep, enum sgp_type sgp, int *fault_type)
+{
+	return shmem_getpage_gfp(inode, index, pagep, sgp,
+			mapping_gfp_mask(inode->i_mapping), fault_type);
+}
+/* The per-superblock shmem info is kept in sb->s_fs_info */
+static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/*
+ * shmem_file_setup pre-accounts the whole fixed size of a VM object,
+ * for shared memory and for shared anonymous (/dev/zero) mappings
+ * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
+ * consistent with the pre-accounting of private mappings ...
+ *
+ * shmem_acct_size() charges VM_ACCT(size) pages against current->mm and
+ * returns the result of that check; VM_NORESERVE objects are not charged
+ * and always get 0.
+ */
+static inline int shmem_acct_size(unsigned long flags, loff_t size)
+{
+	return (flags & VM_NORESERVE) ?
+		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
+}
+/* Release what shmem_acct_size() charged; no-op for VM_NORESERVE objects */
+static inline void shmem_unacct_size(unsigned long flags, loff_t size)
+{
+	if (!(flags & VM_NORESERVE))
+		vm_unacct_memory(VM_ACCT(size));
+}
+/*
+ * Adjust the pre-accounting when an object changes size: growth is charged
+ * (and the result of that check returned), shrinkage is released.  Returns 0
+ * when nothing had to be charged, including for VM_NORESERVE objects.
+ */
+static inline int shmem_reacct_size(unsigned long flags,
+		loff_t oldsize, loff_t newsize)
+{
+	if (!(flags & VM_NORESERVE)) {
+		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
+			return security_vm_enough_memory_mm(current->mm,
+					VM_ACCT(newsize) - VM_ACCT(oldsize));
+		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
+			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
+	}
+	return 0;
+}
+
+/*
+ * ... whereas tmpfs objects are accounted incrementally as
+ * pages are allocated, in order to allow huge sparse files.
+ * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
+ */
+static inline int shmem_acct_block(unsigned long flags)
+{
+	return (flags & VM_NORESERVE) ?
+		security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
+}
+/* Counterpart of shmem_acct_block(): release the charge for @pages pages */
+static inline void shmem_unacct_blocks(unsigned long flags, long pages)
+{
+	if (flags & VM_NORESERVE)
+		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
+}
+/* Forward declarations of the operation tables */
+static const struct super_operations shmem_ops;
+static const struct address_space_operations shmem_aops;
+static const struct file_operations shmem_file_operations;
+static const struct inode_operations shmem_inode_operations;
+static const struct inode_operations shmem_dir_inode_operations;
+static const struct inode_operations shmem_special_inode_operations;
+static const struct vm_operations_struct shmem_vm_ops;
+/* List of inodes with swapped-out pages, and the mutex guarding it */
+static LIST_HEAD(shmem_swaplist);
+static DEFINE_MUTEX(shmem_swaplist_mutex);
+/*
+ * Take one inode from the superblock's budget.  A superblock with
+ * max_inodes == 0 is not limited.  Returns 0, or -ENOSPC when
+ * free_inodes has run out.
+ */
+static int shmem_reserve_inode(struct super_block *sb)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	if (sbinfo->max_inodes) {
+		spin_lock(&sbinfo->stat_lock);
+		if (!sbinfo->free_inodes) {
+			spin_unlock(&sbinfo->stat_lock);
+			return -ENOSPC;
+		}
+		sbinfo->free_inodes--;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+	return 0;
+}
+/* Give one inode back to the superblock's budget, if it has one */
+static void shmem_free_inode(struct super_block *sb)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	if (sbinfo->max_inodes) {
+		spin_lock(&sbinfo->stat_lock);
+		sbinfo->free_inodes++;
+		spin_unlock(&sbinfo->stat_lock);
+	}
+}
+
+/**
+ * shmem_recalc_inode - recalculate the block usage of an inode
+ * @inode: inode to recalc
+ *
+ * We have to calculate the free blocks since the mm can drop
+ * undirtied hole pages behind our back.
+ *
+ * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
+ * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
+ *
+ * It has to be called with the spinlock held.
+ */
+static void shmem_recalc_inode(struct inode *inode)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	long freed;
+
+	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
+	if (freed > 0) {
+		/* Return the freed pages to every counter that tracked them */
+		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+		if (sbinfo->max_blocks)
+			percpu_counter_add(&sbinfo->used_blocks, -freed);
+		info->alloced -= freed;
+		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
+		shmem_unacct_blocks(info->flags, freed);
+	}
+}
+
+/*
+ * Replace item expected in radix tree by a new item, while holding tree lock.
+ *
+ * Returns 0 on success, or -ENOENT when there is no slot at @index or the
+ * slot holds something other than @expected.  Neither @expected nor
+ * @replacement may be NULL.
+ */
+static int shmem_radix_tree_replace(struct address_space *mapping,
+			pgoff_t index, void *expected, void *replacement)
+{
+	void **pslot;
+	void *item;
+
+	VM_BUG_ON(!expected);
+	VM_BUG_ON(!replacement);
+	pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
+	if (!pslot)
+		return -ENOENT;
+	item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
+	if (item != expected)
+		return -ENOENT;
+	radix_tree_replace_slot(pslot, replacement);
+	return 0;
+}
+
+/*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ *
+ * Returns true if the radix tree entry at @index is still the one encoding
+ * @swap.  The lookup is done under rcu_read_lock().
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+			       pgoff_t index, swp_entry_t swap)
+{
+	void *item;
+
+	rcu_read_lock();
+	item = radix_tree_lookup(&mapping->page_tree, index);
+	rcu_read_unlock();
+	return item == swp_to_radix_entry(swap);
+}
+
+/*
+ * Like add_to_page_cache_locked, but error if expected item has gone.
+ *
+ * With @expected NULL the page is inserted at @index; otherwise it replaces
+ * @expected there.  Returns 0 on success; on error page->mapping is reset
+ * to NULL, the extra page reference is dropped, and the error from the
+ * radix tree operation is returned.
+ */
+static int shmem_add_to_page_cache(struct page *page,
+				   struct address_space *mapping,
+				   pgoff_t index, void *expected)
+{
+	int error;
+
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+
+	page_cache_get(page);	/* dropped again on the error path below */
+	page->mapping = mapping;
+	page->index = index;
+
+	spin_lock_irq(&mapping->tree_lock);
+	if (!expected)
+		error = radix_tree_insert(&mapping->page_tree, index, page);
+	else
+		error = shmem_radix_tree_replace(mapping, index, expected,
+								 page);
+	if (!error) {
+		mapping->nrpages++;
+		__inc_zone_page_state(page, NR_FILE_PAGES);
+		__inc_zone_page_state(page, NR_SHMEM);
+		spin_unlock_irq(&mapping->tree_lock);
+	} else {
+		page->mapping = NULL;
+		spin_unlock_irq(&mapping->tree_lock);
+		page_cache_release(page);
+	}
+	return error;
+}
+
+/*
+ * Like delete_from_page_cache, but substitutes swap for page.
+ *
+ * The page must still occupy its own slot in the radix tree: a failed
+ * replacement ends in BUG_ON().
+ */
+static void shmem_delete_from_page_cache(struct page *page, void *radswap)
+{
+	struct address_space *mapping = page->mapping;
+	int error;
+
+	spin_lock_irq(&mapping->tree_lock);
+	error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
+	page->mapping = NULL;
+	mapping->nrpages--;
+	__dec_zone_page_state(page, NR_FILE_PAGES);
+	__dec_zone_page_state(page, NR_SHMEM);
+	spin_unlock_irq(&mapping->tree_lock);
+	page_cache_release(page);
+	BUG_ON(error);
+}
+
+/*
+ * Remove swap entry from radix tree, free the swap and its page cache.
+ *
+ * Returns 0 if @radswap was still at @index and has been freed, or -ENOENT
+ * if something else was found there (nothing is freed in that case).
+ */
+static int shmem_free_swap(struct address_space *mapping,
+			   pgoff_t index, void *radswap)
+{
+	void *old;
+
+	spin_lock_irq(&mapping->tree_lock);
+	old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
+	spin_unlock_irq(&mapping->tree_lock);
+	if (old != radswap)
+		return -ENOENT;
+	free_swap_and_cache(radix_to_swp_entry(radswap));
+	return 0;
+}
+
+/*
+ * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
+ *
+ * Walks the mapping in batches of up to PAGEVEC_SIZE entries, drops the
+ * swap entries from each batch and hands the remaining pages to
+ * check_move_unevictable_pages().
+ */
+void shmem_unlock_mapping(struct address_space *mapping)
+{
+	struct pagevec pvec;
+	pgoff_t indices[PAGEVEC_SIZE];
+	pgoff_t index = 0;
+
+	pagevec_init(&pvec, 0);
+	/*
+	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
+	 */
+	while (!mapping_unevictable(mapping)) {
+		/*
+		 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
+		 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
+		 */
+		pvec.nr = find_get_entries(mapping, index,
+					   PAGEVEC_SIZE, pvec.pages, indices);
+		if (!pvec.nr)
+			break;
+		index = indices[pvec.nr - 1] + 1;
+		pagevec_remove_exceptionals(&pvec);
+		check_move_unevictable_pages(pvec.pages, pvec.nr);
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+/*
+ * Remove range of pages and swap entries from radix tree, and free them.
+ * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
+ *
+ * @lstart and @lend are byte offsets, @lend inclusive; @lend == -1 means
+ * "to the end of the file".
+ *
+ * The work is done in two passes over the whole pages in the range.  The
+ * first pass only trylocks pages and skips those it cannot lock at once.
+ * Then the partial pages at either end of the range are zeroed.  The second
+ * pass takes the page lock, and goes back one index to retry whenever it
+ * finds that a swap entry and a page have been exchanged underneath it.
+ * Finally info->swapped is reduced by the number of swap entries freed and
+ * the inode's block usage is recalculated, under info->lock.
+ */
+static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+								 bool unfalloc)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
+	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+	struct pagevec pvec;
+	pgoff_t indices[PAGEVEC_SIZE];
+	long nr_swaps_freed = 0;
+	pgoff_t index;
+	int i;
+
+	if (lend == -1)
+		end = -1;	/* unsigned, so actually very big */
+
+	pagevec_init(&pvec, 0);
+	index = start;
+	while (index < end) {
+		pvec.nr = find_get_entries(mapping, index,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE),
+			pvec.pages, indices);
+		if (!pvec.nr)
+			break;
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			index = indices[i];
+			if (index >= end)
+				break;
+
+			if (radix_tree_exceptional_entry(page)) {
+				if (unfalloc)
+					continue;
+				nr_swaps_freed += !shmem_free_swap(mapping,
+								index, page);
+				continue;
+			}
+
+			if (!trylock_page(page))
+				continue;
+			if (!unfalloc || !PageUptodate(page)) {
+				if (page->mapping == mapping) {
+					VM_BUG_ON_PAGE(PageWriteback(page), page);
+					truncate_inode_page(mapping, page);
+				}
+			}
+			unlock_page(page);
+		}
+		pagevec_remove_exceptionals(&pvec);
+		pagevec_release(&pvec);
+		cond_resched();
+		index++;
+	}
+	/* Zero the tail of the page in which the range starts ... */
+	if (partial_start) {
+		struct page *page = NULL;
+		shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
+		if (page) {
+			unsigned int top = PAGE_CACHE_SIZE;
+			if (start > end) {	/* range lies within this one page */
+				top = partial_end;
+				partial_end = 0;
+			}
+			zero_user_segment(page, partial_start, top);
+			set_page_dirty(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+	if (partial_end) {	/* ... and the head of the page in which it ends */
+		struct page *page = NULL;
+		shmem_getpage(inode, end, &page, SGP_READ, NULL);
+		if (page) {
+			zero_user_segment(page, 0, partial_end);
+			set_page_dirty(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+	if (start >= end)
+		return;
+
+	index = start;
+	while (index < end) {
+		cond_resched();
+
+		pvec.nr = find_get_entries(mapping, index,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE),
+				pvec.pages, indices);
+		if (!pvec.nr) {
+			/* If all gone or hole-punch or unfalloc, we're done */
+			if (index == start || end != -1)
+				break;
+			/* But if truncating, restart to make sure all gone */
+			index = start;
+			continue;
+		}
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			index = indices[i];
+			if (index >= end)
+				break;
+
+			if (radix_tree_exceptional_entry(page)) {
+				if (unfalloc)
+					continue;
+				if (shmem_free_swap(mapping, index, page)) {
+					/* Swap was replaced by page: retry */
+					index--;
+					break;
+				}
+				nr_swaps_freed++;
+				continue;
+			}
+
+			lock_page(page);
+			if (!unfalloc || !PageUptodate(page)) {
+				if (page->mapping == mapping) {
+					VM_BUG_ON_PAGE(PageWriteback(page), page);
+					truncate_inode_page(mapping, page);
+				} else {
+					/* Page was replaced by swap: retry */
+					unlock_page(page);
+					index--;
+					break;
+				}
+			}
+			unlock_page(page);
+		}
+		pagevec_remove_exceptionals(&pvec);
+		pagevec_release(&pvec);
+		index++;
+	}
+
+	spin_lock(&info->lock);
+	info->swapped -= nr_swaps_freed;
+	shmem_recalc_inode(inode);
+	spin_unlock(&info->lock);
+}
+/* Truncate or punch a hole over [lstart, lend] and update ctime and mtime */
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+	shmem_undo_range(inode, lstart, lend, false);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
+/*
+ * Apply @attr to the inode.  A size change on a regular file is refused
+ * with -EPERM when the matching F_SEAL_SHRINK/F_SEAL_GROW seal is set, is
+ * re-accounted, and on shrink the pages beyond the new size are unmapped
+ * and truncated.  Returns 0 or a negative error.
+ */
+static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = d_inode(dentry);
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+		loff_t oldsize = inode->i_size;
+		loff_t newsize = attr->ia_size;
+
+		/* protected by i_mutex */
+		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
+		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
+			return -EPERM;
+
+		if (newsize != oldsize) {
+			error = shmem_reacct_size(SHMEM_I(inode)->flags,
+					oldsize, newsize);
+			if (error)
+				return error;
+			i_size_write(inode, newsize);
+			inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+		}
+		if (newsize < oldsize) {
+			loff_t holebegin = round_up(newsize, PAGE_SIZE);
+			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+			shmem_truncate_range(inode, newsize, (loff_t)-1);
+			/* unmap again to remove racily COWed private pages */
+			unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
+		}
+	}
+
+	setattr_copy(inode, attr);
+	if (attr->ia_valid & ATTR_MODE)
+		error = posix_acl_chmod(inode, inode->i_mode);
+	return error;
+}
+/*
+ * Tear down an inode.  An inode using shmem_aops is un-accounted, truncated
+ * to nothing and taken off shmem_swaplist; any other inode has its symlink
+ * buffer freed instead.  Then xattrs and the inode reservation are released.
+ */
+static void shmem_evict_inode(struct inode *inode)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+
+	if (inode->i_mapping->a_ops == &shmem_aops) {
+		shmem_unacct_size(info->flags, inode->i_size);
+		inode->i_size = 0;
+		shmem_truncate_range(inode, 0, (loff_t)-1);
+		if (!list_empty(&info->swaplist)) {
+			mutex_lock(&shmem_swaplist_mutex);
+			list_del_init(&info->swaplist);
+			mutex_unlock(&shmem_swaplist_mutex);
+		}
+	} else
+		kfree(info->symlink);
+
+	simple_xattrs_free(&info->xattrs);
+	WARN_ON(inode->i_blocks);
+	shmem_free_inode(inode->i_sb);
+	clear_inode(inode);
+}
+
+/*
+ * If swap found in inode, free it and move page from swapcache to filecache.
+ *
+ * Returns -EAGAIN when @swap is not in this inode's radix tree.  Otherwise
+ * returns 0 on success, the error from shmem_replace_page() or
+ * shmem_add_to_page_cache(), or -ENOENT if the swap entry was found to be
+ * out of use after shmem_swaplist_mutex had been dropped and retaken.
+ */
+static int shmem_unuse_inode(struct shmem_inode_info *info,
+			     swp_entry_t swap, struct page **pagep)
+{
+	struct address_space *mapping = info->vfs_inode.i_mapping;
+	void *radswap;
+	pgoff_t index;
+	gfp_t gfp;
+	int error = 0;
+
+	radswap = swp_to_radix_entry(swap);
+	index = radix_tree_locate_item(&mapping->page_tree, radswap);
+	if (index == -1)
+		return -EAGAIN;	/* tell shmem_unuse we found nothing */
+
+	/*
+	 * Move _head_ to start search for next from here.
+	 * But be careful: shmem_evict_inode checks list_empty without taking
+	 * mutex, and there's an instant in list_move_tail when info->swaplist
+	 * would appear empty, if it were the only one on shmem_swaplist.
+	 */
+	if (shmem_swaplist.next != &info->swaplist)
+		list_move_tail(&shmem_swaplist, &info->swaplist);
+
+	gfp = mapping_gfp_mask(mapping);
+	if (shmem_should_replace_page(*pagep, gfp)) {
+		mutex_unlock(&shmem_swaplist_mutex);
+		error = shmem_replace_page(pagep, gfp, info, index);
+		mutex_lock(&shmem_swaplist_mutex);
+		/*
+		 * We needed to drop mutex to make that restrictive page
+		 * allocation, but the inode might have been freed while we
+		 * dropped it: although a racing shmem_evict_inode() cannot
+		 * complete without emptying the radix_tree, our page lock
+		 * on this swapcache page is not enough to prevent that -
+		 * free_swap_and_cache() of our swap entry will only
+		 * trylock_page(), removing swap from radix_tree whatever.
+		 *
+		 * We must not proceed to shmem_add_to_page_cache() if the
+		 * inode has been freed, but of course we cannot rely on
+		 * inode or mapping or info to check that. However, we can
+		 * safely check if our swap entry is still in use (and here
+		 * it can't have got reused for another page): if it's still
+		 * in use, then the inode cannot have been freed yet, and we
+		 * can safely proceed (if it's no longer in use, that tells
+		 * nothing about the inode, but we don't need to unuse swap).
+		 */
+		if (!page_swapcount(*pagep))
+			error = -ENOENT;
+	}
+
+	/*
+	 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
+	 * but also to hold up shmem_evict_inode(): so inode cannot be freed
+	 * beneath us (pagelock doesn't help until the page is in pagecache).
+	 */
+	if (!error)
+		error = shmem_add_to_page_cache(*pagep, mapping, index,
+						radswap);
+	if (error != -ENOMEM) {
+		/*
+		 * Truncation and eviction use free_swap_and_cache(), which
+		 * only does trylock page: if we raced, best clean up here.
+		 */
+		delete_from_swap_cache(*pagep);
+		set_page_dirty(*pagep);
+		if (!error) {
+			spin_lock(&info->lock);
+			info->swapped--;
+			spin_unlock(&info->lock);
+			swap_free(swap);
+		}
+	}
+	return error;
+}
+
+/*
+ * Search through swapped inodes to find and replace swap by page.
+ *
+ * Errors from the inode search other than -ENOMEM are reported as 0.
+ */
+int shmem_unuse(swp_entry_t swap, struct page *page)
+{
+	struct list_head *this, *next;
+	struct shmem_inode_info *info;
+	struct mem_cgroup *memcg;
+	int error = 0;
+
+	/*
+	 * There's a faint possibility that swap page was replaced before
+	 * caller locked it: caller will come back later with the right page.
+	 */
+	if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
+		goto out;
+
+	/*
+	 * Charge page using GFP_KERNEL while we can wait, before taking
+	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
+	 * Charged back to the user (not to caller) when swap account is used.
+	 */
+	error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
+	if (error)
+		goto out;
+	/* No radix_tree_preload: swap entry keeps a place for page in tree */
+	error = -EAGAIN;
+	/* Try each inode on the swaplist until one gives an answer other than -EAGAIN */
+	mutex_lock(&shmem_swaplist_mutex);
+	list_for_each_safe(this, next, &shmem_swaplist) {
+		info = list_entry(this, struct shmem_inode_info, swaplist);
+		if (info->swapped)
+			error = shmem_unuse_inode(info, swap, &page);
+		else
+			list_del_init(&info->swaplist);	/* nothing swapped: prune */
+		cond_resched();
+		if (error != -EAGAIN)
+			break;
+		/* found nothing in this: move on to search the next */
+	}
+	mutex_unlock(&shmem_swaplist_mutex);
+	/* Only -ENOMEM is passed up; any failure cancels the charge taken above */
+	if (error) {
+		if (error != -ENOMEM)
+			error = 0;
+		mem_cgroup_cancel_charge(page, memcg);
+	} else
+		mem_cgroup_commit_charge(page, memcg, true);
+out:
+	unlock_page(page);
+	page_cache_release(page);
+	return error;
+}
+
+/*
+ * Move the page from the page cache to the swap cache.
+ *
+ * The page must be locked.  Returns 0, or AOP_WRITEPAGE_ACTIVATE (page
+ * still locked) when the page is redirtied instead for a reclaim caller.
+ */
+static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct shmem_inode_info *info;
+	struct address_space *mapping;
+	struct inode *inode;
+	swp_entry_t swap;
+	pgoff_t index;
+
+	BUG_ON(!PageLocked(page));
+	mapping = page->mapping;
+	index = page->index;
+	inode = mapping->host;
+	info = SHMEM_I(inode);
+	if (info->flags & VM_LOCKED)
+		goto redirty;
+	if (!total_swap_pages)
+		goto redirty;
+
+	/*
+	 * Our capabilities prevent regular writeback or sync from ever calling
+	 * shmem_writepage; but a stacking filesystem might use ->writepage of
+	 * its underlying filesystem, in which case tmpfs should write out to
+	 * swap only in response to memory pressure, and not for the writeback
+	 * threads or sync.
+	 */
+	if (!wbc->for_reclaim) {
+		WARN_ON_ONCE(1);	/* Still happens? Tell us about it! */
+		goto redirty;
+	}
+
+	/*
+	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
+	 * value into swapfile.c, the only way we can correctly account for a
+	 * fallocated page arriving here is now to initialize it and write it.
+	 *
+	 * That's okay for a page already fallocated earlier, but if we have
+	 * not yet completed the fallocation, then (a) we want to keep track
+	 * of this page in case we have to undo it, and (b) it may not be a
+	 * good idea to continue anyway, once we're pushing into swap.  So
+	 * reactivate the page, and let shmem_fallocate() quit when too many.
+	 */
+	if (!PageUptodate(page)) {
+		if (inode->i_private) {
+			struct shmem_falloc *shmem_falloc;
+			spin_lock(&inode->i_lock);
+			shmem_falloc = inode->i_private;
+			if (shmem_falloc &&
+			    !shmem_falloc->waitq &&
+			    index >= shmem_falloc->start &&
+			    index < shmem_falloc->next)
+				shmem_falloc->nr_unswapped++;
+			else
+				shmem_falloc = NULL;
+			spin_unlock(&inode->i_lock);
+			if (shmem_falloc)
+				goto redirty;	/* counted above, not swapped */
+		}
+		clear_highpage(page);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+	}
+	/* Without a swap slot the page is redirtied instead */
+	swap = get_swap_page();
+	if (!swap.val)
+		goto redirty;
+
+	/*
+	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
+	 * if it's not already there.  Do it now before the page is
+	 * moved to swap cache, when its pagelock no longer protects
+	 * the inode from eviction.  But don't unlock the mutex until
+	 * we've incremented swapped, because shmem_unuse_inode() will
+	 * prune a !swapped inode from the swaplist under this mutex.
+	 */
+	mutex_lock(&shmem_swaplist_mutex);
+	if (list_empty(&info->swaplist))
+		list_add_tail(&info->swaplist, &shmem_swaplist);
+
+	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+		swap_shmem_alloc(swap);
+		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+
+		spin_lock(&info->lock);
+		info->swapped++;
+		shmem_recalc_inode(inode);
+		spin_unlock(&info->lock);
+
+		mutex_unlock(&shmem_swaplist_mutex);
+		BUG_ON(page_mapped(page));
+		swap_writepage(page, wbc);
+		return 0;
+	}
+	/* Could not add to the swap cache: release the swap slot and redirty */
+	mutex_unlock(&shmem_swaplist_mutex);
+	swapcache_free(swap);
+redirty:
+	set_page_dirty(page);
+	if (wbc->for_reclaim)
+		return AOP_WRITEPAGE_ACTIVATE;	/* Return with page locked */
+	unlock_page(page);
+	return 0;
+}
+
+#ifdef CONFIG_NUMA
+#ifdef CONFIG_TMPFS
+static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+{
+	char buffer[64];
+
+	if (!mpol || mpol->mode == MPOL_DEFAULT)
+		return;		/* show nothing */
+
+	mpol_to_str(buffer, sizeof(buffer), mpol);
+	/* Emitted as a ",mpol=..." option string */
+	seq_printf(seq, ",mpol=%s", buffer);
+}
+/* Return sbinfo->mpol with a reference taken under stat_lock, or NULL if unset */
+static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+	struct mempolicy *mpol = NULL;
+	if (sbinfo->mpol) {
+		spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
+		mpol = sbinfo->mpol;
+		mpol_get(mpol);
+		spin_unlock(&sbinfo->stat_lock);
+	}
+	return mpol;
+}
+#endif /* CONFIG_TMPFS */
+/* Swap in via swapin_readahead(), with a pseudo vma carrying the policy for @index */
+static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+			struct shmem_inode_info *info, pgoff_t index)
+{
+	struct vm_area_struct pvma;
+	struct page *page;
+
+	/* Create a pseudo vma that just contains the policy */
+	pvma.vm_start = 0;
+	/* Bias interleave by inode number to distribute better across nodes */
+	pvma.vm_pgoff = index + info->vfs_inode.i_ino;
+	pvma.vm_ops = NULL;
+	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+
+	page = swapin_readahead(swap, gfp, &pvma, 0);
+
+	/* Drop reference taken by mpol_shared_policy_lookup() */
+	mpol_cond_put(pvma.vm_policy);
+
+	return page;
+}
+/* Allocate a page via alloc_page_vma(), with the same kind of pseudo vma */
+static struct page *shmem_alloc_page(gfp_t gfp,
+			struct shmem_inode_info *info, pgoff_t index)
+{
+	struct vm_area_struct pvma;
+	struct page *page;
+
+	/* Create a pseudo vma that just contains the policy */
+	pvma.vm_start = 0;
+	/* Bias interleave by inode number to distribute better across nodes */
+	pvma.vm_pgoff = index + info->vfs_inode.i_ino;
+	pvma.vm_ops = NULL;
+	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+
+	page = alloc_page_vma(gfp, &pvma, 0);
+
+	/* Drop reference taken by mpol_shared_policy_lookup() */
+	mpol_cond_put(pvma.vm_policy);
+
+	return page;
+}
+#else /* !CONFIG_NUMA */
+#ifdef CONFIG_TMPFS
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+{
+}
+#endif /* CONFIG_TMPFS */
+/* !CONFIG_NUMA variants: no pseudo vma is built; @info and @index are unused */
+static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+			struct shmem_inode_info *info, pgoff_t index)
+{
+	return swapin_readahead(swap, gfp, NULL, 0);
+}
+
+static inline struct page *shmem_alloc_page(gfp_t gfp,
+			struct shmem_inode_info *info, pgoff_t index)
+{
+	return alloc_page(gfp);
+}
+#endif /* CONFIG_NUMA */
+/* Stub for configurations lacking CONFIG_NUMA or CONFIG_TMPFS */
+#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
+static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+	return NULL;
+}
+#endif
+
+/*
+ * When a page is moved from swapcache to shmem filecache (either by the
+ * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * shmem_unuse_inode()), it may have been read in earlier from swap, in
+ * ignorance of the mapping it belongs to.  If that mapping has special
+ * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
+ * we may need to copy to a suitable page before moving to filecache.
+ *
+ * In a future release, this may well be extended to respect cpuset and
+ * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
+ * but for now it is a simple matter of zone.
+ */ +static bool shmem_should_replace_page(struct page *page, gfp_t gfp) +{ /* true when the page zone number is above the zone that gfp maps to */ + return page_zonenum(page) > gfp_zone(gfp); +} + /* + * Replace *pagep, a locked swapcache page, by a newly allocated copy. + * + * Returns 0 with *pagep pointing at the new page (locked and added to the + * anon LRU), -ENOMEM if no page could be allocated, or the error from + * shmem_radix_tree_replace(); in both failure cases *pagep is unchanged. + */ +static int shmem_replace_page(struct page **pagep, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct page *oldpage, *newpage; + struct address_space *swap_mapping; + pgoff_t swap_index; + int error; + + oldpage = *pagep; + swap_index = page_private(oldpage); + swap_mapping = page_mapping(oldpage); + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success by further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + newpage = shmem_alloc_page(gfp, info, index); + if (!newpage) + return -ENOMEM; + + page_cache_get(newpage); + copy_highpage(newpage, oldpage); + flush_dcache_page(newpage); + + __set_page_locked(newpage); + SetPageUptodate(newpage); + SetPageSwapBacked(newpage); + set_page_private(newpage, swap_index); + SetPageSwapCache(newpage); + + /* + * Our caller will very soon move newpage out of swapcache, but it's + * a nice clean interface for us to replace oldpage by newpage there. + */ + spin_lock_irq(&swap_mapping->tree_lock); + error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, + newpage); + if (!error) { + __inc_zone_page_state(newpage, NR_FILE_PAGES); + __dec_zone_page_state(oldpage, NR_FILE_PAGES); + } + spin_unlock_irq(&swap_mapping->tree_lock); + + if (unlikely(error)) { + /* + * Is this possible? I think not, now that our callers check + * both PageSwapCache and page_private after getting page lock; + * but be defensive. Reverse old to newpage for clear and free.
+ */ + oldpage = newpage; + } else { + mem_cgroup_migrate(oldpage, newpage, true); + lru_cache_add_anon(newpage); + *pagep = newpage; + } + + ClearPageSwapCache(oldpage); + set_page_private(oldpage, 0); + + unlock_page(oldpage); + page_cache_release(oldpage); + page_cache_release(oldpage); /* second put; NOTE(review): on success presumably the swapcache and caller references - confirm */ + return error; +} + +/* + * shmem_getpage_gfp - find page in cache, or get from swap, or allocate + * + * If we allocate a new one we do not mark it dirty. That's up to the + * vm. If we swap it in we mark it dirty since we also free the swap + * entry since a page cannot live in both the swap and page cache + * + * @pagep receives the page, locked and with a reference held; for SGP_READ + * it is set to NULL when @index has neither a page nor a swap entry, or + * holds only a fallocated page that is not yet uptodate. + * @fault_type, if non-NULL, gets VM_FAULT_MAJOR or-ed in when the page was + * not found in the swap cache and had to be read in. + * + * Returns 0 on success, -EFBIG for an @index beyond MAX_LFS_FILESIZE, + * -EINVAL for an @index at or beyond i_size (unless SGP_WRITE or + * SGP_FALLOC), -ENOSPC, -ENOMEM, -EIO, or another error passed up from a + * helper. -EEXIST is never returned: it restarts the lookup. + */ +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info; + struct shmem_sb_info *sbinfo; + struct mem_cgroup *memcg; + struct page *page; + swp_entry_t swap; + int error; + int once = 0; + int alloced = 0; + + if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) + return -EFBIG; +repeat: + swap.val = 0; + page = find_lock_entry(mapping, index); + if (radix_tree_exceptional_entry(page)) { + swap = radix_to_swp_entry(page); + page = NULL; + } + + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + error = -EINVAL; + goto failed; + } + + if (page && sgp == SGP_WRITE) + mark_page_accessed(page); + + /* fallocated page? */ + if (page && !PageUptodate(page)) { + if (sgp != SGP_READ) + goto clear; + unlock_page(page); + page_cache_release(page); + page = NULL; + } + if (page || (sgp == SGP_READ && !swap.val)) { + *pagep = page; + return 0; + } + + /* + * Fast cache lookup did not find it: + * bring it back from swap or allocate. + */ + info = SHMEM_I(inode); + sbinfo = SHMEM_SB(inode->i_sb); + + if (swap.val) { + /* Look it up and read it in..
*/ + page = lookup_swap_cache(swap); + if (!page) { + /* here we actually do the io */ + if (fault_type) + *fault_type |= VM_FAULT_MAJOR; + page = shmem_swapin(swap, gfp, info, index); + if (!page) { + error = -ENOMEM; + goto failed; + } + } + + /* We have to do this with page locked to prevent races */ + lock_page(page); + if (!PageSwapCache(page) || page_private(page) != swap.val || + !shmem_confirm_swap(mapping, index, swap)) { + error = -EEXIST; /* try again */ + goto unlock; + } + if (!PageUptodate(page)) { + error = -EIO; + goto failed; + } + wait_on_page_writeback(page); + + if (shmem_should_replace_page(page, gfp)) { + error = shmem_replace_page(&page, gfp, info, index); + if (error) + goto failed; + } + + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); + if (!error) { + error = shmem_add_to_page_cache(page, mapping, index, + swp_to_radix_entry(swap)); + /* + * We already confirmed swap under page lock, and make + * no memory allocation here, so usually no possibility + * of error; but free_swap_and_cache() only trylocks a + * page, so it is just possible that the entry has been + * truncated or holepunched since swap was confirmed. + * shmem_undo_range() will have done some of the + * unaccounting, now delete_from_swap_cache() will do + * the rest. + * Reset swap.val? No, leave it so "failed" goes back to + * "repeat": reading a hole and writing should succeed.
+ */ + if (error) { + mem_cgroup_cancel_charge(page, memcg); + delete_from_swap_cache(page); + } + } + if (error) + goto failed; + + mem_cgroup_commit_charge(page, memcg, true); + + spin_lock(&info->lock); + info->swapped--; + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + + if (sgp == SGP_WRITE) + mark_page_accessed(page); + + delete_from_swap_cache(page); + set_page_dirty(page); + swap_free(swap); + + } else { /* no swap entry: account for, allocate and insert a new page */ + if (shmem_acct_block(info->flags)) { + error = -ENOSPC; + goto failed; + } + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks) >= 0) { + error = -ENOSPC; + goto unacct; + } + percpu_counter_inc(&sbinfo->used_blocks); + } + + page = shmem_alloc_page(gfp, info, index); + if (!page) { + error = -ENOMEM; + goto decused; + } + + __SetPageSwapBacked(page); + __set_page_locked(page); + if (sgp == SGP_WRITE) + __SetPageReferenced(page); + + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); + if (error) + goto decused; + error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); + if (!error) { + error = shmem_add_to_page_cache(page, mapping, index, + NULL); + radix_tree_preload_end(); + } + if (error) { + mem_cgroup_cancel_charge(page, memcg); + goto decused; + } + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_anon(page); + + spin_lock(&info->lock); + info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + alloced = true; + + /* + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; +clear: + /* + * Reached by fall-through for a new page, and by goto for a page + * found !PageUptodate near the top of the function. + * + * Let SGP_WRITE caller clear ends if write does not fill page; + * but SGP_FALLOC on a page fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee.
+ */ + if (sgp != SGP_WRITE) { + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + if (sgp == SGP_DIRTY) + set_page_dirty(page); + } + + /* Perhaps the file has been truncated since we checked */ + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + error = -EINVAL; + if (alloced) + goto trunc; + else + goto failed; + } + *pagep = page; + return 0; + + /* + * Error recovery. + * The labels fall through in order (trunc, decused, unacct, failed, + * unlock), so jumping to an earlier label also runs every later one. + * -ENOSPC is retried once after shmem_recalc_inode(); -EEXIST always + * restarts the lookup. + */ +trunc: + info = SHMEM_I(inode); + ClearPageDirty(page); + delete_from_page_cache(page); + spin_lock(&info->lock); + info->alloced--; + inode->i_blocks -= BLOCKS_PER_PAGE; + spin_unlock(&info->lock); +decused: + sbinfo = SHMEM_SB(inode->i_sb); + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -1); +unacct: + shmem_unacct_blocks(info->flags, 1); +failed: + if (swap.val && error != -EINVAL && + !shmem_confirm_swap(mapping, index, swap)) + error = -EEXIST; +unlock: + if (page) { + unlock_page(page); + page_cache_release(page); + } + if (error == -ENOSPC && !once++) { + info = SHMEM_I(inode); + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + goto repeat; + } + if (error == -EEXIST) /* from above or from radix_tree_insert */ + goto repeat; + return error; +} + /* + * Fault handler: look up or create the page at vmf->pgoff with + * shmem_getpage(SGP_CACHE) and hand it back in vmf->page. + * + * Returns VM_FAULT_LOCKED (plus VM_FAULT_MAJOR when shmem_getpage() sets + * it in ret), VM_FAULT_OOM for -ENOMEM, VM_FAULT_SIGBUS for any other + * error, or VM_FAULT_RETRY / VM_FAULT_NOPAGE after sleeping on a hole + * punch that covers vmf->pgoff. + */ +static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int error; + int ret = VM_FAULT_LOCKED; + + /* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_mutex. So refrain from + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added.
+ * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_mutex in fault, + * and bloating every shmem inode for this unlikely case would be sad. + */ + if (unlikely(inode->i_private)) { + struct shmem_falloc *shmem_falloc; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT(shmem_fault_wait); + + ret = VM_FAULT_NOPAGE; + if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && + !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* It's polite to up mmap_sem if we can */ + up_read(&vma->vm_mm->mmap_sem); + ret = VM_FAULT_RETRY; + } + + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + spin_unlock(&inode->i_lock); + return ret; + } + spin_unlock(&inode->i_lock); + } + + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); + if (error) + return ((error == -ENOMEM) ?
VM_FAULT_OOM : VM_FAULT_SIGBUS); + + if (ret & VM_FAULT_MAJOR) { + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + } + return ret; +} + /* NUMA mempolicy hooks: both act on the shared policy kept in the shmem inode */ +#ifdef CONFIG_NUMA +static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) +{ + struct inode *inode = file_inode(vma->vm_file); + return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); +} + /* Look up the shared policy for the page index that @addr maps to in @vma */ +static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode = file_inode(vma->vm_file); + pgoff_t index; + + index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); +} +#endif + /* + * Set (@lock != 0) or clear (@lock == 0) VM_LOCKED in the shmem inode + * flags and make the mapping unevictable or evictable to match. + * Locking calls user_shm_lock(inode->i_size, @user) and fails with + * -ENOMEM when that returns 0; unlocking is done only when @user is + * non-NULL. Nothing changes if the flag already has the requested state. + * Returns 0 otherwise. + */ +int shmem_lock(struct file *file, int lock, struct user_struct *user) +{ + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); + int retval = -ENOMEM; + + spin_lock(&info->lock); + if (lock && !(info->flags & VM_LOCKED)) { + if (!user_shm_lock(inode->i_size, user)) + goto out_nomem; + info->flags |= VM_LOCKED; + mapping_set_unevictable(file->f_mapping); + } + if (!lock && (info->flags & VM_LOCKED) && user) { + user_shm_unlock(inode->i_size, user); + info->flags &= ~VM_LOCKED; + mapping_clear_unevictable(file->f_mapping); + } + retval = 0; + +out_nomem: + spin_unlock(&info->lock); + return retval; +} + /* mmap: call file_accessed() and install shmem_vm_ops on the vma; always returns 0 */ +static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &shmem_vm_ops; + return 0; +} + /* + * Allocate and initialise a new inode on @sb, with operations chosen by + * the file type in @mode. Returns NULL if shmem_reserve_inode() or + * new_inode() fails; in the latter case shmem_free_inode() is called. + */ +static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + struct inode *inode; + struct shmem_inode_info *info; + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + if (shmem_reserve_inode(sb)) + return NULL; + + inode = new_inode(sb); + if (inode) { + inode->i_ino = get_next_ino(); + inode_init_owner(inode, dir, mode); + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +
inode->i_generation = get_seconds(); + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); /* zero the bytes of *info that precede the VFS inode */ + spin_lock_init(&info->lock); + info->seals = F_SEAL_SEAL; /* with this bit set shmem_add_seals() returns -EPERM */ + info->flags = flags & VM_NORESERVE; + INIT_LIST_HEAD(&info->swaplist); + simple_xattrs_init(&info->xattrs); + cache_no_acl(inode); + + switch (mode & S_IFMT) { + default: + inode->i_op = &shmem_special_inode_operations; + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_mapping->a_ops = &shmem_aops; + inode->i_op = &shmem_inode_operations; + inode->i_fop = &shmem_file_operations; + mpol_shared_policy_init(&info->policy, + shmem_get_sbmpol(sbinfo)); + break; + case S_IFDIR: + inc_nlink(inode); + /* Some things misbehave if size == 0 on a directory */ + inode->i_size = 2 * BOGO_DIRENT_SIZE; + inode->i_op = &shmem_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + break; + case S_IFLNK: + /* + * Must not load anything in the rbtree, + * mpol_free_shared_policy will not be called. + */ + mpol_shared_policy_init(&info->policy, NULL); + break; + } + } else + shmem_free_inode(sb); /* NOTE(review): presumably undoes shmem_reserve_inode() - confirm */ + return inode; +} + /* True if @mapping has a host inode whose superblock operations are shmem_ops */ +bool shmem_mapping(struct address_space *mapping) +{ + if (!mapping->host) + return false; + + return mapping->host->i_sb->s_op == &shmem_ops; +} + +#ifdef CONFIG_TMPFS +static const struct inode_operations shmem_symlink_inode_operations; +static const struct inode_operations shmem_short_symlink_operations; + +#ifdef CONFIG_TMPFS_XATTR +static int shmem_initxattrs(struct inode *, const struct xattr *, void *); +#else +#define shmem_initxattrs NULL +#endif + /* + * write_begin: refuse the write with -EPERM when F_SEAL_WRITE is set, or + * when F_SEAL_GROW is set and pos + len would pass i_size; otherwise + * return the result of shmem_getpage(SGP_WRITE) for the page holding @pos. + */ +static int +shmem_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct shmem_inode_info *info = SHMEM_I(inode); + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + /* i_mutex is held by caller */ + if (unlikely(info->seals)) { + if (info->seals & F_SEAL_WRITE) + return -EPERM;
+ if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) + return -EPERM; + } + + return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); +} + /* + * write_end: grow i_size if the copy went past it; for a page that is not + * yet uptodate, zero the bytes outside [from, from + copied) when less + * than a whole page was copied, then mark it uptodate. Finally dirty, + * unlock and release the page. Returns @copied. + */ +static int +shmem_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + + if (pos + copied > inode->i_size) + i_size_write(inode, pos + copied); + + if (!PageUptodate(page)) { + if (copied < PAGE_CACHE_SIZE) { + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + zero_user_segments(page, 0, from, + from + copied, PAGE_CACHE_SIZE); + } + SetPageUptodate(page); + } + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + + return copied; +} + /* + * read_iter: copy file data page by page into @to, starting at + * iocb->ki_pos and stopping at i_size, then store the new position back. + * A page slot with nothing in it is read as zeroes. Returns the number of + * bytes copied or, if none were copied, the pending error (0 at end of + * file). + */ +static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + unsigned long offset; + enum sgp_type sgp = SGP_READ; + int error = 0; + ssize_t retval = 0; + loff_t *ppos = &iocb->ki_pos; + + /* + * Might this read be for a stacking filesystem? Then when reading + * holes of a sparse file, we actually need to allocate those pages, + * and even mark them dirty, so it cannot exceed the max_blocks limit.
+ */ + if (!iter_is_iovec(to)) + sgp = SGP_DIRTY; + + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + + for (;;) { + struct page *page = NULL; + pgoff_t end_index; + unsigned long nr, ret; + loff_t i_size = i_size_read(inode); + + end_index = i_size >> PAGE_CACHE_SHIFT; + if (index > end_index) + break; + if (index == end_index) { + nr = i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) + break; + } + + error = shmem_getpage(inode, index, &page, sgp, NULL); + if (error) { /* -EINVAL is not reported to the reader: the loop simply ends */ + if (error == -EINVAL) + error = 0; + break; + } + if (page) + unlock_page(page); + + /* + * We must evaluate after, since reads (unlike writes) + * are called without i_mutex protection against truncate + */ + nr = PAGE_CACHE_SIZE; + i_size = i_size_read(inode); + end_index = i_size >> PAGE_CACHE_SHIFT; + if (index == end_index) { + nr = i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) { + if (page) + page_cache_release(page); + break; + } + } + nr -= offset; + + if (page) { + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + /* + * Mark the page accessed if we read the beginning. + */ + if (!offset) + mark_page_accessed(page); + } else { /* no page came back: supply zeroes from ZERO_PAGE */ + page = ZERO_PAGE(0); + page_cache_get(page); + } + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + */ + ret = copy_page_to_iter(page, offset, nr, to); + retval += ret; + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + + page_cache_release(page); + if (!iov_iter_count(to)) + break; + if (ret < nr) { + error = -EFAULT; + break; + } + cond_resched(); + } + + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + file_accessed(file); + return retval ?
retval : error; +} + /* + * splice_read: collect the pages backing [*ppos, *ppos + len), capped at + * i_size and at spd.nr_pages_max pages, and pass them to splice_to_pipe(). + * Returns 0 when *ppos is at or past i_size, -ENOMEM if splice_grow_spd() + * fails, otherwise the splice_to_pipe() result, or the last + * shmem_getpage() error if no page could be queued. *ppos advances by a + * positive result. + */ +static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct address_space *mapping = in->f_mapping; + struct inode *inode = mapping->host; + unsigned int loff, nr_pages, req_pages; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *page; + pgoff_t index, end_index; + loff_t isize, left; + int error, page_nr; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + isize = i_size_read(inode); + if (unlikely(*ppos >= isize)) + return 0; + + left = isize - *ppos; + if (unlikely(left < len)) + len = left; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + index = *ppos >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nr_pages = min(req_pages, spd.nr_pages_max); + /* Step 1: take the pages already contiguous in the page cache */ + spd.nr_pages = find_get_pages_contig(mapping, index, + nr_pages, spd.pages); + index += spd.nr_pages; + error = 0; + /* Step 2: obtain the remaining pages one by one through shmem_getpage() */ + while (spd.nr_pages < nr_pages) { + error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); + if (error) + break; + unlock_page(page); + spd.pages[spd.nr_pages++] = page; + index++; + } + /* Step 3: recheck every page and fill in its offset and length */ + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = spd.pages[page_nr]; + + if (!PageUptodate(page) || page->mapping != mapping) { + error = shmem_getpage(inode, index, &page, + SGP_CACHE, NULL); + if (error) + break; + unlock_page(page); + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; + } + + isize = i_size_read(inode); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if
(unlikely(!isize || index > end_index)) + break; + + if (end_index == index) { + unsigned int plen; + + plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (plen <= loff) + break; + + this_len = min(this_len, plen - loff); + len = this_len; /* last page: len drops to 0 below, which ends the loop */ + } + + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; + len -= this_len; + loff = 0; + spd.nr_pages++; + index++; + } + /* Drop the references on pages gathered but not queued for the pipe */ + while (page_nr < nr_pages) + page_cache_release(spd.pages[page_nr++]); + + if (spd.nr_pages) + error = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(&spd); + + if (error > 0) { + *ppos += error; + file_accessed(in); + } + return error; +} + +/* + * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. + * + * Starting at page @index, returns the index of the first data page + * (SEEK_DATA) or hole (SEEK_HOLE); the result may be at or beyond @end + * when nothing is found below it. Pages that are not uptodate count as + * holes, exceptional radix tree entries as data. + */ +static pgoff_t shmem_seek_hole_data(struct address_space *mapping, + pgoff_t index, pgoff_t end, int whence) +{ + struct page *page; + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + bool done = false; + int i; + + pagevec_init(&pvec, 0); + pvec.nr = 1; /* start small: we may be there already */ + while (!done) { + pvec.nr = find_get_entries(mapping, index, + pvec.nr, pvec.pages, indices); + if (!pvec.nr) { + if (whence == SEEK_DATA) + index = end; + break; + } + for (i = 0; i < pvec.nr; i++, index++) { + if (index < indices[i]) { + if (whence == SEEK_HOLE) { + done = true; + break; + } + index = indices[i]; + } + page = pvec.pages[i]; + if (page && !radix_tree_exceptional_entry(page)) { + if (!PageUptodate(page)) + page = NULL; + } + if (index >= end || + (page && whence == SEEK_DATA) || + (!page && whence == SEEK_HOLE)) { + done = true; + break; + } + } + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + pvec.nr = PAGEVEC_SIZE; + cond_resched(); + } + return index; +} + /* + * llseek: SEEK_DATA and SEEK_HOLE are resolved here under i_mutex, every + * other @whence goes to generic_file_llseek_size(). Returns the new + * position, -EINVAL for a negative @offset, or -ENXIO when @offset is at + * or past i_size or SEEK_DATA finds no data before i_size. + */ +static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + pgoff_t start, end; + loff_t new_offset; + + if (whence != SEEK_DATA && whence != SEEK_HOLE) + return
generic_file_llseek_size(file, offset, whence, + MAX_LFS_FILESIZE, i_size_read(inode)); + mutex_lock(&inode->i_mutex); + /* We're holding i_mutex so we can access i_size directly */ + + if (offset < 0) + offset = -EINVAL; + else if (offset >= inode->i_size) + offset = -ENXIO; + else { + start = offset >> PAGE_CACHE_SHIFT; + end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + new_offset = shmem_seek_hole_data(mapping, start, end, whence); + new_offset <<= PAGE_CACHE_SHIFT; + if (new_offset > offset) { /* new_offset is page aligned: a hit in the starting page leaves offset as it is */ + if (new_offset < inode->i_size) + offset = new_offset; + else if (whence == SEEK_DATA) + offset = -ENXIO; + else + offset = inode->i_size; + } + } + + if (offset >= 0) + offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); + mutex_unlock(&inode->i_mutex); + return offset; +} + +/* + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * so reuse a tag which we firmly believe is never set or cleared on shmem. + */ +#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define LAST_SCAN 4 /* about 150ms max */ + /* + * Walk every slot of the mapping radix tree under rcu_read_lock() and tag + * with SHMEM_TAG_PINNED each page whose page_count() exceeds its + * page_mapcount() by more than one. + */ +static void shmem_tag_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void **slot; + pgoff_t start; + struct page *page; + + lru_add_drain(); + start = 0; + rcu_read_lock(); + +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + page = radix_tree_deref_slot(slot); + if (!page || radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + } else if (page_count(page) - page_mapcount(page) > 1) { + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_set(&mapping->page_tree, iter.index, + SHMEM_TAG_PINNED); + spin_unlock_irq(&mapping->tree_lock); + } + + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + rcu_read_unlock(); +} + +/* + * Setting SEAL_WRITE requires us to verify there's no pending writer.
However, + * via get_user_pages(), drivers might have some pending I/O without any active + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * and see whether it has an elevated ref-count. If so, we tag them and wait for + * them to be dropped. + * The caller must guarantee that no new user will acquire writable references + * to those pages to avoid races. + */ +static int shmem_wait_for_pins(struct address_space *mapping) +{ /* Returns 0, or -EBUSY if a tagged page still has extra references on the last scan */ + struct radix_tree_iter iter; + void **slot; + pgoff_t start; + struct page *page; + int error, scan; + + shmem_tag_pins(mapping); + + error = 0; + for (scan = 0; scan <= LAST_SCAN; scan++) { + if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) + break; + /* + * Scan 0 calls lru_add_drain_all(). Later scans first sleep for + * (HZ << scan) / 200 jiffies, i.e. 10, 20, 40 and 80 ms for scans + * 1 to 4. A nonzero return from schedule_timeout_killable() + * turns the current pass into the last one. + */ + if (!scan) + lru_add_drain_all(); + else if (schedule_timeout_killable((HZ << scan) / 200)) + scan = LAST_SCAN; + + start = 0; + rcu_read_lock(); +restart: + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, + start, SHMEM_TAG_PINNED) { + + page = radix_tree_deref_slot(slot); + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + + page = NULL; + } + + if (page && + page_count(page) - page_mapcount(page) != 1) { + if (scan < LAST_SCAN) + goto continue_resched; + + /* + * On the last scan, we clean up all those tags + * we inserted; but make a note that we still + * found pages pinned.
+ */ + error = -EBUSY; + } + /* Entry is no longer pinned, or this is the last scan: remove the tag */ + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(&mapping->page_tree, + iter.index, SHMEM_TAG_PINNED); + spin_unlock_irq(&mapping->tree_lock); +continue_resched: + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + rcu_read_unlock(); + } + + return error; +} + /* Every seal bit that shmem_add_seals() accepts */ +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + /* + * Add @seals to the seals already set on @file. + * + * Returns 0 on success, -EINVAL if @file is not a shmem file or @seals has + * bits outside F_ALL_SEALS, -EPERM if @file is not open for writing or + * F_SEAL_SEAL is already set, or the error from mapping_deny_writable() or + * shmem_wait_for_pins() when F_SEAL_WRITE is newly requested. + */ +int shmem_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a shmem-file but restrict + * access to a specific subset of file operations. Seals can only be + * added, but never removed. This way, mutually untrusted parties can + * share common memory regions with a well-defined policy. A malicious + * peer can thus never perform unwanted operations on a shared object. + * + * Seals are only supported on special shmem-files and always affect + * the whole underlying inode. Once a seal is set, it may prevent some + * kinds of access to the file. Currently, the following seals are + * defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous shmem files support sealing. More importantly, seals are + * never written to disk.
Therefore, there's no plan to support it on + * other file types. + */ + + if (file->f_op != &shmem_file_operations) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + mutex_lock(&inode->i_mutex); + + if (info->seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + /* + * F_SEAL_WRITE requested and not yet set: call + * mapping_deny_writable(), then wait for pinned pages; if pins + * remain, mapping_allow_writable() reverts the first step. + */ + if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = shmem_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + info->seals |= seals; + error = 0; + +unlock: + mutex_unlock(&inode->i_mutex); + return error; +} +EXPORT_SYMBOL_GPL(shmem_add_seals); + /* Return the seal mask of @file, or -EINVAL if it is not a shmem file */ +int shmem_get_seals(struct file *file) +{ + if (file->f_op != &shmem_file_operations) + return -EINVAL; + + return SHMEM_I(file_inode(file))->seals; +} +EXPORT_SYMBOL_GPL(shmem_get_seals); + /* + * fcntl entry point for F_ADD_SEALS and F_GET_SEALS. Any other @cmd, or an + * F_ADD_SEALS @arg above UINT_MAX, yields -EINVAL. + */ +long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = shmem_add_seals(file, arg); + break; + case F_GET_SEALS: + error = shmem_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + /* + * fallocate: punch a hole or preallocate pages, under i_mutex. + * + * @mode may contain only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE, + * anything else gives -EOPNOTSUPP. + * + * Hole punch: -EPERM if F_SEAL_WRITE is set, otherwise unmap and truncate + * the range and return 0. + * + * Preallocation: returns the inode_newsize_ok() error, -EPERM if + * F_SEAL_GROW is set and the range ends past i_size, -ENOSPC if the page + * span exceeds max_blocks, -EINTR on a pending signal, -ENOMEM once + * nr_unswapped exceeds nr_falloced, or the shmem_getpage() error; on any + * failure inside the loop shmem_undo_range() is run over the pages + * visited so far. On success i_size is raised unless FALLOC_FL_KEEP_SIZE + * is set, and i_ctime is updated. + */ +static long shmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_falloc shmem_falloc; + pgoff_t start, index, end; + int error; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + + if (mode & FALLOC_FL_PUNCH_HOLE) { + struct address_space *mapping = file->f_mapping; + loff_t unmap_start = round_up(offset, PAGE_SIZE); + loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; +
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + + /* protected by i_mutex */ + if (info->seals & F_SEAL_WRITE) { + error = -EPERM; + goto out; + } + /* + * Publish the page range being punched through inode->i_private + * (under i_lock): shmem_fault() sleeps on waitq for faults that + * land inside [start, next). + */ + shmem_falloc.waitq = &shmem_falloc_waitq; + shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + shmem_truncate_range(inode, offset, offset + len - 1); + /* No need to unmap again: hole-punching leaves COWed pages */ + + spin_lock(&inode->i_lock); + inode->i_private = NULL; + wake_up_all(&shmem_falloc_waitq); + spin_unlock(&inode->i_lock); + error = 0; + goto out; + } + + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { + error = -EPERM; + goto out; + } + + start = offset >> PAGE_CACHE_SHIFT; + end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* Try to avoid a swapstorm if len is impossible to satisfy */ + if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { + error = -ENOSPC; + goto out; + } + /* + * A NULL waitq marks this as a preallocation rather than a hole + * punch; [start, next) is the range filled so far, and the two + * counters feed the -ENOMEM check in the loop below. + */ + shmem_falloc.waitq = NULL; + shmem_falloc.start = start; + shmem_falloc.next = start; + shmem_falloc.nr_falloced = 0; + shmem_falloc.nr_unswapped = 0; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + + for (index = start; index < end; index++) { + struct page *page; + + /* + * Good, the fallocate(2) manpage permits EINTR: we may have + * been interrupted because we are using up too much memory.
+ */ + if (signal_pending(current)) + error = -EINTR; + else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) + error = -ENOMEM; + else + error = shmem_getpage(inode, index, &page, SGP_FALLOC, + NULL); + if (error) { + /* Remove the !PageUptodate pages we added */ + shmem_undo_range(inode, + (loff_t)start << PAGE_CACHE_SHIFT, + (loff_t)index << PAGE_CACHE_SHIFT, true); + goto undone; + } + + /* + * Inform shmem_writepage() how far we have reached. + * No need for lock or barrier: we have the page lock. + */ + shmem_falloc.next++; + if (!PageUptodate(page)) + shmem_falloc.nr_falloced++; + + /* + * If !PageUptodate, leave it that way so that freeable pages + * can be recognized if we need to rollback on error later. + * But set_page_dirty so that memory pressure will swap rather + * than free the pages we are allocating (and SGP_CACHE pages + * might still be clean: we now need to mark those dirty too). + */ + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + cond_resched(); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); + inode->i_ctime = CURRENT_TIME; +undone: + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); +out: + mutex_unlock(&inode->i_mutex); + return error; +} + /* + * statfs: report TMPFS_MAGIC, a block size of PAGE_CACHE_SIZE and NAME_MAX. + * Block totals are filled in only when max_blocks is set, and inode + * totals only when max_inodes is set. Always returns 0. + */ +static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); + + buf->f_type = TMPFS_MAGIC; + buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_namelen = NAME_MAX; + if (sbinfo->max_blocks) { + buf->f_blocks = sbinfo->max_blocks; + buf->f_bavail = + buf->f_bfree = sbinfo->max_blocks - + percpu_counter_sum(&sbinfo->used_blocks); + } + if (sbinfo->max_inodes) { + buf->f_files = sbinfo->max_inodes; + buf->f_ffree = sbinfo->free_inodes; + } + /* else leave those fields 0 like simple_statfs */ + return 0; +} + +/* + * File creation. Allocate an inode, and we're done..
+ */
+static int
+shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+	struct inode *inode;
+	int error = -ENOSPC;	/* returned when shmem_get_inode() fails */
+
+	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
+	if (inode) {
+		error = simple_acl_create(dir, inode);
+		if (error)
+			goto out_iput;
+		error = security_inode_init_security(inode, dir,
+						     &dentry->d_name,
+						     shmem_initxattrs, NULL);
+		if (error && error != -EOPNOTSUPP)
+			goto out_iput;
+
+		error = 0;
+		dir->i_size += BOGO_DIRENT_SIZE;
+		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+		d_instantiate(dentry, inode);
+		dget(dentry); /* Extra count - pin the dentry in core */
+	}
+	return error;
+out_iput:
+	iput(inode);
+	return error;
+}
+/*
+ * Create an inode for an O_TMPFILE-style open and attach it to @dentry with
+ * d_tmpfile().  Returns -ENOSPC when no inode can be obtained; a security
+ * init error other than -EOPNOTSUPP, or an ACL creation error, drops the
+ * inode and is returned.  Unlike shmem_mknod(), the directory's size and
+ * timestamps are not touched.
+ */
+static int
+shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+	int error = -ENOSPC;
+
+	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
+	if (inode) {
+		error = security_inode_init_security(inode, dir,
+						     NULL,
+						     shmem_initxattrs, NULL);
+		if (error && error != -EOPNOTSUPP)
+			goto out_iput;
+		error = simple_acl_create(dir, inode);
+		if (error)
+			goto out_iput;
+		d_tmpfile(dentry, inode);
+	}
+	return error;
+out_iput:
+	iput(inode);
+	return error;
+}
+/*
+ * Create a directory through shmem_mknod() and, on success, raise the
+ * parent directory's link count.
+ */
+static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int error;
+
+	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
+		return error;
+	inc_nlink(dir);
+	return 0;
+}
+/*
+ * Create a regular file through shmem_mknod(); @excl is not consulted here.
+ */
+static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		bool excl)
+{
+	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+/*
+ * Link a file..
+ *
+ * Fails with the error from shmem_reserve_inode() before anything has been
+ * modified; otherwise always returns 0.
+ */
+static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(old_dentry);
+	int ret;
+
+	/*
+	 * No ordinary (disk based) filesystem counts links as inodes;
+	 * but each new link needs a new dentry, pinning lowmem, and
+	 * tmpfs dentries cannot be pruned until they are unlinked.
+	 */
+	ret = shmem_reserve_inode(inode->i_sb);
+	if (ret)
+		goto out;
+
+	dir->i_size += BOGO_DIRENT_SIZE;
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	inc_nlink(inode);
+	ihold(inode);	/* New dentry reference */
+	dget(dentry);		/* Extra pinning count for the created dentry */
+	d_instantiate(dentry, inode);
+out:
+	return ret;
+}
+/*
+ * Remove a directory entry.  For a non-directory that still has other links
+ * the reservation taken in shmem_link() is handed back via
+ * shmem_free_inode().  Always returns 0.
+ */
+static int shmem_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
+		shmem_free_inode(inode->i_sb);
+
+	dir->i_size -= BOGO_DIRENT_SIZE;
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	drop_nlink(inode);
+	dput(dentry);	/* Undo the count from "create" - this does all the work */
+	return 0;
+}
+/*
+ * Remove an empty directory: -ENOTEMPTY unless simple_empty() agrees, then
+ * one link is dropped from the directory itself and one from its parent
+ * before shmem_unlink() finishes the job.
+ */
+static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	if (!simple_empty(dentry))
+		return -ENOTEMPTY;
+
+	drop_nlink(d_inode(dentry));
+	drop_nlink(dir);
+	return shmem_unlink(dir, dentry);
+}
+/*
+ * RENAME_EXCHANGE bookkeeping.  Parent link counts only change when the two
+ * entries live in different directories and exactly one of them is a
+ * directory; timestamps of both parents and both inodes are refreshed.
+ * Always returns 0.
+ */
+static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
+{
+	bool old_is_dir = d_is_dir(old_dentry);
+	bool new_is_dir = d_is_dir(new_dentry);
+
+	if (old_dir != new_dir && old_is_dir != new_is_dir) {
+		if (old_is_dir) {
+			drop_nlink(old_dir);
+			inc_nlink(new_dir);
+		} else {
+			drop_nlink(new_dir);
+			inc_nlink(old_dir);
+		}
+	}
+	old_dir->i_ctime = old_dir->i_mtime =
+	new_dir->i_ctime = new_dir->i_mtime =
+	d_inode(old_dentry)->i_ctime =
+	d_inode(new_dentry)->i_ctime = CURRENT_TIME;
+
+	return 0;
+}
+/*
+ * RENAME_WHITEOUT helper: allocate a second dentry with the old name under
+ * the old parent and create a whiteout character device (WHITEOUT_MODE /
+ * WHITEOUT_DEV) on it.  Returns -ENOMEM if the dentry cannot be allocated,
+ * or the error from shmem_mknod().
+ */
+static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
+{
+	struct dentry *whiteout;
+	int error;
+
+	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
+	if (!whiteout)
+		return -ENOMEM;
+
+	error = shmem_mknod(old_dir, whiteout,
+			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+	dput(whiteout);
+	if (error)
+		return error;
+
+	/*
+	 * Cheat and hash the whiteout while the old dentry is still in
+	 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
+	 *
+	 * d_lookup() will consistently find one of them at this point,
+	 * not sure which one, but that isn't even important.
+	 */
+	d_rehash(whiteout);
+	return 0;
+}
+
+/*
+ * The VFS layer already does all the dentry stuff for rename,
+ * we just have to decrement the usage count for the target if
+ * it exists so that the VFS layer correctly frees it when it
+ * gets overwritten.
+ *
+ * Flags other than RENAME_NOREPLACE, RENAME_EXCHANGE and RENAME_WHITEOUT
+ * are rejected with -EINVAL; RENAME_EXCHANGE is handed to shmem_exchange().
+ */
+static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
+{
+	struct inode *inode = d_inode(old_dentry);
+	int they_are_dirs = S_ISDIR(inode->i_mode);
+
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
+		return -EINVAL;
+
+	if (flags & RENAME_EXCHANGE)
+		return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
+
+	if (!simple_empty(new_dentry))
+		return -ENOTEMPTY;
+
+	if (flags & RENAME_WHITEOUT) {
+		int error;
+
+		error = shmem_whiteout(old_dir, old_dentry);
+		if (error)
+			return error;
+	}
+
+	if (d_really_is_positive(new_dentry)) {
+		(void) shmem_unlink(new_dir, new_dentry);
+		if (they_are_dirs) {
+			drop_nlink(d_inode(new_dentry));
+			drop_nlink(old_dir);
+		}
+	} else if (they_are_dirs) {
+		drop_nlink(old_dir);
+		inc_nlink(new_dir);
+	}
+
+	old_dir->i_size -= BOGO_DIRENT_SIZE;
+	new_dir->i_size += BOGO_DIRENT_SIZE;
+	old_dir->i_ctime = old_dir->i_mtime =
+	new_dir->i_ctime = new_dir->i_mtime =
+	inode->i_ctime = CURRENT_TIME;
+	return 0;
+}
+/*
+ * Create a symbolic link.  Targets whose length including the NUL exceeds
+ * PAGE_CACHE_SIZE get -ENAMETOOLONG.  Targets of up to SHORT_SYMLINK_LEN
+ * bytes are duplicated into info->symlink; longer ones are copied into
+ * page 0 of the inode's mapping.  i_size is the target length without the
+ * NUL.
+ */
+static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	int error;
+	int len;
+	struct inode *inode;
+	struct page *page;
+	char *kaddr;
+	struct shmem_inode_info *info;
+
+	len = strlen(symname) + 1;
+	if (len > PAGE_CACHE_SIZE)
+		return -ENAMETOOLONG;
+
+	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
+	if (!inode)
+		return -ENOSPC;
+
+	error = security_inode_init_security(inode, dir, &dentry->d_name,
+					     shmem_initxattrs, NULL);
+	if (error) {
+		if (error != -EOPNOTSUPP) {
+			iput(inode);
+			return error;
+		}
+		error = 0;
+	}
+
+	info = SHMEM_I(inode);
+	inode->i_size = len-1;
+	if (len <= SHORT_SYMLINK_LEN) {
+		info->symlink = kmemdup(symname, len, GFP_KERNEL);
+		if (!info->symlink) {
+			iput(inode);
+			return -ENOMEM;
+		}
+		inode->i_op = &shmem_short_symlink_operations;
+	} else {
+		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
+		if (error) {
+			iput(inode);
+			return error;
+		}
+		inode->i_mapping->a_ops = &shmem_aops;
+		inode->i_op = &shmem_symlink_inode_operations;
+		kaddr = kmap_atomic(page);
+		memcpy(kaddr, symname, len);
+		kunmap_atomic(kaddr);
+		SetPageUptodate(page);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	dir->i_size += BOGO_DIRENT_SIZE;
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+	d_instantiate(dentry, inode);
+	dget(dentry);
+	return 0;
+}
+/*
+ * ->follow_link for short symlinks: hand the in-inode target string to the
+ * nameidata.  No cookie is returned.
+ */
+static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
+{
+	nd_set_link(nd, SHMEM_I(d_inode(dentry))->symlink);
+	return NULL;
+}
+/*
+ * ->follow_link for page-backed symlinks: fetch page 0, kmap() it as the
+ * link body and return the page as the cookie for shmem_put_link().  On a
+ * shmem_getpage() failure the link is set to ERR_PTR(error) instead.
+ */
+static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct page *page = NULL;
+	int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL);
+	nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
+	if (page)
+		unlock_page(page);
+	return page;
+}
+/*
+ * ->put_link counterpart of shmem_follow_link(): unless the link was set to
+ * an error pointer, kunmap the cookie page and drop its reference.
+ */
+static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+{
+	if (!IS_ERR(nd_get_link(nd))) {
+		struct page *page = cookie;
+		kunmap(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+	}
+}
+
+#ifdef CONFIG_TMPFS_XATTR
+/*
+ * Superblocks without xattr inode operations may get some security.* xattr
+ * support from the LSM "for free". As soon as we have any other xattrs
+ * like ACLs, we also need to implement the security.* handlers at
+ * filesystem level, though.
+ */
+
+/*
+ * Callback for security_inode_init_security() for acquiring xattrs.
+ */
+static int shmem_initxattrs(struct inode *inode,
+			    const struct xattr *xattr_array,
+			    void *fs_info)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	const struct xattr *xattr;
+	struct simple_xattr *new_xattr;
+	size_t len;
+
+	/* @xattr_array is terminated by an entry whose name is NULL */
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
+		if (!new_xattr)
+			return -ENOMEM;
+
+		len = strlen(xattr->name) + 1;
+		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
+					  GFP_KERNEL);
+		if (!new_xattr->name) {
+			kfree(new_xattr);
+			return -ENOMEM;
+		}
+
+		/* stored name is XATTR_SECURITY_PREFIX followed by xattr->name */
+		memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
+		       XATTR_SECURITY_PREFIX_LEN);
+		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
+		       xattr->name, len);
+
+		simple_xattr_list_add(&info->xattrs, new_xattr);
+	}
+
+	return 0;
+}
+
+static const struct xattr_handler *shmem_xattr_handlers[] = {
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	NULL
+};
+/*
+ * Check that @name lies in the security.* or trusted.* namespace.
+ * Returns 0 when it does, -EINVAL when @name is the bare prefix with
+ * nothing after it, and -EOPNOTSUPP for any other namespace.
+ */
+static int shmem_xattr_validate(const char *name)
+{
+	struct { const char *prefix; size_t len; } arr[] = {
+		{ XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
+		{ XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(arr); i++) {
+		size_t preflen = arr[i].len;
+		if (strncmp(name, arr[i].prefix, preflen) == 0) {
+			if (!name[preflen])
+				return -EINVAL;
+			return 0;
+		}
+	}
+	return -EOPNOTSUPP;
+}
+
+static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
+			      void *buffer, size_t size)
+{
+	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+	int err;
+
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_getxattr(dentry, name, buffer, size);
+
+	err = shmem_xattr_validate(name);
+	if (err)
+		return err;
+
+	return simple_xattr_get(&info->xattrs, name, buffer, size);
+}
+
+static int shmem_setxattr(struct dentry *dentry, const char *name,
+			  const void *value, size_t size, int flags)
+{
+	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+	int err;
+
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_setxattr(dentry, name, value, size, flags);
+
+	err = shmem_xattr_validate(name);
+	if (err)
+		return err;
+
+	return simple_xattr_set(&info->xattrs, name, value, size, flags);
+}
+
+static int shmem_removexattr(struct dentry *dentry, const char *name)
+{
+	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+	int err;
+
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_removexattr(dentry, name);
+
+	err = shmem_xattr_validate(name);
+	if (err)
+		return err;
+
+	return simple_xattr_remove(&info->xattrs, name);
+}
+
+static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+	return simple_xattr_list(&info->xattrs, buffer, size);
+}
+#endif /* CONFIG_TMPFS_XATTR */
+
+static const struct inode_operations shmem_short_symlink_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= shmem_follow_short_symlink,
+#ifdef CONFIG_TMPFS_XATTR
+	.setxattr	= shmem_setxattr,
+	.getxattr	= shmem_getxattr,
+	.listxattr	= shmem_listxattr,
+	.removexattr	= shmem_removexattr,
+#endif
+};
+
+static const struct inode_operations shmem_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= shmem_follow_link,
+	.put_link	= shmem_put_link,
+#ifdef CONFIG_TMPFS_XATTR
+	.setxattr	= shmem_setxattr,
+	.getxattr	= shmem_getxattr,
+	.listxattr	= shmem_listxattr,
+	.removexattr	= shmem_removexattr,
+#endif
+};
+/*
+ * Export operation: looking up a parent is not supported, so every request
+ * is answered with -ESTALE.
+ */
+static struct dentry *shmem_get_parent(struct dentry *child)
+{
+	return ERR_PTR(-ESTALE);
+}
+/*
+ * ilookup5() match callback.  @vfh is the raw file handle: word 0 is the
+ * generation, words 1 and 2 the low and high halves of the inode number.
+ */
+static int shmem_match(struct inode *ino, void *vfh)
+{
+	__u32 *fh = vfh;
+	__u64 inum = fh[2];
+	inum = (inum << 32) | fh[1];
+	return ino->i_ino == inum && fh[0] == ino->i_generation;
+}
+/*
+ * Decode a file handle produced by shmem_encode_fh().  Returns NULL when
+ * the handle is shorter than three words or no matching hashed inode is
+ * found; otherwise whatever d_find_alias() yields for that inode.
+ */
+static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	struct inode *inode;
+	struct dentry *dentry = NULL;
+	u64 inum;
+
+	if (fh_len < 3)
+		return NULL;
+
+	inum = fid->raw[2];
+	inum = (inum << 32) | fid->raw[1];
+
+	/* hash value mirrors the one used in shmem_encode_fh() */
+	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
+			shmem_match, fid->raw);
+	if (inode) {
+		dentry = d_find_alias(inode);
+		iput(inode);
+	}
+
+	return dentry;
+}
+/*
+ * Encode @inode as a three-word handle (generation, inode number low,
+ * inode number high).  When *@len is too small it is set to 3 and
+ * FILEID_INVALID is returned.  @parent is not encoded.
+ */
+static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
+				struct inode *parent)
+{
+	if (*len < 3) {
+		*len = 3;
+		return FILEID_INVALID;
+	}
+
+	if (inode_unhashed(inode)) {
+		/* Unfortunately insert_inode_hash is not idempotent,
+		 * so as we hash inodes here rather than at creation
+		 * time, we need a lock to ensure we only try
+		 * to do it once
+		 */
+		static DEFINE_SPINLOCK(lock);
+		spin_lock(&lock);
+		if (inode_unhashed(inode))
+			__insert_inode_hash(inode,
+					    inode->i_ino + inode->i_generation);
+		spin_unlock(&lock);
+	}
+
+	fh[0] = inode->i_generation;
+	fh[1] = inode->i_ino;
+	fh[2] = ((__u64)inode->i_ino) >> 32;
+
+	*len = 3;
+	return 1;
+}
+
+static const struct export_operations shmem_export_ops = {
+	.get_parent     = shmem_get_parent,
+	.encode_fh      = shmem_encode_fh,
+	.fh_to_dentry	= shmem_fh_to_dentry,
+};
+/*
+ * Parse the comma-separated mount option string into @sbinfo.  @options is
+ * modified in place (separators are overwritten with NULs).  Recognised
+ * keys: size, nr_blocks, nr_inodes, mode, uid, gid and mpol; mode, uid and
+ * gid are skipped when @remount is true.  Every option must carry "=value".
+ *
+ * Returns 0 on success and 1 (not a negative errno) on any error, after
+ * logging it and dropping the parsed mempolicy.  On success the parsed
+ * mempolicy, or NULL if none was given, is stored in sbinfo->mpol.
+ */
+static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
+			       bool remount)
+{
+	char *this_char, *value, *rest;
+	struct mempolicy *mpol = NULL;
+	uid_t uid;
+	gid_t gid;
+
+	while (options != NULL) {
+		this_char = options;
+		for (;;) {
+			/*
+			 * NUL-terminate this option: unfortunately,
+			 * mount options form a comma-separated list,
+			 * but mpol's nodelist may also contain commas.
+			 */
+			options = strchr(options, ',');
+			if (options == NULL)
+				break;
+			options++;
+			if (!isdigit(*options)) {
+				options[-1] = '\0';
+				break;
+			}
+		}
+		if (!*this_char)
+			continue;
+		if ((value = strchr(this_char,'=')) != NULL) {
+			*value++ = 0;
+		} else {
+			printk(KERN_ERR
+			    "tmpfs: No value for mount option '%s'\n",
+			    this_char);
+			goto error;
+		}
+
+		if (!strcmp(this_char,"size")) {
+			unsigned long long size;
+			size = memparse(value,&rest);
+			if (*rest == '%') {
+				/* percentage of total RAM */
+				size <<= PAGE_SHIFT;
+				size *= totalram_pages;
+				do_div(size, 100);
+				rest++;
+			}
+			if (*rest)
+				goto bad_val;
+			sbinfo->max_blocks =
+				DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
+		} else if (!strcmp(this_char,"nr_blocks")) {
+			sbinfo->max_blocks = memparse(value, &rest);
+			if (*rest)
+				goto bad_val;
+		} else if (!strcmp(this_char,"nr_inodes")) {
+			sbinfo->max_inodes = memparse(value, &rest);
+			if (*rest)
+				goto bad_val;
+		} else if (!strcmp(this_char,"mode")) {
+			if (remount)
+				continue;
+			sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
+			if (*rest)
+				goto bad_val;
+		} else if (!strcmp(this_char,"uid")) {
+			if (remount)
+				continue;
+			uid = simple_strtoul(value, &rest, 0);
+			if (*rest)
+				goto bad_val;
+			sbinfo->uid = make_kuid(current_user_ns(), uid);
+			if (!uid_valid(sbinfo->uid))
+				goto bad_val;
+		} else if (!strcmp(this_char,"gid")) {
+			if (remount)
+				continue;
+			gid = simple_strtoul(value, &rest, 0);
+			if (*rest)
+				goto bad_val;
+			sbinfo->gid = make_kgid(current_user_ns(), gid);
+			if (!gid_valid(sbinfo->gid))
+				goto bad_val;
+		} else if (!strcmp(this_char,"mpol")) {
+			/* a later mpol= option replaces an earlier one */
+			mpol_put(mpol);
+			mpol = NULL;
+			if (mpol_parse_str(value, &mpol))
+				goto bad_val;
+		} else {
+			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
+			       this_char);
+			goto error;
+		}
+	}
+	sbinfo->mpol = mpol;
+	return 0;
+
+bad_val:
+	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
+	       value, this_char);
+error:
+	mpol_put(mpol);
+	return 1;
+
+}
+/*
+ * Apply new limits on remount.  The options are parsed into a stack copy
+ * of the superblock info and only committed, under stat_lock, when the new
+ * limits pass the checks below; otherwise -EINVAL is returned and the
+ * superblock is left unchanged.
+ */
+static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	struct shmem_sb_info config = *sbinfo;
+	unsigned long inodes;
+	int error = -EINVAL;
+
+	config.mpol = NULL;
+	if (shmem_parse_options(data, &config, true))
+		return error;
+
+	spin_lock(&sbinfo->stat_lock);
+	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+	if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
+		goto out;
+	if (config.max_inodes < inodes)
+		goto out;
+	/*
+	 * Those tests disallow limited->unlimited while any are in use;
+	 * but we must separately disallow unlimited->limited, because
+	 * in that case we have no record of how much is already in use.
+	 */
+	if (config.max_blocks && !sbinfo->max_blocks)
+		goto out;
+	if (config.max_inodes && !sbinfo->max_inodes)
+		goto out;
+
+	error = 0;
+	sbinfo->max_blocks  = config.max_blocks;
+	sbinfo->max_inodes  = config.max_inodes;
+	sbinfo->free_inodes = config.max_inodes - inodes;
+
+	/*
+	 * Preserve previous mempolicy unless mpol remount option was specified.
+	 */
+	if (config.mpol) {
+		mpol_put(sbinfo->mpol);
+		sbinfo->mpol = config.mpol;	/* transfers initial ref */
+	}
+out:
+	spin_unlock(&sbinfo->stat_lock);
+	return error;
+}
+/*
+ * Emit the mount options that differ from the defaults (size, nr_inodes,
+ * mode, uid, gid) followed by the mempolicy.  uid and gid are printed as
+ * seen from init_user_ns.
+ */
+static int shmem_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
+
+	if (sbinfo->max_blocks != shmem_default_max_blocks())
+		seq_printf(seq, ",size=%luk",
+			sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
+	if (sbinfo->max_inodes != shmem_default_max_inodes())
+		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
+	if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
+		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
+	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
+		seq_printf(seq, ",uid=%u",
+				from_kuid_munged(&init_user_ns, sbinfo->uid));
+	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
+		seq_printf(seq, ",gid=%u",
+				from_kgid_munged(&init_user_ns, sbinfo->gid));
+	shmem_show_mpol(seq, sbinfo->mpol);
+	return 0;
+}
+
+#define MFD_NAME_PREFIX "memfd:"
+#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
+#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
+
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
+/*
+ * memfd_create(2): create an unlinked shmem file named "memfd:<uname>" and
+ * return a new file descriptor for it.
+ *
+ * Unknown flag bits give -EINVAL.  The user-supplied name may be at most
+ * MFD_NAME_MAX_LEN bytes plus the terminating NUL (-EINVAL if longer,
+ * -EFAULT if it cannot be read or changed underneath us).  The file is
+ * opened O_RDWR | O_LARGEFILE; F_SEAL_SEAL is cleared from the inode's
+ * seals only when MFD_ALLOW_SEALING is passed.
+ */
+SYSCALL_DEFINE2(memfd_create,
+		const char __user *, uname,
+		unsigned int, flags)
+{
+	struct shmem_inode_info *info;
+	struct file *file;
+	int fd, error;
+	char *name;
+	long len;
+
+	if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+		return -EINVAL;
+
+	/* length includes terminating zero */
+	len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
+	if (len <= 0)
+		return -EFAULT;
+	if (len > MFD_NAME_MAX_LEN + 1)
+		return -EINVAL;
+
+	name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY);
+	if (!name)
+		return -ENOMEM;
+
+	strcpy(name, MFD_NAME_PREFIX);
+	if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+		error = -EFAULT;
+		goto err_name;
+	}
+
+	/* terminating-zero may have changed after strnlen_user() returned */
+	if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
+		error = -EFAULT;
+		goto err_name;
+	}
+
+	fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+	if (fd < 0) {
+		error = fd;
+		goto err_name;
+	}
+
+	file = shmem_file_setup(name, 0, VM_NORESERVE);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto err_fd;
+	}
+	info = SHMEM_I(file_inode(file));
+	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+	file->f_flags |= O_RDWR | O_LARGEFILE;
+	if (flags & MFD_ALLOW_SEALING)
+		info->seals &= ~F_SEAL_SEAL;
+
+	fd_install(fd, file);
+	kfree(name);
+	return fd;
+
+err_fd:
+	put_unused_fd(fd);
+err_name:
+	kfree(name);
+	return error;
+}
+
+#endif /* CONFIG_TMPFS */
+/*
+ * Release the per-superblock shmem state.  Also used as the error path of
+ * shmem_fill_super().
+ */
+static void shmem_put_super(struct super_block *sb)
+{
+	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
+	percpu_counter_destroy(&sbinfo->used_blocks);
+	mpol_put(sbinfo->mpol);
+	kfree(sbinfo);
+	sb->s_fs_info = NULL;
+}
+/*
+ * Initialise a tmpfs/shmem superblock and its root directory.
+ *
+ * Returns 0 on success, -EINVAL when the mount options do not parse, and
+ * -ENOMEM for every other failure (sbinfo allocation, percpu counter,
+ * root inode or root dentry).  After sbinfo has been attached to @sb, all
+ * failures unwind through shmem_put_super().
+ */
+int shmem_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+	struct shmem_sb_info *sbinfo;
+	int err = -ENOMEM;
+
+	/* Round up to L1_CACHE_BYTES to resist false sharing */
+	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
+				L1_CACHE_BYTES), GFP_KERNEL);
+	if (!sbinfo)
+		return -ENOMEM;
+
+	sbinfo->mode = S_IRWXUGO | S_ISVTX;
+	sbinfo->uid = current_fsuid();
+	sbinfo->gid = current_fsgid();
+	sb->s_fs_info = sbinfo;
+
+#ifdef CONFIG_TMPFS
+	/*
+	 * Per default we only allow half of the physical ram per
+	 * tmpfs instance, limiting inodes to one per page of lowmem;
+	 * but the internal instance is left unlimited.
+	 */
+	if (!(sb->s_flags & MS_KERNMOUNT)) {
+		sbinfo->max_blocks = shmem_default_max_blocks();
+		sbinfo->max_inodes = shmem_default_max_inodes();
+		if (shmem_parse_options(data, sbinfo, false)) {
+			err = -EINVAL;
+			goto failed;
+		}
+	} else {
+		sb->s_flags |= MS_NOUSER;
+	}
+	sb->s_export_op = &shmem_export_ops;
+	sb->s_flags |= MS_NOSEC;
+#else
+	sb->s_flags |= MS_NOUSER;
+#endif
+
+	spin_lock_init(&sbinfo->stat_lock);
+	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
+		goto failed;
+	sbinfo->free_inodes = sbinfo->max_inodes;
+
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = TMPFS_MAGIC;
+	sb->s_op = &shmem_ops;
+	sb->s_time_gran = 1;
+#ifdef CONFIG_TMPFS_XATTR
+	sb->s_xattr = shmem_xattr_handlers;
+#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	sb->s_flags |= MS_POSIXACL;
+#endif
+
+	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
+	if (!inode)
+		goto failed;
+	inode->i_uid = sbinfo->uid;
+	inode->i_gid = sbinfo->gid;
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
+		goto failed;
+	return 0;
+
+failed:
+	shmem_put_super(sb);
+	return err;
+}
+
+static struct kmem_cache *shmem_inode_cachep;
+/*
+ * ->alloc_inode: allocate a shmem_inode_info from the slab cache and hand
+ * back its embedded VFS inode, or NULL on allocation failure.
+ */
+static struct inode *shmem_alloc_inode(struct super_block *sb)
+{
+	struct shmem_inode_info *info;
+	info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
+	if (!info)
+		return NULL;
+	return &info->vfs_inode;
+}
+/* RCU callback queued by shmem_destroy_inode(): free the slab object. */
+static void shmem_destroy_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+}
+/*
+ * ->destroy_inode: free the shared mempolicy of regular files, then defer
+ * freeing of the inode itself to an RCU callback.
+ */
+static void shmem_destroy_inode(struct inode *inode)
+{
+	if (S_ISREG(inode->i_mode))
+		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
+	call_rcu(&inode->i_rcu, shmem_destroy_callback);
+}
+/* Slab constructor for shmem_inode_cachep objects. */
+static void shmem_init_inode(void *foo)
+{
+	struct shmem_inode_info *info = foo;
+	inode_init_once(&info->vfs_inode);
+}
+/*
+ * Create the inode slab cache.  SLAB_PANIC is passed, and this function
+ * itself always returns 0.
+ */
+static int shmem_init_inodecache(void)
+{
+	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
+				sizeof(struct shmem_inode_info),
+				0, SLAB_PANIC, shmem_init_inode);
+	return 0;
+}
+
+static void shmem_destroy_inodecache(void)
+{
+	kmem_cache_destroy(shmem_inode_cachep);
+}
+
+static const struct address_space_operations shmem_aops = {
+	.writepage	= shmem_writepage,
+	.set_page_dirty	= __set_page_dirty_no_writeback,
+#ifdef CONFIG_TMPFS
+	.write_begin	= shmem_write_begin,
+	.write_end	= shmem_write_end,
+#endif
+#ifdef CONFIG_MIGRATION
+	.migratepage	= migrate_page,
+#endif
+	.error_remove_page = generic_error_remove_page,
+};
+
+static const struct file_operations shmem_file_operations = {
+	.mmap		= shmem_mmap,
+#ifdef CONFIG_TMPFS
+	.llseek		= shmem_file_llseek,
+	.read_iter	= shmem_file_read_iter,
+	.write_iter	= generic_file_write_iter,
+	.fsync		= noop_fsync,
+	.splice_read	= shmem_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.fallocate	= shmem_fallocate,
+#endif
+};
+
+static const struct inode_operations shmem_inode_operations = {
+	.setattr	= shmem_setattr,
+#ifdef CONFIG_TMPFS_XATTR
+	.setxattr	= shmem_setxattr,
+	.getxattr	= shmem_getxattr,
+	.listxattr	= shmem_listxattr,
+	.removexattr	= shmem_removexattr,
+	.set_acl	= simple_set_acl,
+#endif
+};
+
+static const struct inode_operations shmem_dir_inode_operations = {
+#ifdef CONFIG_TMPFS
+	.create		= shmem_create,
+	.lookup		= simple_lookup,
+	.link		= shmem_link,
+	.unlink		= shmem_unlink,
+	.symlink	= shmem_symlink,
+	.mkdir		= shmem_mkdir,
+	.rmdir		= shmem_rmdir,
+	.mknod		= shmem_mknod,
+	.rename2	= shmem_rename2,
+	.tmpfile	= shmem_tmpfile,
+#endif
+#ifdef CONFIG_TMPFS_XATTR
+	.setxattr	= shmem_setxattr,
+	.getxattr	= shmem_getxattr,
+	.listxattr	= shmem_listxattr,
+	.removexattr	= shmem_removexattr,
+#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	.setattr	= shmem_setattr,
+	.set_acl	= simple_set_acl,
+#endif
+};
+
+static const struct inode_operations shmem_special_inode_operations = {
+#ifdef CONFIG_TMPFS_XATTR
+	.setxattr	= shmem_setxattr,
+	.getxattr	= shmem_getxattr,
+	.listxattr	= shmem_listxattr,
+	.removexattr	= shmem_removexattr,
+#endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	.setattr	= shmem_setattr,
+	.set_acl	= simple_set_acl,
+#endif
+};
+
+static const struct super_operations shmem_ops = {
+	.alloc_inode	= shmem_alloc_inode,
+	.destroy_inode	= shmem_destroy_inode,
+#ifdef CONFIG_TMPFS
+	.statfs		= shmem_statfs,
+	.remount_fs	= shmem_remount_fs,
+	.show_options	= shmem_show_options,
+#endif
+	.evict_inode	= shmem_evict_inode,
+	.drop_inode	= generic_delete_inode,
+	.put_super	= shmem_put_super,
+};
+
+static const struct vm_operations_struct shmem_vm_ops = {
+	.fault		= shmem_fault,
+	.map_pages	= filemap_map_pages,
+#ifdef CONFIG_NUMA
+	.set_policy     = shmem_set_policy,
+	.get_policy     = shmem_get_policy,
+#endif
+};
+/* ->mount: tmpfs has no backing device, so @dev_name is not used. */
+static struct dentry *shmem_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return mount_nodev(fs_type, flags, data, shmem_fill_super);
+}
+
+static struct file_system_type shmem_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "tmpfs",
+	.mount		= shmem_mount,
+	.kill_sb	= kill_litter_super,
+	.fs_flags	= FS_USERNS_MOUNT,
+};
+/*
+ * Set up the inode cache, register the "tmpfs" filesystem type and create
+ * the internal kernel mount shm_mnt.  A second call is a no-op once the
+ * inode cache exists.  On failure the completed steps are undone in
+ * reverse order and shm_mnt is left holding ERR_PTR(error).
+ */
+int __init shmem_init(void)
+{
+	int error;
+
+	/* If rootfs called this, don't re-init */
+	if (shmem_inode_cachep)
+		return 0;
+
+	error = shmem_init_inodecache();
+	if (error)
+		goto out3;
+
+	error = register_filesystem(&shmem_fs_type);
+	if (error) {
+		printk(KERN_ERR "Could not register tmpfs\n");
+		goto out2;
+	}
+
+	shm_mnt = kern_mount(&shmem_fs_type);
+	if (IS_ERR(shm_mnt)) {
+		error = PTR_ERR(shm_mnt);
+		printk(KERN_ERR "Could not kern_mount tmpfs\n");
+		goto out1;
+	}
+	return 0;
+
+out1:
+	unregister_filesystem(&shmem_fs_type);
+out2:
+	shmem_destroy_inodecache();
+out3:
+	shm_mnt = ERR_PTR(error);
+	return error;
+}
+
+#else /* !CONFIG_SHMEM */
+
+/*
+ * tiny-shmem: simple shmemfs and tmpfs using ramfs code
+ *
+ * This is intended for small system where the benefits of the full
+ * shmem code (swap-backed and resource-limited) are outweighed by
+ * their complexity. On systems without swap this code should be
+ * effectively equivalent, but much lighter weight.
+ */
+
+static struct file_system_type shmem_fs_type = {
+	.name		= "tmpfs",
+	.mount		= ramfs_mount,
+	.kill_sb	= kill_litter_super,
+	.fs_flags	= FS_USERNS_MOUNT,
+};
+/*
+ * tiny-shmem init: register the type and create shm_mnt; either step
+ * failing is treated as fatal (BUG_ON).
+ */
+int __init shmem_init(void)
+{
+	BUG_ON(register_filesystem(&shmem_fs_type) != 0);
+
+	shm_mnt = kern_mount(&shmem_fs_type);
+	BUG_ON(IS_ERR(shm_mnt));
+
+	return 0;
+}
+/* tiny-shmem stub: does nothing and reports success. */
+int shmem_unuse(swp_entry_t swap, struct page *page)
+{
+	return 0;
+}
+/* tiny-shmem stub: does nothing and reports success. */
+int shmem_lock(struct file *file, int lock, struct user_struct *user)
+{
+	return 0;
+}
+/* tiny-shmem stub: nothing to do. */
+void shmem_unlock_mapping(struct address_space *mapping)
+{
+}
+/* tiny-shmem: truncation is delegated to truncate_inode_pages_range(). */
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
+}
+EXPORT_SYMBOL_GPL(shmem_truncate_range);
+
+#define shmem_vm_ops				generic_file_vm_ops
+#define shmem_file_operations			ramfs_file_operations
+#define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
+#define shmem_acct_size(flags, size)		0
+#define shmem_unacct_size(flags, size)		do {} while (0)
+
+#endif /* CONFIG_SHMEM */
+
+/* common code */
+
+static struct dentry_operations anon_ops = {
+	.d_dname = simple_dname
+};
+/*
+ * Common worker for shmem_file_setup(), shmem_kernel_file_setup() and
+ * shmem_zero_setup(): build an unlinked regular file of @size bytes on the
+ * internal shm_mnt mount.  @i_flags is OR-ed into the new inode's i_flags.
+ *
+ * Returns the new file or an ERR_PTR(): the error stored in shm_mnt if the
+ * internal mount failed, -EINVAL for a negative or too large @size,
+ * -ENOMEM when accounting or dentry allocation fails, -ENOSPC when no
+ * inode is available, or the error from
+ * ramfs_nommu_expand_for_mapping() / alloc_file().
+ */
+static struct file *__shmem_file_setup(const char *name, loff_t size,
+				       unsigned long flags, unsigned int i_flags)
+{
+	struct file *res;
+	struct inode *inode;
+	struct path path;
+	struct super_block *sb;
+	struct qstr this;
+
+	if (IS_ERR(shm_mnt))
+		return ERR_CAST(shm_mnt);
+
+	if (size < 0 || size > MAX_LFS_FILESIZE)
+		return ERR_PTR(-EINVAL);
+
+	if (shmem_acct_size(flags, size))
+		return ERR_PTR(-ENOMEM);
+
+	res = ERR_PTR(-ENOMEM);
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = 0; /* will go */
+	sb = shm_mnt->mnt_sb;
+	path.mnt = mntget(shm_mnt);
+	path.dentry = d_alloc_pseudo(sb, &this);
+	if (!path.dentry)
+		goto put_memory;
+	d_set_d_op(path.dentry, &anon_ops);
+
+	res = ERR_PTR(-ENOSPC);
+	inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
+	if (!inode)
+		goto put_memory;
+
+	inode->i_flags |= i_flags;
+	d_instantiate(path.dentry, inode);
+	inode->i_size = size;
+	clear_nlink(inode);	/* It is unlinked */
+	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
+	if (IS_ERR(res))
+		goto put_path;
+
+	res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
+		  &shmem_file_operations);
+	if (IS_ERR(res))
+		goto put_path;
+
+	return res;
+
+put_memory:
+	shmem_unacct_size(flags, size);
+put_path:
+	path_put(&path);
+	return res;
+}
+
+/**
+ * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
+ * 	kernel internal.  There will be NO LSM permission checks against the
+ * 	underlying inode.  So users of this interface must do LSM checks at a
+ *	higher layer.  The one user is the big_key implementation.  LSM checks
+ *	are provided at the key level rather than the inode level.
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ *
+ * The inode is created with S_PRIVATE set.  Returns the new file or an
+ * ERR_PTR() from __shmem_file_setup().
+ */
+struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
+{
+	return __shmem_file_setup(name, size, flags, S_PRIVATE);
+}
+
+/**
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ *
+ * No extra inode flags are set.  Returns the new file or an ERR_PTR() from
+ * __shmem_file_setup().
+ */
+struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
+{
+	return __shmem_file_setup(name, size, flags, 0);
+}
+EXPORT_SYMBOL_GPL(shmem_file_setup);
+
+/**
+ * shmem_zero_setup - setup a shared anonymous mapping
+ * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ *
+ * Creates a "dev/zero" shmem file sized to the vma, drops any file the vma
+ * already referenced, and installs the new file together with
+ * shmem_vm_ops.  Returns 0, or the error from __shmem_file_setup() with
+ * the vma left untouched.
+ */
+int shmem_zero_setup(struct vm_area_struct *vma)
+{
+	struct file *file;
+	loff_t size = vma->vm_end - vma->vm_start;
+
+	/*
+	 * Cloning a new file under mmap_sem leads to a lock ordering conflict
+	 * between XFS directory reading and selinux: since this file is only
+	 * accessible to the user through its mapping, use S_PRIVATE flag to
+	 * bypass file security, in the same way as shmem_kernel_file_setup().
+	 */
+	file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	vma->vm_file = file;
+	vma->vm_ops = &shmem_vm_ops;
+	return 0;
+}
+
+/**
+ * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping:	the page's address_space
+ * @index:	the page index
+ * @gfp:	the page allocator flags to use if allocating
+ *
+ * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
+ * with any new page allocations done using the specified allocation flags.
+ * But read_cache_page_gfp() uses the ->readpage() method: which does not + * suit tmpfs, since it may have pages in swapcache, and needs to find those + * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. + * + * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in + * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. + */ +struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ +#ifdef CONFIG_SHMEM + struct inode *inode = mapping->host; + struct page *page; + int error; + + BUG_ON(mapping->a_ops != &shmem_aops); + error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); + if (error) + page = ERR_PTR(error); + else + unlock_page(page); + return page; +#else + /* + * The tiny !SHMEM case uses ramfs without swap + */ + return read_cache_page_gfp(mapping, index, gfp); +#endif +} +EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); diff --git a/kernel/mm/slab.c b/kernel/mm/slab.c new file mode 100644 index 000000000..7eb38dd1c --- /dev/null +++ b/kernel/mm/slab.c @@ -0,0 +1,4240 @@ +/* + * linux/mm/slab.c + * Written by Mark Hemment, 1996/97. + * (markhe@nextd.demon.co.uk) + * + * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli + * + * Major cleanup, different bufctl logic, per-cpu arrays + * (c) 2000 Manfred Spraul + * + * Cleanup, make the head arrays unconditional, preparation for NUMA + * (c) 2002 Manfred Spraul + * + * An implementation of the Slab Allocator as described in outline in; + * UNIX Internals: The New Frontiers by Uresh Vahalia + * Pub: Prentice Hall ISBN 0-13-101908-2 + * or with a little more detail in; + * The Slab Allocator: An Object-Caching Kernel Memory Allocator + * Jeff Bonwick (Sun Microsystems). + * Presented at: USENIX Summer 1994 Technical Conference + * + * The memory is organized in caches, one cache for each object type. + * (e.g. 
inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs;
 * otherwise they come from empty slabs, or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array. Most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *	are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *	and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
+ * + * Many thanks to Mark Hemment, who wrote another per-cpu slab patch + * in 2000 - many ideas in the current implementation are derived from + * his patch. + * + * Further notes from the original documentation: + * + * 11 April '97. Started multi-threading - markhe + * The global cache-chain is protected by the mutex 'slab_mutex'. + * The sem is only needed when accessing/extending the cache-chain, which + * can never happen inside an interrupt (kmem_cache_create(), + * kmem_cache_shrink() and kmem_cache_reap()). + * + * At present, each engine can be growing a cache. This should be blocked. + * + * 15 March 2005. NUMA slab allocator. + * Shai Fultheim . + * Shobhit Dayal + * Alok N Kataria + * Christoph Lameter + * + * Modified the slab allocator to be node aware on NUMA systems. + * Each node has its own list of partial, free and full slabs. + * All object allocations for a node occur from node specific slab lists. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +#include "internal.h" + +#include "slab.h" + +/* + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. + * 0 for faster, smaller code (especially in the critical paths). + * + * STATS - 1 to collect stats for /proc/slabinfo. + * 0 for faster, smaller code (especially in the critical paths). + * + * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) + */ + +#ifdef CONFIG_DEBUG_SLAB +#define DEBUG 1 +#define STATS 1 +#define FORCED_DEBUG 1 +#else +#define DEBUG 0 +#define STATS 0 +#define FORCED_DEBUG 0 +#endif + +/* Shouldn't this be in a header file somewhere? 
*/
/* Size of one machine word, taken as the size of a pointer. */
#define	BYTES_PER_WORD		sizeof(void *)
/* Red zones are aligned to the larger of a word and unsigned long long. */
#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))

#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif

/*
 * Evaluates to 1 when (PAGE_SIZE >> BITS_PER_BYTE) does not exceed
 * SLAB_OBJ_MIN_SIZE, in which case a one-byte freelist index type is
 * selected below; otherwise 0 and a two-byte index type is used.
 */
#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
				<= SLAB_OBJ_MIN_SIZE) ? 1 : 0)

#if FREELIST_BYTE_INDEX
typedef unsigned char freelist_idx_t;
#else
typedef unsigned short freelist_idx_t;
#endif

/* Largest value that fits in a freelist_idx_t. */
#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)

/*
 * true if a page was allocated from pfmemalloc reserves for network-based
 * swap
 */
static bool pfmemalloc_active __read_mostly;

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 * Fields:
 * @avail:      number of object pointers currently held in @entry
 * @limit:      capacity bound for @entry
 * @batchcount: batch size recorded at initialisation
 *              (NOTE(review): its consumers are outside this excerpt)
 * @touched:    cleared at initialisation
 *              (NOTE(review): its consumers are outside this excerpt)
 * @entry:      trailing array of object pointers, see the note below
 *
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 *
			 * Entries should not be directly dereferenced as
			 * entries belonging to slabs marked pfmemalloc will
			 * have the lower bits set SLAB_OBJ_PFMEMALLOC
			 */
};

/* An array_cache paired with the spinlock that guards it. */
struct alien_cache {
	spinlock_t lock;
	struct array_cache ac;
};

/* Tag bit kept in the low bit of an object pointer stored in entry[]. */
#define SLAB_OBJ_PFMEMALLOC	1
/* Return true if the pfmemalloc tag bit is set in @objp. */
static inline bool is_obj_pfmemalloc(void *objp)
{
	return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
}

/* Set the pfmemalloc tag bit in the pointer stored at @objp. */
static inline void set_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
	return;
}

/* Clear the pfmemalloc tag bit in the pointer stored at @objp. */
static inline void clear_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
}

/*
 * bootstrap: The caches do not work without cpuarrays anymore, but the
 * cpuarrays are allocated from the generic caches...
+ */ +#define BOOT_CPUCACHE_ENTRIES 1 +struct arraycache_init { + struct array_cache cache; + void *entries[BOOT_CPUCACHE_ENTRIES]; +}; + +/* + * Need this for bootstrapping a per node allocator. + */ +#define NUM_INIT_LISTS (2 * MAX_NUMNODES) +static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; +#define CACHE_CACHE 0 +#define SIZE_NODE (MAX_NUMNODES) + +static int drain_freelist(struct kmem_cache *cache, + struct kmem_cache_node *n, int tofree); +static void free_block(struct kmem_cache *cachep, void **objpp, int len, + int node, struct list_head *list); +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); +static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); +static void cache_reap(struct work_struct *unused); + +static int slab_early_init = 1; + +#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) + +static void kmem_cache_node_init(struct kmem_cache_node *parent) +{ + INIT_LIST_HEAD(&parent->slabs_full); + INIT_LIST_HEAD(&parent->slabs_partial); + INIT_LIST_HEAD(&parent->slabs_free); + parent->shared = NULL; + parent->alien = NULL; + parent->colour_next = 0; + spin_lock_init(&parent->list_lock); + parent->free_objects = 0; + parent->free_touched = 0; +} + +#define MAKE_LIST(cachep, listp, slab, nodeid) \ + do { \ + INIT_LIST_HEAD(listp); \ + list_splice(&get_node(cachep, nodeid)->slab, listp); \ + } while (0) + +#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ + do { \ + MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ + MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ + MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ + } while (0) + +#define CFLGS_OFF_SLAB (0x80000000UL) +#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) + +#define BATCHREFILL_LIMIT 16 +/* + * Optimization question: fewer reaps means less probability for unnessary + * cpucache drain/refill cycles. 
+ * + * OTOH the cpuarrays can contain lots of objects, + * which could lock up otherwise freeable slabs. + */ +#define REAPTIMEOUT_AC (2*HZ) +#define REAPTIMEOUT_NODE (4*HZ) + +#if STATS +#define STATS_INC_ACTIVE(x) ((x)->num_active++) +#define STATS_DEC_ACTIVE(x) ((x)->num_active--) +#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) +#define STATS_INC_GROWN(x) ((x)->grown++) +#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) +#define STATS_SET_HIGH(x) \ + do { \ + if ((x)->num_active > (x)->high_mark) \ + (x)->high_mark = (x)->num_active; \ + } while (0) +#define STATS_INC_ERR(x) ((x)->errors++) +#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) +#define STATS_INC_NODEFREES(x) ((x)->node_frees++) +#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) +#define STATS_SET_FREEABLE(x, i) \ + do { \ + if ((x)->max_freeable < i) \ + (x)->max_freeable = i; \ + } while (0) +#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) +#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) +#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) +#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) +#else +#define STATS_INC_ACTIVE(x) do { } while (0) +#define STATS_DEC_ACTIVE(x) do { } while (0) +#define STATS_INC_ALLOCED(x) do { } while (0) +#define STATS_INC_GROWN(x) do { } while (0) +#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) +#define STATS_SET_HIGH(x) do { } while (0) +#define STATS_INC_ERR(x) do { } while (0) +#define STATS_INC_NODEALLOCS(x) do { } while (0) +#define STATS_INC_NODEFREES(x) do { } while (0) +#define STATS_INC_ACOVERFLOW(x) do { } while (0) +#define STATS_SET_FREEABLE(x, i) do { } while (0) +#define STATS_INC_ALLOCHIT(x) do { } while (0) +#define STATS_INC_ALLOCMISS(x) do { } while (0) +#define STATS_INC_FREEHIT(x) do { } while (0) +#define STATS_INC_FREEMISS(x) do { } while (0) +#endif + +#if DEBUG + +/* + * memory layout of objects: + * 0 : objp + * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. 
This ensures that + * the end of an object is aligned with the end of the real + * allocation. Catches writes behind the end of the allocation. + * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: + * redzone word. + * cachep->obj_offset: The real object. + * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] + * cachep->size - 1* BYTES_PER_WORD: last caller address + * [BYTES_PER_WORD long] + */ +static int obj_offset(struct kmem_cache *cachep) +{ + return cachep->obj_offset; +} + +static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); + return (unsigned long long*) (objp + obj_offset(cachep) - + sizeof(unsigned long long)); +} + +static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); + if (cachep->flags & SLAB_STORE_USER) + return (unsigned long long *)(objp + cachep->size - + sizeof(unsigned long long) - + REDZONE_ALIGN); + return (unsigned long long *) (objp + cachep->size - + sizeof(unsigned long long)); +} + +static void **dbg_userword(struct kmem_cache *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_STORE_USER)); + return (void **)(objp + cachep->size - BYTES_PER_WORD); +} + +#else + +#define obj_offset(x) 0 +#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) +#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) +#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) + +#endif + +#define OBJECT_FREE (0) +#define OBJECT_ACTIVE (1) + +#ifdef CONFIG_DEBUG_SLAB_LEAK + +static void set_obj_status(struct page *page, int idx, int val) +{ + int freelist_size; + char *status; + struct kmem_cache *cachep = page->slab_cache; + + freelist_size = cachep->num * sizeof(freelist_idx_t); + status = (char *)page->freelist + freelist_size; + status[idx] = val; +} + +static inline unsigned int get_obj_status(struct page *page, int idx) +{ + int 
freelist_size; + char *status; + struct kmem_cache *cachep = page->slab_cache; + + freelist_size = cachep->num * sizeof(freelist_idx_t); + status = (char *)page->freelist + freelist_size; + + return status[idx]; +} + +#else +static inline void set_obj_status(struct page *page, int idx, int val) {} + +#endif + +/* + * Do not go above this order unless 0 objects fit into the slab or + * overridden on the command line. + */ +#define SLAB_MAX_ORDER_HI 1 +#define SLAB_MAX_ORDER_LO 0 +static int slab_max_order = SLAB_MAX_ORDER_LO; +static bool slab_max_order_set __initdata; + +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct page *page = virt_to_head_page(obj); + return page->slab_cache; +} + +static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, + unsigned int idx) +{ + return page->s_mem + cache->size * idx; +} + +/* + * We want to avoid an expensive divide : (offset / cache->size) + * Using the fact that size is a constant for a particular cache, + * we can replace (offset / cache->size) by + * reciprocal_divide(offset, cache->reciprocal_buffer_size) + */ +static inline unsigned int obj_to_index(const struct kmem_cache *cache, + const struct page *page, void *obj) +{ + u32 offset = (obj - page->s_mem); + return reciprocal_divide(offset, cache->reciprocal_buffer_size); +} + +/* internal cache of cache description objs */ +static struct kmem_cache kmem_cache_boot = { + .batchcount = 1, + .limit = BOOT_CPUCACHE_ENTRIES, + .shared = 1, + .size = sizeof(struct kmem_cache), + .name = "kmem_cache", +}; + +#define BAD_ALIEN_MAGIC 0x01020304ul + +static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); + +static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) +{ + return this_cpu_ptr(cachep->cpu_cache); +} + +static size_t calculate_freelist_size(int nr_objs, size_t align) +{ + size_t freelist_size; + + freelist_size = nr_objs * sizeof(freelist_idx_t); + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + 
freelist_size += nr_objs * sizeof(char); + + if (align) + freelist_size = ALIGN(freelist_size, align); + + return freelist_size; +} + +static int calculate_nr_objs(size_t slab_size, size_t buffer_size, + size_t idx_size, size_t align) +{ + int nr_objs; + size_t remained_size; + size_t freelist_size; + int extra_space = 0; + + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + extra_space = sizeof(char); + /* + * Ignore padding for the initial guess. The padding + * is at most @align-1 bytes, and @buffer_size is at + * least @align. In the worst case, this result will + * be one greater than the number of objects that fit + * into the memory allocation when taking the padding + * into account. + */ + nr_objs = slab_size / (buffer_size + idx_size + extra_space); + + /* + * This calculated number will be either the right + * amount, or one greater than what we want. + */ + remained_size = slab_size - nr_objs * buffer_size; + freelist_size = calculate_freelist_size(nr_objs, align); + if (remained_size < freelist_size) + nr_objs--; + + return nr_objs; +} + +/* + * Calculate the number of objects and left-over bytes for a given buffer size. + */ +static void cache_estimate(unsigned long gfporder, size_t buffer_size, + size_t align, int flags, size_t *left_over, + unsigned int *num) +{ + int nr_objs; + size_t mgmt_size; + size_t slab_size = PAGE_SIZE << gfporder; + + /* + * The slab management structure can be either off the slab or + * on it. For the latter case, the memory allocated for a + * slab is used for: + * + * - One unsigned int for each object + * - Padding to respect alignment of @align + * - @buffer_size bytes for each object + * + * If the slab management structure is off the slab, then the + * alignment will already be calculated into the size. Because + * the slabs are all pages aligned, the objects will be at the + * correct alignment when allocated. 
+ */ + if (flags & CFLGS_OFF_SLAB) { + mgmt_size = 0; + nr_objs = slab_size / buffer_size; + + } else { + nr_objs = calculate_nr_objs(slab_size, buffer_size, + sizeof(freelist_idx_t), align); + mgmt_size = calculate_freelist_size(nr_objs, align); + } + *num = nr_objs; + *left_over = slab_size - nr_objs*buffer_size - mgmt_size; +} + +#if DEBUG +#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) + +static void __slab_error(const char *function, struct kmem_cache *cachep, + char *msg) +{ + printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", + function, cachep->name, msg); + dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); +} +#endif + +/* + * By default on NUMA we use alien caches to stage the freeing of + * objects allocated from other nodes. This causes massive memory + * inefficiencies when using fake NUMA setup to split memory into a + * large number of small nodes, so it can be disabled on the command + * line + */ + +static int use_alien_caches __read_mostly = 1; +static int __init noaliencache_setup(char *s) +{ + use_alien_caches = 0; + return 1; +} +__setup("noaliencache", noaliencache_setup); + +static int __init slab_max_order_setup(char *str) +{ + get_option(&str, &slab_max_order); + slab_max_order = slab_max_order < 0 ? 0 : + min(slab_max_order, MAX_ORDER - 1); + slab_max_order_set = true; + + return 1; +} +__setup("slab_max_order=", slab_max_order_setup); + +#ifdef CONFIG_NUMA +/* + * Special reaping functions for NUMA systems called from cache_reap(). + * These take care of doing round robin flushing of alien caches (containing + * objects freed on different nodes from which they were allocated) and the + * flushing of remote pcps by calling drain_node_pages. 
 */
static DEFINE_PER_CPU(unsigned long, slab_reap_node);

/*
 * Seed @cpu's slab_reap_node with the online node that follows the cpu's
 * memory node, wrapping around to the first online node.
 */
static void init_reap_node(int cpu)
{
	int node;

	node = next_node(cpu_to_mem(cpu), node_online_map);
	if (node == MAX_NUMNODES)
		node = first_node(node_online_map);

	per_cpu(slab_reap_node, cpu) = node;
}

/*
 * Advance this cpu's slab_reap_node to the next online node, wrapping
 * around to the first online node.
 */
static void next_reap_node(void)
{
	int node = __this_cpu_read(slab_reap_node);

	node = next_node(node, node_online_map);
	if (unlikely(node >= MAX_NUMNODES))
		node = first_node(node_online_map);
	__this_cpu_write(slab_reap_node, node);
}

#else
#define init_reap_node(cpu) do { } while (0)
#define next_reap_node(void) do { } while (0)
#endif

/*
 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 * via the workqueue/eventd.
 * Add the CPU number into the expiration time to minimize the possibility of
 * the CPUs getting into lockstep and contending for the global cache chain
 * lock.
 */
static void start_cpu_timer(int cpu)
{
	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);

	/*
	 * When this gets called from do_initcalls via cpucache_init(),
	 * init_workqueues() has already run, so keventd will be setup
	 * at that time.
	 */
	/* A NULL work.func presumably means the work item is not set up yet. */
	if (keventd_up() && reap_work->work.func == NULL) {
		init_reap_node(cpu);
		INIT_DEFERRABLE_WORK(reap_work, cache_reap);
		schedule_delayed_work_on(cpu, reap_work,
					__round_jiffies_relative(HZ, cpu));
	}
}

/*
 * Reset @ac to an empty cache with the given @limit and @batch count.
 * A NULL @ac is tolerated: the field writes are skipped in that case.
 */
static void init_arraycache(struct array_cache *ac, int limit, int batch)
{
	/*
	 * The array_cache structures contain pointers to free object.
	 * However, when such objects are allocated or transferred to another
	 * cache the pointers are not cleared and they could be counted as
	 * valid references during a kmemleak scan. Therefore, kmemleak must
	 * not scan such objects.
	 */
	kmemleak_no_scan(ac);
	if (ac) {
		ac->avail = 0;
		ac->limit = limit;
		ac->batchcount = batch;
		ac->touched = 0;
	}
}

/*
 * Allocate an array_cache with room for @entries object pointers on @node
 * and initialise it. Returns whatever kmalloc_node() returned, so the
 * result is NULL when the allocation failed.
 */
static struct array_cache *alloc_arraycache(int node, int entries,
					    int batchcount, gfp_t gfp)
{
	/* Header plus one pointer slot per entry. */
	size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache);
	struct array_cache *ac = NULL;

	ac = kmalloc_node(memsize, gfp, node);
	init_arraycache(ac, entries, batchcount);
	return ac;
}

/* Return the PageSlabPfmemalloc() state of @page. */
static inline bool is_slab_pfmemalloc(struct page *page)
{
	return PageSlabPfmemalloc(page);
}

/*
 * Clears pfmemalloc_active if no slabs have pfmemalloc set.
 *
 * Only the full, partial and free lists of the local memory node
 * (numa_mem_id()) are inspected, under that node's list_lock. @ac is not
 * used by the body.
 */
static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
						struct array_cache *ac)
{
	struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
	struct page *page;
	unsigned long flags;

	if (!pfmemalloc_active)
		return;

	spin_lock_irqsave(&n->list_lock, flags);
	list_for_each_entry(page, &n->slabs_full, lru)
		if (is_slab_pfmemalloc(page))
			goto out;

	list_for_each_entry(page, &n->slabs_partial, lru)
		if (is_slab_pfmemalloc(page))
			goto out;

	list_for_each_entry(page, &n->slabs_free, lru)
		if (is_slab_pfmemalloc(page))
			goto out;

	/* No pfmemalloc slab was found on any of the three lists. */
	pfmemalloc_active = false;
out:
	spin_unlock_irqrestore(&n->list_lock, flags);
}

/*
 * Take the most recently stored object from @ac while honouring the
 * pfmemalloc tag on it.
 *
 * Returns the object with the tag bit cleared when the caller may use
 * pfmemalloc objects, an untagged object swapped in from lower in the
 * array when one exists, or NULL (with @ac->avail restored) when only
 * tagged objects are available and the forced-refill escape does not
 * apply.
 */
static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
						gfp_t flags, bool force_refill)
{
	int i;
	void *objp = ac->entry[--ac->avail];

	/* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
	if (unlikely(is_obj_pfmemalloc(objp))) {
		struct kmem_cache_node *n;

		if (gfp_pfmemalloc_allowed(flags)) {
			clear_obj_pfmemalloc(&objp);
			return objp;
		}

		/* The caller cannot use PFMEMALLOC objects, find another one */
		for (i = 0; i < ac->avail; i++) {
			/* If a !PFMEMALLOC object is found, swap them */
			if (!is_obj_pfmemalloc(ac->entry[i])) {
				objp = ac->entry[i];
				/* entry[avail] is the tagged object popped above */
				ac->entry[i] = ac->entry[ac->avail];
				ac->entry[ac->avail] = objp;
				return objp;
			}
		}

		/*
		 * If there are empty slabs on the slabs_free list and we are
		 * being forced to refill the cache, mark this one !pfmemalloc.
		 */
		n = get_node(cachep, numa_mem_id());
		if (!list_empty(&n->slabs_free) && force_refill) {
			struct page *page = virt_to_head_page(objp);
			ClearPageSlabPfmemalloc(page);
			clear_obj_pfmemalloc(&objp);
			recheck_pfmemalloc_active(cachep, ac);
			return objp;
		}

		/* No !PFMEMALLOC objects available */
		ac->avail++;
		objp = NULL;
	}

	return objp;
}

/*
 * Pop one object from @ac. The pfmemalloc-aware slow path is taken only
 * when sk_memalloc_socks() reports true; otherwise the top entry is
 * returned directly.
 */
static inline void *ac_get_obj(struct kmem_cache *cachep,
			struct array_cache *ac, gfp_t flags, bool force_refill)
{
	void *objp;

	if (unlikely(sk_memalloc_socks()))
		objp = __ac_get_obj(cachep, ac, flags, force_refill);
	else
		objp = ac->entry[--ac->avail];

	return objp;
}

/*
 * Return @objp, with the pfmemalloc tag bit set when pfmemalloc_active is
 * true and the object's head page is marked PageSlabPfmemalloc.
 */
static noinline void *__ac_put_obj(struct kmem_cache *cachep,
			struct array_cache *ac, void *objp)
{
	if (unlikely(pfmemalloc_active)) {
		/* Some pfmemalloc slabs exist, check if this is one */
		struct page *page = virt_to_head_page(objp);
		if (PageSlabPfmemalloc(page))
			set_obj_pfmemalloc(&objp);
	}

	return objp;
}

/*
 * Push @objp onto @ac, tagging it first via __ac_put_obj() when
 * sk_memalloc_socks() reports true. No bound check against @ac->limit is
 * made here.
 */
static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
								void *objp)
{
	if (unlikely(sk_memalloc_socks()))
		objp = __ac_put_obj(cachep, ac, objp);

	ac->entry[ac->avail++] = objp;
}

/*
 * Transfer objects in one arraycache to another.
 * Locking must be handled by the caller.
 *
 * Return the number of entries transferred.
+ */ +static int transfer_objects(struct array_cache *to, + struct array_cache *from, unsigned int max) +{ + /* Figure out how many entries to transfer */ + int nr = min3(from->avail, max, to->limit - to->avail); + + if (!nr) + return 0; + + memcpy(to->entry + to->avail, from->entry + from->avail -nr, + sizeof(void *) *nr); + + from->avail -= nr; + to->avail += nr; + return nr; +} + +#ifndef CONFIG_NUMA + +#define drain_alien_cache(cachep, alien) do { } while (0) +#define reap_alien(cachep, n) do { } while (0) + +static inline struct alien_cache **alloc_alien_cache(int node, + int limit, gfp_t gfp) +{ + return (struct alien_cache **)BAD_ALIEN_MAGIC; +} + +static inline void free_alien_cache(struct alien_cache **ac_ptr) +{ +} + +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +{ + return 0; +} + +static inline void *alternate_node_alloc(struct kmem_cache *cachep, + gfp_t flags) +{ + return NULL; +} + +static inline void *____cache_alloc_node(struct kmem_cache *cachep, + gfp_t flags, int nodeid) +{ + return NULL; +} + +static inline gfp_t gfp_exact_node(gfp_t flags) +{ + return flags; +} + +#else /* CONFIG_NUMA */ + +static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); +static void *alternate_node_alloc(struct kmem_cache *, gfp_t); + +static struct alien_cache *__alloc_alien_cache(int node, int entries, + int batch, gfp_t gfp) +{ + size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); + struct alien_cache *alc = NULL; + + alc = kmalloc_node(memsize, gfp, node); + init_arraycache(&alc->ac, entries, batch); + spin_lock_init(&alc->lock); + return alc; +} + +static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +{ + struct alien_cache **alc_ptr; + size_t memsize = sizeof(void *) * nr_node_ids; + int i; + + if (limit > 1) + limit = 12; + alc_ptr = kzalloc_node(memsize, gfp, node); + if (!alc_ptr) + return NULL; + + for_each_node(i) { + if (i == node || !node_online(i)) + continue; + 
alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); + if (!alc_ptr[i]) { + for (i--; i >= 0; i--) + kfree(alc_ptr[i]); + kfree(alc_ptr); + return NULL; + } + } + return alc_ptr; +} + +static void free_alien_cache(struct alien_cache **alc_ptr) +{ + int i; + + if (!alc_ptr) + return; + for_each_node(i) + kfree(alc_ptr[i]); + kfree(alc_ptr); +} + +static void __drain_alien_cache(struct kmem_cache *cachep, + struct array_cache *ac, int node, + struct list_head *list) +{ + struct kmem_cache_node *n = get_node(cachep, node); + + if (ac->avail) { + spin_lock(&n->list_lock); + /* + * Stuff objects into the remote nodes shared array first. + * That way we could avoid the overhead of putting the objects + * into the free lists and getting them back later. + */ + if (n->shared) + transfer_objects(n->shared, ac, ac->limit); + + free_block(cachep, ac->entry, ac->avail, node, list); + ac->avail = 0; + spin_unlock(&n->list_lock); + } +} + +/* + * Called from cache_reap() to regularly drain alien caches round robin. 
+ */ +static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) +{ + int node = __this_cpu_read(slab_reap_node); + + if (n->alien) { + struct alien_cache *alc = n->alien[node]; + struct array_cache *ac; + + if (alc) { + ac = &alc->ac; + if (ac->avail && spin_trylock_irq(&alc->lock)) { + LIST_HEAD(list); + + __drain_alien_cache(cachep, ac, node, &list); + spin_unlock_irq(&alc->lock); + slabs_destroy(cachep, &list); + } + } + } +} + +static void drain_alien_cache(struct kmem_cache *cachep, + struct alien_cache **alien) +{ + int i = 0; + struct alien_cache *alc; + struct array_cache *ac; + unsigned long flags; + + for_each_online_node(i) { + alc = alien[i]; + if (alc) { + LIST_HEAD(list); + + ac = &alc->ac; + spin_lock_irqsave(&alc->lock, flags); + __drain_alien_cache(cachep, ac, i, &list); + spin_unlock_irqrestore(&alc->lock, flags); + slabs_destroy(cachep, &list); + } + } +} + +static int __cache_free_alien(struct kmem_cache *cachep, void *objp, + int node, int page_node) +{ + struct kmem_cache_node *n; + struct alien_cache *alien = NULL; + struct array_cache *ac; + LIST_HEAD(list); + + n = get_node(cachep, node); + STATS_INC_NODEFREES(cachep); + if (n->alien && n->alien[page_node]) { + alien = n->alien[page_node]; + ac = &alien->ac; + spin_lock(&alien->lock); + if (unlikely(ac->avail == ac->limit)) { + STATS_INC_ACOVERFLOW(cachep); + __drain_alien_cache(cachep, ac, page_node, &list); + } + ac_put_obj(cachep, ac, objp); + spin_unlock(&alien->lock); + slabs_destroy(cachep, &list); + } else { + n = get_node(cachep, page_node); + spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, page_node, &list); + spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); + } + return 1; +} + +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +{ + int page_node = page_to_nid(virt_to_page(objp)); + int node = numa_mem_id(); + /* + * Make sure we are not freeing a object from another node to the array + * cache on this cpu. 
+	 */
+	if (likely(node == page_node))
+		return 0;
+
+	return __cache_free_alien(cachep, objp, node, page_node);
+}
+
+/*
+ * Construct gfp mask to allocate from a specific node but do not invoke reclaim
+ * or warn about failures.
+ *
+ * The result is @flags with __GFP_THISNODE and __GFP_NOWARN set and
+ * __GFP_WAIT cleared.
+ */
+static inline gfp_t gfp_exact_node(gfp_t flags)
+{
+	return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT;
+}
+#endif
+
+/*
+ * Allocates and initializes node for a node on each slab cache, used for
+ * either memory or cpu hotplug.  If memory is being hot-added, the kmem_cache_node
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing node are not replaced if
+ * already in use.
+ *
+ * For every cache on slab_caches the node's free_limit is recomputed under
+ * its list_lock, whether the kmem_cache_node was just allocated or already
+ * existed.
+ *
+ * Returns 0 on success, or -ENOMEM as soon as one kmalloc_node() fails; the
+ * kmem_cache_node structures already installed for earlier caches in the
+ * list are left in place.
+ *
+ * Must hold slab_mutex.
+ */
+static int init_cache_node_node(int node)
+{
+	struct kmem_cache *cachep;
+	struct kmem_cache_node *n;
+	const size_t memsize = sizeof(struct kmem_cache_node);
+
+	list_for_each_entry(cachep, &slab_caches, list) {
+		/*
+		 * Set up the kmem_cache_node for cpu before we can
+		 * begin anything. Make sure some other cpu on this
+		 * node has not already allocated this
+		 */
+		n = get_node(cachep, node);
+		if (!n) {
+			n = kmalloc_node(memsize, GFP_KERNEL, node);
+			if (!n)
+				return -ENOMEM;
+			kmem_cache_node_init(n);
+			n->next_reap = jiffies + REAPTIMEOUT_NODE +
+			    ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+
+			/*
+			 * The kmem_cache_nodes don't come and go as CPUs
+			 * come and go.  slab_mutex is sufficient
+			 * protection here.
+			 */
+			cachep->node[node] = n;
+		}
+
+		spin_lock_irq(&n->list_lock);
+		n->free_limit =
+			(1 + nr_cpus_node(node)) *
+			cachep->batchcount + cachep->num;
+		spin_unlock_irq(&n->list_lock);
+	}
+	return 0;
+}
+/*
+ * Number of slabs needed to hold n->free_objects objects at cachep->num
+ * objects per slab, rounded up.  Used as the "tofree" argument of
+ * drain_freelist().
+ */
+static inline int slabs_tofree(struct kmem_cache *cachep,
+						struct kmem_cache_node *n)
+{
+	return (n->free_objects + cachep->num - 1) / cachep->num;
+}
+/*
+ * Release the slab state that belongs to @cpu.  Called with slab_mutex held
+ * from cpuup_callback() (CPU_DEAD and CPU_UP_CANCELED) and from the error
+ * path of cpuup_prepare().
+ *
+ * First pass, per cache: lower the node's free_limit by one batchcount and
+ * hand the objects of @cpu's array cache back to the node lists.  Only when
+ * cpumask_of_node() for the cpu's node is empty are the node's shared array
+ * cache and alien caches flushed and freed as well.  The second pass shrinks
+ * each node's free list with drain_freelist().
+ */
+static void cpuup_canceled(long cpu)
+{
+	struct kmem_cache *cachep;
+	struct kmem_cache_node *n = NULL;
+	int node = cpu_to_mem(cpu);
+	const struct cpumask *mask = cpumask_of_node(node);
+
+	list_for_each_entry(cachep, &slab_caches, list) {
+		struct array_cache *nc;
+		struct array_cache *shared;
+		struct alien_cache **alien;
+		LIST_HEAD(list);	/* slabs emptied by free_block(), destroyed below */
+
+		n = get_node(cachep, node);
+		if (!n)
+			continue;
+
+		spin_lock_irq(&n->list_lock);
+
+		/* Free limit for this kmem_cache_node */
+		n->free_limit -= cachep->batchcount;
+
+		/* cpu is dead; no one can alloc from it. */
+		nc = per_cpu_ptr(cachep->cpu_cache, cpu);
+		if (nc) {
+			free_block(cachep, nc->entry, nc->avail, node, &list);
+			nc->avail = 0;
+		}
+
+		if (!cpumask_empty(mask)) {	/* node mask not empty: keep shared/alien */
+			spin_unlock_irq(&n->list_lock);
+			goto free_slab;
+		}
+
+		shared = n->shared;
+		if (shared) {
+			free_block(cachep, shared->entry,
+				   shared->avail, node, &list);
+			n->shared = NULL;
+		}
+
+		alien = n->alien;
+		n->alien = NULL;
+
+		spin_unlock_irq(&n->list_lock);
+
+		/* Detached above under the lock; freed here without it. */
+		kfree(shared);
+		if (alien) {
+			drain_alien_cache(cachep, alien);
+			free_alien_cache(alien);
+		}
+
+free_slab:
+		slabs_destroy(cachep, &list);
+	}
+	/*
+	 * In the previous loop, all the objects were freed to
+	 * the respective cache's slabs,  now we can go ahead and
+	 * shrink each nodelist to its limit.
+	 */
+	list_for_each_entry(cachep, &slab_caches, list) {
+		n = get_node(cachep, node);
+		if (!n)
+			continue;
+		drain_freelist(cachep, n, slabs_tofree(cachep, n));
+	}
+}
+/*
+ * Prepare the slab caches for @cpu coming up.  Called with slab_mutex held
+ * from cpuup_callback().
+ *
+ * Makes sure every cache has a kmem_cache_node for the cpu's node, then
+ * allocates a shared array cache (when cachep->shared is non-zero) and alien
+ * caches (when use_alien_caches is set) per cache, installing each only if
+ * the node does not have one yet; unused allocations are freed again.
+ *
+ * Returns 0 on success.  On any failure cpuup_canceled() is run for @cpu and
+ * -ENOMEM is returned.
+ */
+static int cpuup_prepare(long cpu)
+{
+	struct kmem_cache *cachep;
+	struct kmem_cache_node *n = NULL;
+	int node = cpu_to_mem(cpu);
+	int err;
+
+	/*
+	 * We need to do this right in the beginning since
+	 * alloc_arraycache's are going to use this list.
+	 * kmalloc_node allows us to add the slab to the right
+	 * kmem_cache_node and not this cpu's kmem_cache_node
+	 */
+	err = init_cache_node_node(node);
+	if (err < 0)
+		goto bad;
+
+	/*
+	 * Now we can go ahead with allocating the shared arrays and
+	 * array caches
+	 */
+	list_for_each_entry(cachep, &slab_caches, list) {
+		struct array_cache *shared = NULL;
+		struct alien_cache **alien = NULL;
+
+		if (cachep->shared) {
+			shared = alloc_arraycache(node,
+				cachep->shared * cachep->batchcount,
+				0xbaadf00d, GFP_KERNEL);
+			if (!shared)
+				goto bad;
+		}
+		if (use_alien_caches) {
+			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
+			if (!alien) {
+				kfree(shared);
+				goto bad;
+			}
+		}
+		n = get_node(cachep, node);
+		BUG_ON(!n);
+
+		spin_lock_irq(&n->list_lock);
+		if (!n->shared) {
+			/*
+			 * We are serialised from CPU_DEAD or
+			 * CPU_UP_CANCELLED by the cpucontrol lock
+			 */
+			n->shared = shared;
+			shared = NULL;
+		}
+#ifdef CONFIG_NUMA
+		if (!n->alien) {
+			n->alien = alien;
+			alien = NULL;
+		}
+#endif
+		spin_unlock_irq(&n->list_lock);
+		/* Whatever was not installed above is still non-NULL here. */
+		kfree(shared);
+		free_alien_cache(alien);
+	}
+
+	return 0;
+bad:
+	cpuup_canceled(cpu);
+	return -ENOMEM;
+}
+/*
+ * CPU hotplug notifier callback (registered through cpucache_notifier).
+ * @hcpu carries the cpu number.  Dispatches on @action:
+ *  - CPU_UP_PREPARE*: cpuup_prepare() under slab_mutex;
+ *  - CPU_ONLINE*, CPU_DOWN_FAILED*: start_cpu_timer();
+ *  - CPU_DOWN_PREPARE*: cancel the cpu's slab_reap_work;
+ *  - CPU_DEAD*, CPU_UP_CANCELED*: cpuup_canceled() under slab_mutex.
+ * Any other action is ignored.
+ *
+ * Returns notifier_from_errno(err); err is non-zero only when
+ * cpuup_prepare() failed.
+ */
+static int cpuup_callback(struct notifier_block *nfb,
+				    unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+	int err = 0;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		mutex_lock(&slab_mutex);
+		err = cpuup_prepare(cpu);
+		mutex_unlock(&slab_mutex);
+		break;
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		start_cpu_timer(cpu);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		/*
+		 * Shutdown cache reaper. Note that the slab_mutex is
+		 * held so that if cache_reap() is invoked it cannot do
+		 * anything expensive but will only modify reap_work
+		 * and reschedule the timer.
+		 *
+		 * NOTE(review): this branch does not take slab_mutex
+		 * itself, so the sentence above looks stale - confirm.
+		 */
+		cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
+		/* Now the cache_reaper is guaranteed to be not running. */
+		per_cpu(slab_reap_work, cpu).work.func = NULL;
+		break;
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		start_cpu_timer(cpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		/*
+		 * Even if all the cpus of a node are down, we don't free the
+		 * kmem_cache_node of any cache. This to avoid a race between
+		 * cpu_down, and a kmalloc allocation from another cpu for
+		 * memory from the node of the cpu going down.  The node
+		 * structure is usually allocated from kmem_cache_create() and
+		 * gets destroyed at kmem_cache_destroy().
+		 */
+		/* fall through */
+#endif
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		mutex_lock(&slab_mutex);
+		cpuup_canceled(cpu);
+		mutex_unlock(&slab_mutex);
+		break;
+	}
+	return notifier_from_errno(err);
+}
+/* Notifier block for cpuup_callback(); registered in kmem_cache_init_late(). */
+static struct notifier_block cpucache_notifier = {
+	&cpuup_callback, NULL, 0
+};
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold slab_mutex.
+ */ +static int __meminit drain_cache_node_node(int node) +{ + struct kmem_cache *cachep; + int ret = 0; + + list_for_each_entry(cachep, &slab_caches, list) { + struct kmem_cache_node *n; + + n = get_node(cachep, node); + if (!n) + continue; + + drain_freelist(cachep, n, slabs_tofree(cachep, n)); + + if (!list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial)) { + ret = -EBUSY; + break; + } + } + return ret; +} + +static int __meminit slab_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mnb = arg; + int ret = 0; + int nid; + + nid = mnb->status_change_nid; + if (nid < 0) + goto out; + + switch (action) { + case MEM_GOING_ONLINE: + mutex_lock(&slab_mutex); + ret = init_cache_node_node(nid); + mutex_unlock(&slab_mutex); + break; + case MEM_GOING_OFFLINE: + mutex_lock(&slab_mutex); + ret = drain_cache_node_node(nid); + mutex_unlock(&slab_mutex); + break; + case MEM_ONLINE: + case MEM_OFFLINE: + case MEM_CANCEL_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } +out: + return notifier_from_errno(ret); +} +#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ + +/* + * swap the static kmem_cache_node with kmalloced memory + */ +static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, + int nodeid) +{ + struct kmem_cache_node *ptr; + + ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); + BUG_ON(!ptr); + + memcpy(ptr, list, sizeof(struct kmem_cache_node)); + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ + spin_lock_init(&ptr->list_lock); + + MAKE_ALL_LISTS(cachep, ptr, nodeid); + cachep->node[nodeid] = ptr; +} + +/* + * For setting up all the kmem_cache_node for cache whose buffer_size is same as + * size of kmem_cache_node. 
+ */
+static void __init set_up_node(struct kmem_cache *cachep, int index)
+{
+	int node;
+
+	for_each_online_node(node) {
+		/* Point the cache at the static bootstrap entry for this node. */
+		cachep->node[node] = &init_kmem_cache_node[index + node];
+		cachep->node[node]->next_reap = jiffies +
+		    REAPTIMEOUT_NODE +
+		    ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+	}
+}
+
+/*
+ * Initialisation.  Called after the page allocator has been initialised and
+ * before smp_init().
+ *
+ * Sets up the statically allocated kmem_cache cache and the kmalloc cache
+ * that backs struct kmem_cache_node, swaps their bootstrap kmem_cache_node
+ * structures for kmalloc'ed ones, then creates the remaining kmalloc caches.
+ * slab_state is advanced to PARTIAL and then PARTIAL_NODE along the way.
+ */
+void __init kmem_cache_init(void)
+{
+	int i;
+
+	BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
+					sizeof(struct rcu_head));
+	kmem_cache = &kmem_cache_boot;
+
+	if (num_possible_nodes() == 1)
+		use_alien_caches = 0;
+
+	for (i = 0; i < NUM_INIT_LISTS; i++)
+		kmem_cache_node_init(&init_kmem_cache_node[i]);
+
+	/*
+	 * Fragmentation resistance on low memory - only use bigger
+	 * page orders on machines with more than 32MB of memory if
+	 * not overridden on the command line.
+	 */
+	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
+		slab_max_order = SLAB_MAX_ORDER_HI;
+
+	/* Bootstrap is tricky, because several objects are allocated
+	 * from caches that do not exist yet:
+	 * 1) initialize the kmem_cache cache: it contains the struct
+	 *    kmem_cache structures of all caches, except kmem_cache itself:
+	 *    kmem_cache is statically allocated.
+	 *    Initially an __init data area is used for the head array and the
+	 *    kmem_cache_node structures, it's replaced with a kmalloc allocated
+	 *    array at the end of the bootstrap.
+	 * 2) Create the first kmalloc cache.
+	 *    The struct kmem_cache for the new cache is allocated normally.
+	 *    An __init data area is used for the head array.
+	 * 3) Create the remaining kmalloc caches, with minimally sized
+	 *    head arrays.
+	 * 4) Replace the __init data head arrays for kmem_cache and the first
+	 *    kmalloc cache with kmalloc allocated arrays.
+	 * 5) Replace the __init data for kmem_cache_node for kmem_cache and
+	 *    the other caches with kmalloc allocated memory.
+	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
+	 *
+	 * Only steps 1 and 5 are labelled in this function; step 6 is
+	 * labelled in kmem_cache_init_late().
+	 */
+
+	/* 1) create the kmem_cache */
+
+	/*
+	 * The size passed here depends on nr_node_ids: it covers struct
+	 * kmem_cache up to its node[] member plus nr_node_ids pointers.
+	 */
+	create_boot_cache(kmem_cache, "kmem_cache",
+		offsetof(struct kmem_cache, node) +
+				  nr_node_ids * sizeof(struct kmem_cache_node *),
+				  SLAB_HWCACHE_ALIGN);
+	list_add(&kmem_cache->list, &slab_caches);
+	slab_state = PARTIAL;
+
+	/*
+	 * Initialize the caches that provide memory for the kmem_cache_node
+	 * structures first.  Without this, further allocations will bug.
+	 */
+	kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
+				kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
+	slab_state = PARTIAL_NODE;
+
+	/*
+	 * __kmem_cache_create() only selects off-slab management once
+	 * slab_early_init is zero.
+	 */
+	slab_early_init = 0;
+
+	/* 5) Replace the bootstrap kmem_cache_node */
+	{
+		int nid;
+
+		for_each_online_node(nid) {
+			init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
+
+			init_list(kmalloc_caches[INDEX_NODE],
+					  &init_kmem_cache_node[SIZE_NODE + nid], nid);
+		}
+	}
+
+	create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
+}
+/*
+ * Late initialisation.  Sets slab_state to UP, calls enable_cpucache() for
+ * every cache on slab_caches under slab_mutex (a failure is fatal), sets
+ * slab_state to FULL, and registers the cpu notifier and, under CONFIG_NUMA,
+ * the memory hotplug callback.
+ */
+void __init kmem_cache_init_late(void)
+{
+	struct kmem_cache *cachep;
+
+	slab_state = UP;
+
+	/* 6) resize the head arrays to their final sizes */
+	mutex_lock(&slab_mutex);
+	list_for_each_entry(cachep, &slab_caches, list)
+		if (enable_cpucache(cachep, GFP_NOWAIT))
+			BUG();
+	mutex_unlock(&slab_mutex);
+
+	/* Done! */
+	slab_state = FULL;
+
+	/*
+	 * Register a cpu startup notifier callback that initializes
+	 * cpu_cache_get for all new cpus
+	 */
+	register_cpu_notifier(&cpucache_notifier);
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Register a memory hotplug callback that initializes and frees
+	 * node.
+	 */
+	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
+	/*
+	 * The reap timers are started later, with a module init call: That part
+	 * of the kernel is not yet operational.
+	 */
+}
+/*
+ * Initcall: start the reap timer on every online cpu.  Also stores FULL in
+ * slab_state, the same value kmem_cache_init_late() assigns.  Always
+ * returns 0.
+ */
+static int __init cpucache_init(void)
+{
+	int cpu;
+
+	/*
+	 * Register the timers that return unneeded pages to the page allocator
+	 */
+	for_each_online_cpu(cpu)
+		start_cpu_timer(cpu);
+
+	/* Done! */
+	slab_state = FULL;
+	return 0;
+}
+__initcall(cpucache_init);
+/*
+ * Report a failed slab page allocation for @cachep on node @nodeid.
+ *
+ * The body is compiled only when DEBUG is non-zero; otherwise the function
+ * is empty.  Nothing is printed when @gfpflags contains __GFP_NOWARN or the
+ * rate limit is exceeded.  After the header, one line per kmem_cache_node
+ * reports active/total slabs, active/total objects and the node's
+ * free_objects, each node sampled under its list_lock.
+ */
+static noinline void
+slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
+{
+#if DEBUG
+	struct kmem_cache_node *n;
+	struct page *page;
+	unsigned long flags;
+	int node;
+	static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+
+	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
+		return;
+
+	printk(KERN_WARNING
+		"SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
+		nodeid, gfpflags);
+	printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
+		cachep->name, cachep->size, cachep->gfporder);
+
+	for_each_kmem_cache_node(cachep, node, n) {
+		unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
+		unsigned long active_slabs = 0, num_slabs = 0;
+
+		spin_lock_irqsave(&n->list_lock, flags);
+		list_for_each_entry(page, &n->slabs_full, lru) {
+			active_objs += cachep->num;
+			active_slabs++;
+		}
+		list_for_each_entry(page, &n->slabs_partial, lru) {
+			active_objs += page->active;
+			active_slabs++;
+		}
+		list_for_each_entry(page, &n->slabs_free, lru)
+			num_slabs++;	/* free slabs only, for now */
+
+		free_objects += n->free_objects;
+		spin_unlock_irqrestore(&n->list_lock, flags);
+
+		num_slabs += active_slabs;	/* now the total slab count */
+		num_objs = num_slabs * cachep->num;
+		printk(KERN_WARNING
+			" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
+			node, active_slabs, num_slabs, active_objs, num_objs,
+			free_objects);
+	}
+#endif
+}
+
+/*
+ * Interface to system's page allocator. No need to hold the
+ * kmem_cache_node ->list_lock.
+ *
+ * If we requested dmaable memory, we will get it. Even if we
+ * did not request dmaable memory, we might get it, but that
+ * would be relatively rare and ignorable.
+ */
+static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
+								int nodeid)
+{
+	const bool reclaimable = cachep->flags & SLAB_RECLAIM_ACCOUNT;
+	struct page *slab_page;
+	int nr_pages;
+
+	/* Merge in the cache's own allocation flags. */
+	flags |= cachep->allocflags;
+	if (reclaimable)
+		flags |= __GFP_RECLAIMABLE;
+
+	/* Charge the memcg first; a non-zero result means no allocation. */
+	if (memcg_charge_slab(cachep, flags, cachep->gfporder))
+		return NULL;
+
+	slab_page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK,
+					   cachep->gfporder);
+	if (!slab_page) {
+		/* Undo the charge taken above before reporting the failure. */
+		memcg_uncharge_slab(cachep, cachep->gfporder);
+		slab_out_of_memory(cachep, flags, nodeid);
+		return NULL;
+	}
+
+	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
+	if (unlikely(slab_page->pfmemalloc))
+		pfmemalloc_active = true;
+
+	/* Account the new pages to the matching zone counter. */
+	nr_pages = (1 << cachep->gfporder);
+	add_zone_page_state(page_zone(slab_page),
+			    reclaimable ? NR_SLAB_RECLAIMABLE :
+					  NR_SLAB_UNRECLAIMABLE,
+			    nr_pages);
+
+	__SetPageSlab(slab_page);
+	if (slab_page->pfmemalloc)
+		SetPageSlabPfmemalloc(slab_page);
+
+	/* No kmemcheck shadow wanted for this cache: done. */
+	if (!kmemcheck_enabled || (cachep->flags & SLAB_NOTRACK))
+		return slab_page;
+
+	kmemcheck_alloc_shadow(slab_page, cachep->gfporder, flags, nodeid);
+
+	/* Mark the pages according to whether the cache has a constructor. */
+	if (cachep->ctor)
+		kmemcheck_mark_uninitialized_pages(slab_page, nr_pages);
+	else
+		kmemcheck_mark_unallocated_pages(slab_page, nr_pages);
+
+	return slab_page;
+}
+
+/*
+ * Interface to system's page release.
+ */ +static void kmem_freepages(struct kmem_cache *cachep, struct page *page) +{ + const unsigned long nr_freed = (1 << cachep->gfporder); + + kmemcheck_free_shadow(page, cachep->gfporder); + + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + sub_zone_page_state(page_zone(page), + NR_SLAB_RECLAIMABLE, nr_freed); + else + sub_zone_page_state(page_zone(page), + NR_SLAB_UNRECLAIMABLE, nr_freed); + + BUG_ON(!PageSlab(page)); + __ClearPageSlabPfmemalloc(page); + __ClearPageSlab(page); + page_mapcount_reset(page); + page->mapping = NULL; + + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += nr_freed; + __free_pages(page, cachep->gfporder); + memcg_uncharge_slab(cachep, cachep->gfporder); +} + +static void kmem_rcu_free(struct rcu_head *head) +{ + struct kmem_cache *cachep; + struct page *page; + + page = container_of(head, struct page, rcu_head); + cachep = page->slab_cache; + + kmem_freepages(cachep, page); +} + +#if DEBUG + +#ifdef CONFIG_DEBUG_PAGEALLOC +static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, + unsigned long caller) +{ + int size = cachep->object_size; + + addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; + + if (size < 5 * sizeof(unsigned long)) + return; + + *addr++ = 0x12345678; + *addr++ = caller; + *addr++ = smp_processor_id(); + size -= 3 * sizeof(unsigned long); + { + unsigned long *sptr = &caller; + unsigned long svalue; + + while (!kstack_end(sptr)) { + svalue = *sptr++; + if (kernel_text_address(svalue)) { + *addr++ = svalue; + size -= sizeof(unsigned long); + if (size <= sizeof(unsigned long)) + break; + } + } + + } + *addr++ = 0x87654321; +} +#endif + +static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) +{ + int size = cachep->object_size; + addr = &((char *)addr)[obj_offset(cachep)]; + + memset(addr, val, size); + *(unsigned char *)(addr + size - 1) = POISON_END; +} + +static void dump_line(char *data, int offset, int limit) +{ + int i; + unsigned char error = 
0; + int bad_count = 0; + + printk(KERN_ERR "%03x: ", offset); + for (i = 0; i < limit; i++) { + if (data[offset + i] != POISON_FREE) { + error = data[offset + i]; + bad_count++; + } + } + print_hex_dump(KERN_CONT, "", 0, 16, 1, + &data[offset], limit, 1); + + if (bad_count == 1) { + error ^= POISON_FREE; + if (!(error & (error - 1))) { + printk(KERN_ERR "Single bit error detected. Probably " + "bad RAM.\n"); +#ifdef CONFIG_X86 + printk(KERN_ERR "Run memtest86+ or a similar memory " + "test tool.\n"); +#else + printk(KERN_ERR "Run a memory test tool.\n"); +#endif + } + } +} +#endif + +#if DEBUG + +static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) +{ + int i, size; + char *realobj; + + if (cachep->flags & SLAB_RED_ZONE) { + printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); + } + + if (cachep->flags & SLAB_STORE_USER) { + printk(KERN_ERR "Last user: [<%p>](%pSR)\n", + *dbg_userword(cachep, objp), + *dbg_userword(cachep, objp)); + } + realobj = (char *)objp + obj_offset(cachep); + size = cachep->object_size; + for (i = 0; i < size && lines; i += 16, lines--) { + int limit; + limit = 16; + if (i + limit > size) + limit = size - i; + dump_line(realobj, i, limit); + } +} + +static void check_poison_obj(struct kmem_cache *cachep, void *objp) +{ + char *realobj; + int size, i; + int lines = 0; + + realobj = (char *)objp + obj_offset(cachep); + size = cachep->object_size; + + for (i = 0; i < size; i++) { + char exp = POISON_FREE; + if (i == size - 1) + exp = POISON_END; + if (realobj[i] != exp) { + int limit; + /* Mismatch ! 
*/ + /* Print header */ + if (lines == 0) { + printk(KERN_ERR + "Slab corruption (%s): %s start=%p, len=%d\n", + print_tainted(), cachep->name, realobj, size); + print_objinfo(cachep, objp, 0); + } + /* Hexdump the affected line */ + i = (i / 16) * 16; + limit = 16; + if (i + limit > size) + limit = size - i; + dump_line(realobj, i, limit); + i += 16; + lines++; + /* Limit to 5 lines */ + if (lines > 5) + break; + } + } + if (lines != 0) { + /* Print some data about the neighboring objects, if they + * exist: + */ + struct page *page = virt_to_head_page(objp); + unsigned int objnr; + + objnr = obj_to_index(cachep, page, objp); + if (objnr) { + objp = index_to_obj(cachep, page, objnr - 1); + realobj = (char *)objp + obj_offset(cachep); + printk(KERN_ERR "Prev obj: start=%p, len=%d\n", + realobj, size); + print_objinfo(cachep, objp, 2); + } + if (objnr + 1 < cachep->num) { + objp = index_to_obj(cachep, page, objnr + 1); + realobj = (char *)objp + obj_offset(cachep); + printk(KERN_ERR "Next obj: start=%p, len=%d\n", + realobj, size); + print_objinfo(cachep, objp, 2); + } + } +} +#endif + +#if DEBUG +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct page *page) +{ + int i; + for (i = 0; i < cachep->num; i++) { + void *objp = index_to_obj(cachep, page, i); + + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if (cachep->size % PAGE_SIZE == 0 && + OFF_SLAB(cachep)) + kernel_map_pages(virt_to_page(objp), + cachep->size / PAGE_SIZE, 1); + else + check_poison_obj(cachep, objp); +#else + check_poison_obj(cachep, objp); +#endif + } + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "start of a freed object " + "was overwritten"); + if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "end of a freed object " + "was overwritten"); + } + } +} +#else +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct page *page) +{ +} +#endif + +/** + * 
slab_destroy - destroy and release all objects in a slab + * @cachep: cache pointer being destroyed + * @page: page pointer being destroyed + * + * Destroy all the objs in a slab page, and release the mem back to the system. + * Before calling the slab page must have been unlinked from the cache. The + * kmem_cache_node ->list_lock is not held/needed. + */ +static void slab_destroy(struct kmem_cache *cachep, struct page *page) +{ + void *freelist; + + freelist = page->freelist; + slab_destroy_debugcheck(cachep, page); + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { + struct rcu_head *head; + + /* + * RCU free overloads the RCU head over the LRU. + * slab_page has been overloeaded over the LRU, + * however it is not used from now on so that + * we can use it safely. + */ + head = (void *)&page->rcu_head; + call_rcu(head, kmem_rcu_free); + + } else { + kmem_freepages(cachep, page); + } + + /* + * From now on, we don't use freelist + * although actual page can be freed in rcu context + */ + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->freelist_cache, freelist); +} + +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) +{ + struct page *page, *n; + + list_for_each_entry_safe(page, n, list, lru) { + list_del(&page->lru); + slab_destroy(cachep, page); + } +} + +/** + * calculate_slab_order - calculate size (page order) of slabs + * @cachep: pointer to the cache that is being created + * @size: size of objects to be created in this cache. + * @align: required alignment for the objects. + * @flags: slab allocation flags + * + * Also calculates the number of objects per slab. + * + * This could be made much more intelligent. For now, try to avoid using + * high order pages for slabs. When the gfp() functions are more friendly + * towards high-order requests, this should be changed. 
+ */ +static size_t calculate_slab_order(struct kmem_cache *cachep, + size_t size, size_t align, unsigned long flags) +{ + unsigned long offslab_limit; + size_t left_over = 0; + int gfporder; + + for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { + unsigned int num; + size_t remainder; + + cache_estimate(gfporder, size, align, flags, &remainder, &num); + if (!num) + continue; + + /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ + if (num > SLAB_OBJ_MAX_NUM) + break; + + if (flags & CFLGS_OFF_SLAB) { + size_t freelist_size_per_obj = sizeof(freelist_idx_t); + /* + * Max number of objs-per-slab for caches which + * use off-slab slabs. Needed to avoid a possible + * looping condition in cache_grow(). + */ + if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) + freelist_size_per_obj += sizeof(char); + offslab_limit = size; + offslab_limit /= freelist_size_per_obj; + + if (num > offslab_limit) + break; + } + + /* Found something acceptable - save it away */ + cachep->num = num; + cachep->gfporder = gfporder; + left_over = remainder; + + /* + * A VFS-reclaimable slab tends to have most allocations + * as GFP_NOFS and we really don't want to have to be allocating + * higher-order pages when we are unable to shrink dcache. + */ + if (flags & SLAB_RECLAIM_ACCOUNT) + break; + + /* + * Large number of objects is good, but very large slabs are + * currently bad for the gfp()s. + */ + if (gfporder >= slab_max_order) + break; + + /* + * Acceptable internal fragmentation? 
+ */ + if (left_over * 8 <= (PAGE_SIZE << gfporder)) + break; + } + return left_over; +} + +static struct array_cache __percpu *alloc_kmem_cache_cpus( + struct kmem_cache *cachep, int entries, int batchcount) +{ + int cpu; + size_t size; + struct array_cache __percpu *cpu_cache; + + size = sizeof(void *) * entries + sizeof(struct array_cache); + cpu_cache = __alloc_percpu(size, sizeof(void *)); + + if (!cpu_cache) + return NULL; + + for_each_possible_cpu(cpu) { + init_arraycache(per_cpu_ptr(cpu_cache, cpu), + entries, batchcount); + } + + return cpu_cache; +} + +static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) +{ + if (slab_state >= FULL) + return enable_cpucache(cachep, gfp); + + cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1); + if (!cachep->cpu_cache) + return 1; + + if (slab_state == DOWN) { + /* Creation of first cache (kmem_cache). */ + set_up_node(kmem_cache, CACHE_CACHE); + } else if (slab_state == PARTIAL) { + /* For kmem_cache_node */ + set_up_node(cachep, SIZE_NODE); + } else { + int node; + + for_each_online_node(node) { + cachep->node[node] = kmalloc_node( + sizeof(struct kmem_cache_node), gfp, node); + BUG_ON(!cachep->node[node]); + kmem_cache_node_init(cachep->node[node]); + } + } + + cachep->node[numa_mem_id()]->next_reap = + jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; + + cpu_cache_get(cachep)->avail = 0; + cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep)->batchcount = 1; + cpu_cache_get(cachep)->touched = 0; + cachep->batchcount = 1; + cachep->limit = BOOT_CPUCACHE_ENTRIES; + return 0; +} + +unsigned long kmem_cache_flags(unsigned long object_size, + unsigned long flags, const char *name, + void (*ctor)(void *)) +{ + return flags; +} + +struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + struct kmem_cache *cachep; + + cachep = find_mergeable(size, align, flags, name, 
ctor); + if (cachep) { + cachep->refcount++; + + /* + * Adjust the object sizes so that we clear + * the complete object on kzalloc. + */ + cachep->object_size = max_t(int, cachep->object_size, size); + } + return cachep; +} + +/** + * __kmem_cache_create - Create a cache. + * @cachep: cache management descriptor + * @flags: SLAB flags + * + * Returns a ptr to the cache on success, NULL on failure. + * Cannot be called within a int, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + */ +int +__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) +{ + size_t left_over, freelist_size; + size_t ralign = BYTES_PER_WORD; + gfp_t gfp; + int err; + size_t size = cachep->size; + +#if DEBUG +#if FORCED_DEBUG + /* + * Enable redzoning and last user accounting, except for caches with + * large objects, if the increased size would increase the object size + * above the next power of two: caches with object sizes just above a + * power of two have a significant amount of internal fragmentation. + */ + if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + + 2 * sizeof(unsigned long long))) + flags |= SLAB_RED_ZONE | SLAB_STORE_USER; + if (!(flags & SLAB_DESTROY_BY_RCU)) + flags |= SLAB_POISON; +#endif + if (flags & SLAB_DESTROY_BY_RCU) + BUG_ON(flags & SLAB_POISON); +#endif + + /* + * Check that size is in terms of words. This is needed to avoid + * unaligned accesses for some archs when redzoning is used, and makes + * sure any on-slab bufctl's are also correctly aligned. 
+ */ + if (size & (BYTES_PER_WORD - 1)) { + size += (BYTES_PER_WORD - 1); + size &= ~(BYTES_PER_WORD - 1); + } + + if (flags & SLAB_RED_ZONE) { + ralign = REDZONE_ALIGN; + /* If redzoning, ensure that the second redzone is suitably + * aligned, by adjusting the object size accordingly. */ + size += REDZONE_ALIGN - 1; + size &= ~(REDZONE_ALIGN - 1); + } + + /* 3) caller mandated alignment */ + if (ralign < cachep->align) { + ralign = cachep->align; + } + /* disable debug if necessary */ + if (ralign > __alignof__(unsigned long long)) + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); + /* + * 4) Store it. + */ + cachep->align = ralign; + + if (slab_is_available()) + gfp = GFP_KERNEL; + else + gfp = GFP_NOWAIT; + +#if DEBUG + + /* + * Both debugging options require word-alignment which is calculated + * into align above. + */ + if (flags & SLAB_RED_ZONE) { + /* add space for red zone words */ + cachep->obj_offset += sizeof(unsigned long long); + size += 2 * sizeof(unsigned long long); + } + if (flags & SLAB_STORE_USER) { + /* user store requires one word storage behind the end of + * the real object. But if the second red zone needs to be + * aligned to 64 bits, we must allow that much space. + */ + if (flags & SLAB_RED_ZONE) + size += REDZONE_ALIGN; + else + size += BYTES_PER_WORD; + } +#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) + if (size >= kmalloc_size(INDEX_NODE + 1) + && cachep->object_size > cache_line_size() + && ALIGN(size, cachep->align) < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); + size = PAGE_SIZE; + } +#endif +#endif + + /* + * Determine if the slab management is 'on' or 'off' slab. + * (bootstrapping cannot cope with offslab caches so don't do + * it too early on. 
Always use on-slab management when + * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) + */ + if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && + !(flags & SLAB_NOLEAKTRACE)) + /* + * Size is large, assume best to place the slab management obj + * off-slab (should allow better packing of objs). + */ + flags |= CFLGS_OFF_SLAB; + + size = ALIGN(size, cachep->align); + /* + * We should restrict the number of objects in a slab to implement + * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. + */ + if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) + size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); + + left_over = calculate_slab_order(cachep, size, cachep->align, flags); + + if (!cachep->num) + return -E2BIG; + + freelist_size = calculate_freelist_size(cachep->num, cachep->align); + + /* + * If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. + */ + if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { + flags &= ~CFLGS_OFF_SLAB; + left_over -= freelist_size; + } + + if (flags & CFLGS_OFF_SLAB) { + /* really off slab. No need for manual alignment */ + freelist_size = calculate_freelist_size(cachep->num, 0); + +#ifdef CONFIG_PAGE_POISONING + /* If we're going to use the generic kernel_map_pages() + * poisoning, then it's going to smash the contents of + * the redzone and userword anyhow, so switch them off. + */ + if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); +#endif + } + + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. 
*/ + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; + cachep->colour = left_over / cachep->colour_off; + cachep->freelist_size = freelist_size; + cachep->flags = flags; + cachep->allocflags = __GFP_COMP; + if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) + cachep->allocflags |= GFP_DMA; + cachep->size = size; + cachep->reciprocal_buffer_size = reciprocal_value(size); + + if (flags & CFLGS_OFF_SLAB) { + cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); + /* + * This is a possibility for one of the kmalloc_{dma,}_caches. + * But since we go off slab only for object size greater than + * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created + * in ascending order,this should not happen at all. + * But leave a BUG_ON for some lucky dude. + */ + BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); + } + + err = setup_cpu_cache(cachep, gfp); + if (err) { + __kmem_cache_shutdown(cachep); + return err; + } + + return 0; +} + +#if DEBUG +static void check_irq_off(void) +{ + BUG_ON(!irqs_disabled()); +} + +static void check_irq_on(void) +{ + BUG_ON(irqs_disabled()); +} + +static void check_spinlock_acquired(struct kmem_cache *cachep) +{ +#ifdef CONFIG_SMP + check_irq_off(); + assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); +#endif +} + +static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) +{ +#ifdef CONFIG_SMP + check_irq_off(); + assert_spin_locked(&get_node(cachep, node)->list_lock); +#endif +} + +#else +#define check_irq_off() do { } while(0) +#define check_irq_on() do { } while(0) +#define check_spinlock_acquired(x) do { } while(0) +#define check_spinlock_acquired_node(x, y) do { } while(0) +#endif + +static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, + struct array_cache *ac, + int force, int node); + +static void do_drain(void *arg) +{ + struct kmem_cache *cachep = arg; + struct array_cache *ac; + int node = numa_mem_id(); + struct kmem_cache_node *n; + 
LIST_HEAD(list); + + check_irq_off(); + ac = cpu_cache_get(cachep); + n = get_node(cachep, node); + spin_lock(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); + spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); + ac->avail = 0; +} + +static void drain_cpu_caches(struct kmem_cache *cachep) +{ + struct kmem_cache_node *n; + int node; + + on_each_cpu(do_drain, cachep, 1); + check_irq_on(); + for_each_kmem_cache_node(cachep, node, n) + if (n->alien) + drain_alien_cache(cachep, n->alien); + + for_each_kmem_cache_node(cachep, node, n) + drain_array(cachep, n, n->shared, 1, node); +} + +/* + * Remove slabs from the list of free slabs. + * Specify the number of slabs to drain in tofree. + * + * Returns the actual number of slabs released. + */ +static int drain_freelist(struct kmem_cache *cache, + struct kmem_cache_node *n, int tofree) +{ + struct list_head *p; + int nr_freed; + struct page *page; + + nr_freed = 0; + while (nr_freed < tofree && !list_empty(&n->slabs_free)) { + + spin_lock_irq(&n->list_lock); + p = n->slabs_free.prev; + if (p == &n->slabs_free) { + spin_unlock_irq(&n->list_lock); + goto out; + } + + page = list_entry(p, struct page, lru); +#if DEBUG + BUG_ON(page->active); +#endif + list_del(&page->lru); + /* + * Safe to drop the lock. The slab is no longer linked + * to the cache. + */ + n->free_objects -= cache->num; + spin_unlock_irq(&n->list_lock); + slab_destroy(cache, page); + nr_freed++; + } +out: + return nr_freed; +} + +int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) +{ + int ret = 0; + int node; + struct kmem_cache_node *n; + + drain_cpu_caches(cachep); + + check_irq_on(); + for_each_kmem_cache_node(cachep, node, n) { + drain_freelist(cachep, n, slabs_tofree(cachep, n)); + + ret += !list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial); + } + return (ret ? 
1 : 0); +} + +int __kmem_cache_shutdown(struct kmem_cache *cachep) +{ + int i; + struct kmem_cache_node *n; + int rc = __kmem_cache_shrink(cachep, false); + + if (rc) + return rc; + + free_percpu(cachep->cpu_cache); + + /* NUMA: free the node structures */ + for_each_kmem_cache_node(cachep, i, n) { + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + cachep->node[i] = NULL; + } + return 0; +} + +/* + * Get the memory for a slab management obj. + * + * For a slab cache when the slab descriptor is off-slab, the + * slab descriptor can't come from the same cache which is being created, + * because if that were the case, we would defer the creation of + * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. + * And we eventually call down to __kmem_cache_create(), which + * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one. + * This is a "chicken-and-egg" problem. + * + * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, + * which are all initialized during kmem_cache_init(). + */ +static void *alloc_slabmgmt(struct kmem_cache *cachep, + struct page *page, int colour_off, + gfp_t local_flags, int nodeid) +{ + void *freelist; + void *addr = page_address(page); + + if (OFF_SLAB(cachep)) { + /* Slab management obj is off-slab. 
*/ + freelist = kmem_cache_alloc_node(cachep->freelist_cache, + local_flags, nodeid); + if (!freelist) + return NULL; + } else { + freelist = addr + colour_off; /* on-slab: freelist sits at the colour offset, objects start after it */ + colour_off += cachep->freelist_size; + } + page->active = 0; + page->s_mem = addr + colour_off; + return freelist; +} + +static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx) +{ + return ((freelist_idx_t *)page->freelist)[idx]; /* page->freelist is indexed as an array of freelist_idx_t */ +} + +static inline void set_free_obj(struct page *page, + unsigned int idx, freelist_idx_t val) +{ + ((freelist_idx_t *)(page->freelist))[idx] = val; +} + +static void cache_init_objs(struct kmem_cache *cachep, + struct page *page) +{ + int i; + + for (i = 0; i < cachep->num; i++) { + void *objp = index_to_obj(cachep, page, i); +#if DEBUG + /* need to poison the objs? */ + if (cachep->flags & SLAB_POISON) + poison_obj(cachep, objp, POISON_FREE); + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = NULL; + + if (cachep->flags & SLAB_RED_ZONE) { + *dbg_redzone1(cachep, objp) = RED_INACTIVE; + *dbg_redzone2(cachep, objp) = RED_INACTIVE; + } + /* + * Constructors are not allowed to allocate memory from the same + * cache which they are a constructor for. Otherwise, deadlock. + * They must also be threaded. 
+ */ + if (cachep->ctor && !(cachep->flags & SLAB_POISON)) + cachep->ctor(objp + obj_offset(cachep)); + + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "constructor overwrote the" + " end of an object"); + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "constructor overwrote the" + " start of an object"); + } + if ((cachep->size % PAGE_SIZE) == 0 && + OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) + kernel_map_pages(virt_to_page(objp), + cachep->size / PAGE_SIZE, 0); +#else + if (cachep->ctor) + cachep->ctor(objp); +#endif + set_obj_status(page, i, OBJECT_FREE); + set_free_obj(page, i, i); + } +} + +static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) +{ + if (CONFIG_ZONE_DMA_FLAG) { + if (flags & GFP_DMA) + BUG_ON(!(cachep->allocflags & GFP_DMA)); + else + BUG_ON(cachep->allocflags & GFP_DMA); + } +} + +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, + int nodeid) +{ + void *objp; + + objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); + page->active++; +#if DEBUG + WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); +#endif + + return objp; +} + +static void slab_put_obj(struct kmem_cache *cachep, struct page *page, + void *objp, int nodeid) +{ + unsigned int objnr = obj_to_index(cachep, page, objp); +#if DEBUG + unsigned int i; + + /* Verify that the slab belongs to the intended node */ + WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); + + /* Detect a double free: objnr must not already be among the free entries [page->active, cachep->num) */ + for (i = page->active; i < cachep->num; i++) { + if (get_free_obj(page, i) == objnr) { + printk(KERN_ERR "slab: double free detected in cache " + "'%s', objp %p\n", cachep->name, objp); + BUG(); + } + } +#endif + page->active--; + set_free_obj(page, page->active, objnr); +} + +/* + * Map pages beginning at addr to the given cache and slab. 
This is required + * for the slab allocator to be able to lookup the cache and slab of a + * virtual address for kfree, ksize, and slab debugging. + */ +static void slab_map_pages(struct kmem_cache *cache, struct page *page, + void *freelist) +{ + page->slab_cache = cache; + page->freelist = freelist; +} + +/* + * Grow (by 1) the number of slabs within a cache. This is called by + * kmem_cache_alloc() when there are no active objs left in a cache. + */ +static int cache_grow(struct kmem_cache *cachep, + gfp_t flags, int nodeid, struct page *page) +{ + void *freelist; + size_t offset; + gfp_t local_flags; + struct kmem_cache_node *n; + + /* + * Be lazy and only check for valid flags here, keeping it out of the + * critical path in kmem_cache_alloc(). + */ + if (unlikely(flags & GFP_SLAB_BUG_MASK)) { + pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); + BUG(); + } + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); + + /* Take the node list lock to change the colour_next on this node */ + check_irq_off(); + n = get_node(cachep, nodeid); + spin_lock(&n->list_lock); + + /* Get the colour for this slab, and calculate the next value (wrapping at cachep->colour). */ + offset = n->colour_next; + n->colour_next++; + if (n->colour_next >= cachep->colour) + n->colour_next = 0; + spin_unlock(&n->list_lock); + + offset *= cachep->colour_off; + + if (local_flags & __GFP_WAIT) + local_irq_enable(); + + /* + * The test for missing atomic flag is performed here, rather than + * the more obvious place, simply to reduce the critical path length + * in kmem_cache_alloc(). If a caller is seriously mis-behaving they + * will eventually be caught here (where it matters). + */ + kmem_flagcheck(cachep, flags); + + /* + * Get mem for the objs. Attempt to allocate a physical page from + * 'nodeid'. + */ + if (!page) + page = kmem_getpages(cachep, local_flags, nodeid); + if (!page) + goto failed; + + /* Get slab management. 
*/ + freelist = alloc_slabmgmt(cachep, page, offset, + local_flags & ~GFP_CONSTRAINT_MASK, nodeid); + if (!freelist) + goto opps1; + + slab_map_pages(cachep, page, freelist); + + cache_init_objs(cachep, page); + + if (local_flags & __GFP_WAIT) + local_irq_disable(); + check_irq_off(); + spin_lock(&n->list_lock); + + /* Make slab active: queue it on the node's list of free slabs. */ + list_add_tail(&page->lru, &(n->slabs_free)); + STATS_INC_GROWN(cachep); + n->free_objects += cachep->num; + spin_unlock(&n->list_lock); + return 1; +opps1: + kmem_freepages(cachep, page); +failed: + if (local_flags & __GFP_WAIT) + local_irq_disable(); + return 0; +} + +#if DEBUG + +/* + * Perform extra freeing checks: + * - detect bad pointers. + * - POISON/RED_ZONE checking + */ +static void kfree_debugcheck(const void *objp) +{ + if (!virt_addr_valid(objp)) { + printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", + (unsigned long)objp); + BUG(); + } +} + +static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) +{ + unsigned long long redzone1, redzone2; + + redzone1 = *dbg_redzone1(cache, obj); + redzone2 = *dbg_redzone2(cache, obj); + + /* + * Redzone is ok. 
+ */ + if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) + return; + + if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) + slab_error(cache, "double free detected"); + else + slab_error(cache, "memory outside object was overwritten"); + + printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", + obj, redzone1, redzone2); +} + +static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + unsigned int objnr; + struct page *page; + + BUG_ON(virt_to_cache(objp) != cachep); + + objp -= obj_offset(cachep); + kfree_debugcheck(objp); + page = virt_to_head_page(objp); + + if (cachep->flags & SLAB_RED_ZONE) { + verify_redzone_free(cachep, objp); + *dbg_redzone1(cachep, objp) = RED_INACTIVE; + *dbg_redzone2(cachep, objp) = RED_INACTIVE; + } + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = (void *)caller; + + objnr = obj_to_index(cachep, page, objp); + + BUG_ON(objnr >= cachep->num); + BUG_ON(objp != index_to_obj(cachep, page, objnr)); + + set_obj_status(page, objnr, OBJECT_FREE); + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { + store_stackinfo(cachep, objp, caller); + kernel_map_pages(virt_to_page(objp), + cachep->size / PAGE_SIZE, 0); + } else { + poison_obj(cachep, objp, POISON_FREE); + } +#else + poison_obj(cachep, objp, POISON_FREE); +#endif + } + return objp; +} + +#else /* !DEBUG: no checks, the object pointer passes through unchanged */ +#define kfree_debugcheck(x) do { } while(0) +#define cache_free_debugcheck(x,objp,z) (objp) +#endif + +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, + bool force_refill) +{ + int batchcount; + struct kmem_cache_node *n; + struct array_cache *ac; + int node; + + check_irq_off(); + node = numa_mem_id(); + if (unlikely(force_refill)) + goto force_grow; +retry: + ac = cpu_cache_get(cachep); + batchcount = ac->batchcount; + if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { + /* + * If there was little recent activity on 
this cache, then + * perform only a partial refill. Otherwise we could generate + * refill bouncing. + */ + batchcount = BATCHREFILL_LIMIT; + } + n = get_node(cachep, node); + + BUG_ON(ac->avail > 0 || !n); + spin_lock(&n->list_lock); + + /* See if we can refill from the shared array */ + if (n->shared && transfer_objects(ac, n->shared, batchcount)) { + n->shared->touched = 1; + goto alloc_done; + } + + while (batchcount > 0) { + struct list_head *entry; + struct page *page; + /* Pick the slab to allocate from: partial slabs first, then free ones. */ + entry = n->slabs_partial.next; + if (entry == &n->slabs_partial) { + n->free_touched = 1; + entry = n->slabs_free.next; + if (entry == &n->slabs_free) + goto must_grow; + } + + page = list_entry(entry, struct page, lru); + check_spinlock_acquired(cachep); + + /* + * The slab was either on partial or free list so + * there must be at least one object available for + * allocation. + */ + BUG_ON(page->active >= cachep->num); + + while (page->active < cachep->num && batchcount--) { + STATS_INC_ALLOCED(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + ac_put_obj(cachep, ac, slab_get_obj(cachep, page, + node)); + } + + /* move the slab to the list matching its new fill level (full or partial): */ + list_del(&page->lru); + if (page->active == cachep->num) + list_add(&page->lru, &n->slabs_full); + else + list_add(&page->lru, &n->slabs_partial); + } + +must_grow: + n->free_objects -= ac->avail; +alloc_done: + spin_unlock(&n->list_lock); + + if (unlikely(!ac->avail)) { + int x; +force_grow: + x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); + + /* cache_grow can reenable interrupts, then ac could change. */ + ac = cpu_cache_get(cachep); + node = numa_mem_id(); + + /* no objects in sight? abort */ + if (!x && (ac->avail == 0 || force_refill)) + return NULL; + + if (!ac->avail) /* objects refilled by interrupt? 
*/ + goto retry; + } + ac->touched = 1; + + return ac_get_obj(cachep, ac, flags, force_refill); +} + +static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, + gfp_t flags) +{ + might_sleep_if(flags & __GFP_WAIT); +#if DEBUG + kmem_flagcheck(cachep, flags); +#endif +} + +#if DEBUG +static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, + gfp_t flags, void *objp, unsigned long caller) +{ + struct page *page; + + if (!objp) + return objp; + if (cachep->flags & SLAB_POISON) { +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) + kernel_map_pages(virt_to_page(objp), + cachep->size / PAGE_SIZE, 1); + else + check_poison_obj(cachep, objp); +#else + check_poison_obj(cachep, objp); +#endif + poison_obj(cachep, objp, POISON_INUSE); + } + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = (void *)caller; + + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || + *dbg_redzone2(cachep, objp) != RED_INACTIVE) { + slab_error(cachep, "double free, or memory outside" + " object was overwritten"); + printk(KERN_ERR + "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); + } + *dbg_redzone1(cachep, objp) = RED_ACTIVE; + *dbg_redzone2(cachep, objp) = RED_ACTIVE; + } + + page = virt_to_head_page(objp); + set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); + objp += obj_offset(cachep); + if (cachep->ctor && cachep->flags & SLAB_POISON) + cachep->ctor(objp); + if (ARCH_SLAB_MINALIGN && + ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { + printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + objp, (int)ARCH_SLAB_MINALIGN); + } + return objp; +} +#else /* !DEBUG: the object pointer passes through unchanged */ +#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) +#endif + +static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) +{ + if (unlikely(cachep == kmem_cache)) + return false; + + return 
should_failslab(cachep->object_size, flags, cachep->flags); +} + +static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + void *objp; + struct array_cache *ac; + bool force_refill = false; + + check_irq_off(); + + ac = cpu_cache_get(cachep); + if (likely(ac->avail)) { + ac->touched = 1; + objp = ac_get_obj(cachep, ac, flags, false); + + /* + * Allow for the possibility all avail objects are not allowed + * by the current flags + */ + if (objp) { + STATS_INC_ALLOCHIT(cachep); + goto out; + } + force_refill = true; + } + + STATS_INC_ALLOCMISS(cachep); + objp = cache_alloc_refill(cachep, flags, force_refill); + /* + * the 'ac' may be updated by cache_alloc_refill(), + * and kmemleak_erase() requires its correct value. + */ + ac = cpu_cache_get(cachep); + +out: + /* + * To avoid a false negative, if an object that is in one of the + * per-CPU caches is leaked, we need to make sure kmemleak doesn't + * treat the array pointers as a reference to the object. + */ + if (objp) + kmemleak_erase(&ac->entry[ac->avail]); + return objp; +} + +#ifdef CONFIG_NUMA +/* + * Try allocating on another node if PFA_SPREAD_SLAB or a mempolicy is set. + * Returns NULL when the selected node is the local one, in interrupt + * context, or when __GFP_THISNODE is set. + * + * If we are in_interrupt, then process context, including cpusets and + * mempolicy, may not apply and should not be used for allocation policy. + */ +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + int nid_alloc, nid_here; + + if (in_interrupt() || (flags & __GFP_THISNODE)) + return NULL; + nid_alloc = nid_here = numa_mem_id(); + if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) + nid_alloc = cpuset_slab_spread_node(); + else if (current->mempolicy) + nid_alloc = mempolicy_slab_node(); + if (nid_alloc != nid_here) + return ____cache_alloc_node(cachep, flags, nid_alloc); + return NULL; +} + +/* + * Fallback function if there was no memory available and no objects on a + * certain node and fall back is permitted. 
First we scan all the + * available node for available objects. If that fails then we + * perform an allocation without specifying a node. This allows the page + * allocator to do its reclaim / fallback magic. We then insert the + * slab into the proper nodelist and then allocate from it. + */ +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +{ + struct zonelist *zonelist; + gfp_t local_flags; + struct zoneref *z; + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(flags); + void *obj = NULL; + int nid; + unsigned int cpuset_mems_cookie; + + if (flags & __GFP_THISNODE) + return NULL; + + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); + +retry_cpuset: + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), flags); + +retry: + /* + * Look through allowed nodes for objects available + * from existing per node queues, i.e. nodes whose kmem_cache_node + * reports a non-zero free_objects count. + */ + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + nid = zone_to_nid(zone); + + if (cpuset_zone_allowed(zone, flags) && + get_node(cache, nid) && + get_node(cache, nid)->free_objects) { + obj = ____cache_alloc_node(cache, + gfp_exact_node(flags), nid); + if (obj) + break; + } + } + + if (!obj) { + /* + * This allocation will be performed within the constraints + * of the current cpuset / memory policy requirements. + * We may trigger various forms of reclaim on the allowed + * set and go into memory reserves if necessary. 
+ */ + struct page *page; + + if (local_flags & __GFP_WAIT) + local_irq_enable(); + kmem_flagcheck(cache, flags); + page = kmem_getpages(cache, local_flags, numa_mem_id()); + if (local_flags & __GFP_WAIT) + local_irq_disable(); + if (page) { + /* + * Insert into the appropriate per node queues + */ + nid = page_to_nid(page); + if (cache_grow(cache, flags, nid, page)) { + obj = ____cache_alloc_node(cache, + gfp_exact_node(flags), nid); + if (!obj) + /* + * Another processor may allocate the + * objects in the slab since we are + * not holding any locks. + */ + goto retry; + } else { + /* cache_grow already freed the page */ + obj = NULL; + } + } + } + + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return obj; +} + +/* + * An interface to enable slab creation on nodeid + */ +static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid) +{ + struct list_head *entry; + struct page *page; + struct kmem_cache_node *n; + void *obj; + int x; + + VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); + n = get_node(cachep, nodeid); + BUG_ON(!n); + +retry: + check_irq_off(); + spin_lock(&n->list_lock); + entry = n->slabs_partial.next; + if (entry == &n->slabs_partial) { + n->free_touched = 1; + entry = n->slabs_free.next; + if (entry == &n->slabs_free) + goto must_grow; + } + + page = list_entry(entry, struct page, lru); + check_spinlock_acquired_node(cachep, nodeid); + + STATS_INC_NODEALLOCS(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + BUG_ON(page->active == cachep->num); + + obj = slab_get_obj(cachep, page, nodeid); + n->free_objects--; + /* move the slab to the list matching its new fill level (full or partial): */ + list_del(&page->lru); + + if (page->active == cachep->num) + list_add(&page->lru, &n->slabs_full); + else + list_add(&page->lru, &n->slabs_partial); + + spin_unlock(&n->list_lock); + goto done; + +must_grow: + spin_unlock(&n->list_lock); + x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); + if (x) + goto 
retry; + + return fallback_alloc(cachep, flags); + +done: + return obj; +} + +static __always_inline void * +slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, + unsigned long caller) +{ + unsigned long save_flags; + void *ptr; + int slab_node = numa_mem_id(); + + flags &= gfp_allowed_mask; + + lockdep_trace_alloc(flags); + + if (slab_should_failslab(cachep, flags)) + return NULL; + + cachep = memcg_kmem_get_cache(cachep, flags); + + cache_alloc_debugcheck_before(cachep, flags); + local_irq_save(save_flags); + + if (nodeid == NUMA_NO_NODE) + nodeid = slab_node; + + if (unlikely(!get_node(cachep, nodeid))) { + /* Node not bootstrapped yet */ + ptr = fallback_alloc(cachep, flags); + goto out; + } + + if (nodeid == slab_node) { + /* + * Use the locally cached objects if possible. + * However ____cache_alloc does not allow fallback + * to other nodes. It may fail while we still have + * objects on other nodes available. + */ + ptr = ____cache_alloc(cachep, flags); + if (ptr) + goto out; + } + /* ____cache_alloc_node can fall back to other nodes */ + ptr = ____cache_alloc_node(cachep, flags, nodeid); + out: + local_irq_restore(save_flags); + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); + kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, + flags); + + if (likely(ptr)) { + kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(ptr, 0, cachep->object_size); + } + + memcg_kmem_put_cache(cachep); + return ptr; +} + +static __always_inline void * +__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) +{ + void *objp; + + if (current->mempolicy || cpuset_do_slab_mem_spread()) { + objp = alternate_node_alloc(cache, flags); + if (objp) + goto out; + } + objp = ____cache_alloc(cache, flags); + + /* + * We may just have run out of memory on the local node. 
+ * ____cache_alloc_node() knows how to locate memory on other nodes + */ + if (!objp) + objp = ____cache_alloc_node(cache, flags, numa_mem_id()); + + out: + return objp; +} +#else + +static __always_inline void * +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + return ____cache_alloc(cachep, flags); +} + +#endif /* CONFIG_NUMA */ + +static __always_inline void * +slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) +{ + unsigned long save_flags; + void *objp; + + flags &= gfp_allowed_mask; + + lockdep_trace_alloc(flags); + + if (slab_should_failslab(cachep, flags)) + return NULL; + + cachep = memcg_kmem_get_cache(cachep, flags); + + cache_alloc_debugcheck_before(cachep, flags); + local_irq_save(save_flags); + objp = __do_cache_alloc(cachep, flags); + local_irq_restore(save_flags); + objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); + kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, + flags); + prefetchw(objp); + + if (likely(objp)) { + kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); + if (unlikely(flags & __GFP_ZERO)) + memset(objp, 0, cachep->object_size); + } + + memcg_kmem_put_cache(cachep); + return objp; +} + +/* + * Caller must hold the list_lock of the kmem_cache_node for @node. + * @list: receives slabs that became completely free while the node was over its free_limit; they are detached from the node and must be destroyed by the caller + */ +static void free_block(struct kmem_cache *cachep, void **objpp, + int nr_objects, int node, struct list_head *list) +{ + int i; + struct kmem_cache_node *n = get_node(cachep, node); + + for (i = 0; i < nr_objects; i++) { + void *objp; + struct page *page; + + clear_obj_pfmemalloc(&objpp[i]); + objp = objpp[i]; + + page = virt_to_head_page(objp); + list_del(&page->lru); + check_spinlock_acquired_node(cachep, node); + slab_put_obj(cachep, page, objp, node); + STATS_DEC_ACTIVE(cachep); + n->free_objects++; + + /* fixup slab chains */ + if (page->active == 0) { + if (n->free_objects > n->free_limit) { + n->free_objects -= cachep->num; + 
list_add_tail(&page->lru, list); + } else { + list_add(&page->lru, &n->slabs_free); + } + } else { + /* Unconditionally move a slab to the end of the + * partial list on free - maximum time for the + * other objects to be freed, too. + */ + list_add_tail(&page->lru, &n->slabs_partial); + } + } +} + +static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) +{ + int batchcount; + struct kmem_cache_node *n; + int node = numa_mem_id(); + LIST_HEAD(list); + + batchcount = ac->batchcount; +#if DEBUG + BUG_ON(!batchcount || batchcount > ac->avail); +#endif + check_irq_off(); + n = get_node(cachep, node); + spin_lock(&n->list_lock); + if (n->shared) { + struct array_cache *shared_array = n->shared; + int max = shared_array->limit - shared_array->avail; /* room left in the shared array */ + if (max) { + if (batchcount > max) + batchcount = max; + memcpy(&(shared_array->entry[shared_array->avail]), + ac->entry, sizeof(void *) * batchcount); + shared_array->avail += batchcount; + goto free_done; + } + } + + free_block(cachep, ac->entry, batchcount, node, &list); +free_done: +#if STATS + { + int i = 0; /* number of slabs found on n->slabs_free */ + struct list_head *p; + + p = n->slabs_free.next; + while (p != &(n->slabs_free)) { + struct page *page; + + page = list_entry(p, struct page, lru); + BUG_ON(page->active); + + i++; + p = p->next; + } + STATS_SET_FREEABLE(cachep, i); + } +#endif + spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); + ac->avail -= batchcount; + memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); +} + +/* + * Release an obj back to its cache. If the obj has a constructed state, it must + * be in this state _before_ it is released. Called with disabled ints. 
+ */ +static inline void __cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + struct array_cache *ac = cpu_cache_get(cachep); + + check_irq_off(); + kmemleak_free_recursive(objp, cachep->flags); + objp = cache_free_debugcheck(cachep, objp, caller); + + kmemcheck_slab_free(cachep, objp, cachep->object_size); + + /* + * Skip calling cache_free_alien() when the platform is not numa. + * This will avoid cache misses that happen while accessing slabp (which + * is per page memory reference) to get nodeid. Instead use a global + * variable to skip the call, which is most likely to be present in + * the cache. + */ + if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) + return; + + if (ac->avail < ac->limit) { + STATS_INC_FREEHIT(cachep); + } else { + STATS_INC_FREEMISS(cachep); + cache_flusharray(cachep, ac); /* per-cpu array is full: flush a batch to make room */ + } + + ac_put_obj(cachep, ac, objp); +} + +/** + * kmem_cache_alloc - Allocate an object + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * + * Allocate an object from this cache. The flags are only relevant + * if the cache has no available objects. + */ +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + void *ret = slab_alloc(cachep, flags, _RET_IP_); + + trace_kmem_cache_alloc(_RET_IP_, ret, + cachep->object_size, cachep->size, flags); + + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc); + +#ifdef CONFIG_TRACING +void * +kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) +{ + void *ret; + + ret = slab_alloc(cachep, flags, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, + size, cachep->size, flags); + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_trace); +#endif + +#ifdef CONFIG_NUMA +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * @nodeid: node number of the target node. 
+ * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. + * + * Fallback to other node is possible if __GFP_THISNODE is not set. + */ +void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) +{ + void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + + trace_kmem_cache_alloc_node(_RET_IP_, ret, + cachep->object_size, cachep->size, + flags, nodeid); + + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_node); + +#ifdef CONFIG_TRACING +/* + * Same allocation as kmem_cache_alloc_node(), but reported through the + * kmalloc_node tracepoint with the requested @size. The call site recorded + * is this function's own return address (_RET_IP_). + */ +void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid, + size_t size) +{ + void *ret; + + ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, + size, cachep->size, + flags, nodeid); + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); +#endif + +/* + * Common body of the __kmalloc_node*() entry points. + * + * @caller is passed to slab_alloc_node() and to the kmalloc_node tracepoint, + * so the call site handed in by __kmalloc_node_track_caller() is the one + * that gets recorded, mirroring what __do_kmalloc() does with its @caller. + * Going through kmem_cache_alloc_node_trace() here would record that + * function's own _RET_IP_ and silently drop @caller. + * + * Returns the kmalloc_slab() result unchanged when it is ZERO_OR_NULL_PTR. + */ +static __always_inline void * +__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) +{ + struct kmem_cache *cachep; + void *ret; + + cachep = kmalloc_slab(size, flags); + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; + ret = slab_alloc_node(cachep, flags, node, caller); + + trace_kmalloc_node(caller, ret, + size, cachep->size, + flags, node); + + return ret; +} + +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, flags, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); +#endif /* CONFIG_NUMA */ + +/** + * __do_kmalloc - allocate memory + * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate (see kmalloc). 
+ * @caller: function caller for debug tracking of the caller + */ +static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, + unsigned long caller) +{ + struct kmem_cache *cachep; + void *ret; + + cachep = kmalloc_slab(size, flags); + if (unlikely(ZERO_OR_NULL_PTR(cachep))) + return cachep; + ret = slab_alloc(cachep, flags, caller); + + trace_kmalloc(caller, ret, + size, cachep->size, flags); + + return ret; +} + +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc(size, flags, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + +void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) +{ + return __do_kmalloc(size, flags, caller); +} +EXPORT_SYMBOL(__kmalloc_track_caller); + +/** + * kmem_cache_free - Deallocate an object + * @cachep: The cache the allocation was from. + * @objp: The previously allocated object. + * + * Free an object which was previously allocated from this + * cache. Nothing is freed if cache_from_obj() returns NULL. + */ +void kmem_cache_free(struct kmem_cache *cachep, void *objp) +{ + unsigned long flags; + cachep = cache_from_obj(cachep, objp); + if (!cachep) + return; + + local_irq_save(flags); + debug_check_no_locks_freed(objp, cachep->object_size); + if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, cachep->object_size); + __cache_free(cachep, objp, _RET_IP_); + local_irq_restore(flags); + + trace_kmem_cache_free(_RET_IP_, objp); +} +EXPORT_SYMBOL(kmem_cache_free); + +/** + * kfree - free previously allocated memory + * @objp: pointer returned by kmalloc. + * + * If @objp is NULL, no operation is performed. + * + * Don't free memory not originally allocated by kmalloc() + * or you will run into trouble. 
+ */ +void kfree(const void *objp) +{ + struct kmem_cache *c; + unsigned long flags; + + trace_kfree(_RET_IP_, objp); + + if (unlikely(ZERO_OR_NULL_PTR(objp))) + return; + local_irq_save(flags); + kfree_debugcheck(objp); + c = virt_to_cache(objp); + debug_check_no_locks_freed(objp, c->object_size); + + debug_check_no_obj_freed(objp, c->object_size); + __cache_free(c, (void *)objp, _RET_IP_); + local_irq_restore(flags); +} +EXPORT_SYMBOL(kfree); + +/* + * This initializes kmem_cache_node or resizes various caches for all nodes. + * Returns 0 on success, or -ENOMEM if one of the per-node allocations fails. + */ +static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) +{ + int node; + struct kmem_cache_node *n; + struct array_cache *new_shared; + struct alien_cache **new_alien = NULL; + + for_each_online_node(node) { + + if (use_alien_caches) { + new_alien = alloc_alien_cache(node, cachep->limit, gfp); + if (!new_alien) + goto fail; + } + + new_shared = NULL; + if (cachep->shared) { + new_shared = alloc_arraycache(node, + cachep->shared*cachep->batchcount, + 0xbaadf00d, gfp); + if (!new_shared) { + free_alien_cache(new_alien); + goto fail; + } + } + + n = get_node(cachep, node); + if (n) { + /* node already set up: swap in the new shared array, flushing the old one */ + struct array_cache *shared = n->shared; + LIST_HEAD(list); + + spin_lock_irq(&n->list_lock); + + if (shared) + free_block(cachep, shared->entry, + shared->avail, node, &list); + + n->shared = new_shared; + if (!n->alien) { + n->alien = new_alien; + new_alien = NULL; + } + n->free_limit = (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + kfree(shared); + free_alien_cache(new_alien); + continue; + } + n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); + if (!n) { + free_alien_cache(new_alien); + kfree(new_shared); + goto fail; + } + + kmem_cache_node_init(n); + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; + n->shared = new_shared; + n->alien = new_alien; + n->free_limit = (1 + nr_cpus_node(node)) * + 
cachep->batchcount + cachep->num; + cachep->node[node] = n; + } + return 0; + +fail: + if (!cachep->list.next) { + /* Cache is not active yet. Roll back what we did */ + node--; + while (node >= 0) { + n = get_node(cachep, node); + if (n) { + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + cachep->node[node] = NULL; + } + node--; + } + } + return -ENOMEM; +} + +/* Always called with the slab_mutex held. Installs a newly allocated per-cpu array cache; if a previous one existed, its objects are flushed back to the node lists and it is freed. */ +static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared, gfp_t gfp) +{ + struct array_cache __percpu *cpu_cache, *prev; + int cpu; + + cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount); + if (!cpu_cache) + return -ENOMEM; + + prev = cachep->cpu_cache; + cachep->cpu_cache = cpu_cache; + kick_all_cpus_sync(); + + check_irq_on(); + cachep->batchcount = batchcount; + cachep->limit = limit; + cachep->shared = shared; + + if (!prev) + goto alloc_node; + + for_each_online_cpu(cpu) { + LIST_HEAD(list); + int node; + struct kmem_cache_node *n; + struct array_cache *ac = per_cpu_ptr(prev, cpu); + + node = cpu_to_mem(cpu); + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + } + free_percpu(prev); + +alloc_node: + return alloc_kmem_cache_node(cachep, gfp); +} + +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared, gfp_t gfp) +{ + int ret; + struct kmem_cache *c; + + ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); + + if (slab_state < FULL) + return ret; + + if ((ret < 0) || !is_root_cache(cachep)) + return ret; + + lockdep_assert_held(&slab_mutex); + for_each_memcg_cache(c, cachep) { + /* return value determined by the root cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); + } + + return ret; +} + +/* Called with slab_mutex held always */ +static int enable_cpucache(struct kmem_cache *cachep, 
gfp_t gfp) +{ + int err; + int limit = 0; + int shared = 0; + int batchcount = 0; + + if (!is_root_cache(cachep)) { + struct kmem_cache *root = memcg_root_cache(cachep); + limit = root->limit; + shared = root->shared; + batchcount = root->batchcount; + } + + if (limit && shared && batchcount) + goto skip_setup; + /* + * The head array serves three purposes: + * - create a LIFO ordering, i.e. return objects that are cache-warm + * - reduce the number of spinlock operations. + * - reduce the number of linked list operations on the slab and + * bufctl chains: array operations are cheaper. + * The numbers are guessed, we should auto-tune as described by + * Bonwick. + */ + if (cachep->size > 131072) + limit = 1; + else if (cachep->size > PAGE_SIZE) + limit = 8; + else if (cachep->size > 1024) + limit = 24; + else if (cachep->size > 256) + limit = 54; + else + limit = 120; + + /* + * CPU bound tasks (e.g. network routing) can exhibit cpu bound + * allocation behaviour: Most allocs on one cpu, most free operations + * on another cpu. For these cases, an efficient object passing between + * cpus is necessary. This is provided by a shared array. The array + * replaces Bonwick's magazine layer. + * On uniprocessor, it's functionally equivalent (but less efficient) + * to a larger limit. Thus disabled by default. + */ + shared = 0; + if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) + shared = 8; + +#if DEBUG + /* + * With debugging enabled, large batchcount lead to excessively long + * periods with disabled local interrupts. Limit the batchcount + */ + if (limit > 32) + limit = 32; +#endif + batchcount = (limit + 1) / 2; +skip_setup: + err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); + if (err) + printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", + cachep->name, -err); + return err; +} + +/* + * Drain an array if it contains any elements taking the node lock only if + * necessary. 
Note that the node listlock also protects the array_cache + * if drain_array() is used on the shared array. + */ +static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, + struct array_cache *ac, int force, int node) +{ + LIST_HEAD(list); + int tofree; + + if (!ac || !ac->avail) + return; + if (ac->touched && !force) { + ac->touched = 0; + } else { + spin_lock_irq(&n->list_lock); + if (ac->avail) { + tofree = force ? ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; + free_block(cachep, ac->entry, tofree, node, &list); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), + sizeof(void *) * ac->avail); + } + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + } +} + +/** + * cache_reap - Reclaim memory from caches. + * @w: work descriptor + * + * Called from workqueue/eventd every few seconds. + * Purpose: + * - clear the per-cpu caches for this CPU. + * - return freeable pages to the main free memory pool. + * + * If we cannot acquire the cache chain mutex then just give up - we'll try + * again on the next iteration. + */ +static void cache_reap(struct work_struct *w) +{ + struct kmem_cache *searchp; + struct kmem_cache_node *n; + int node = numa_mem_id(); + struct delayed_work *work = to_delayed_work(w); + + if (!mutex_trylock(&slab_mutex)) + /* Give up. Setup the next iteration. */ + goto out; + + list_for_each_entry(searchp, &slab_caches, list) { + check_irq_on(); + + /* + * We only take the node lock if absolutely necessary and we + * have established with reasonable certainty that + * we can do some work if the lock was obtained. + */ + n = get_node(searchp, node); + + reap_alien(searchp, n); + + drain_array(searchp, n, cpu_cache_get(searchp), 0, node); + + /* + * These are racy checks but it does not matter + * if we skip one check or scan twice. 
+ */ + if (time_after(n->next_reap, jiffies)) + goto next; + + n->next_reap = jiffies + REAPTIMEOUT_NODE; + + drain_array(searchp, n, n->shared, 0, node); + + if (n->free_touched) + n->free_touched = 0; + else { + int freed; + + freed = drain_freelist(searchp, n, (n->free_limit + + 5 * searchp->num - 1) / (5 * searchp->num)); + STATS_ADD_REAPED(searchp, freed); + } +next: + cond_resched(); + } + check_irq_on(); + mutex_unlock(&slab_mutex); + next_reap_node(); +out: + /* Set up the next iteration */ + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); +} + +#ifdef CONFIG_SLABINFO +void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) +{ + struct page *page; + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs = 0; + unsigned long num_slabs, free_objects = 0, shared_avail = 0; + const char *name; + char *error = NULL; + int node; + struct kmem_cache_node *n; + + active_objs = 0; + num_slabs = 0; + for_each_kmem_cache_node(cachep, node, n) { + + check_irq_on(); + spin_lock_irq(&n->list_lock); + + list_for_each_entry(page, &n->slabs_full, lru) { + if (page->active != cachep->num && !error) + error = "slabs_full accounting error"; + active_objs += cachep->num; + active_slabs++; + } + list_for_each_entry(page, &n->slabs_partial, lru) { + if (page->active == cachep->num && !error) + error = "slabs_partial accounting error"; + if (!page->active && !error) + error = "slabs_partial accounting error"; + active_objs += page->active; + active_slabs++; + } + list_for_each_entry(page, &n->slabs_free, lru) { + if (page->active && !error) + error = "slabs_free accounting error"; + num_slabs++; + } + free_objects += n->free_objects; + if (n->shared) + shared_avail += n->shared->avail; + + spin_unlock_irq(&n->list_lock); + } + num_slabs += active_slabs; + num_objs = num_slabs * cachep->num; + if (num_objs - active_objs != free_objects && !error) + error = "free_objects accounting error"; + + name = cachep->name; + if 
(error) + printk(KERN_ERR "slab: cache %s error: %s\n", name, error); + + sinfo->active_objs = active_objs; + sinfo->num_objs = num_objs; + sinfo->active_slabs = active_slabs; + sinfo->num_slabs = num_slabs; + sinfo->shared_avail = shared_avail; + sinfo->limit = cachep->limit; + sinfo->batchcount = cachep->batchcount; + sinfo->shared = cachep->shared; + sinfo->objects_per_slab = cachep->num; + sinfo->cache_order = cachep->gfporder; +} + +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) +{ +#if STATS + { /* node stats */ + unsigned long high = cachep->high_mark; + unsigned long allocs = cachep->num_allocations; + unsigned long grown = cachep->grown; + unsigned long reaped = cachep->reaped; + unsigned long errors = cachep->errors; + unsigned long max_freeable = cachep->max_freeable; + unsigned long node_allocs = cachep->node_allocs; + unsigned long node_frees = cachep->node_frees; + unsigned long overflows = cachep->node_overflow; + + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " + "%4lu %4lu %4lu %4lu %4lu", + allocs, high, grown, + reaped, errors, max_freeable, node_allocs, + node_frees, overflows); + } + /* cpu stats */ + { + unsigned long allochit = atomic_read(&cachep->allochit); + unsigned long allocmiss = atomic_read(&cachep->allocmiss); + unsigned long freehit = atomic_read(&cachep->freehit); + unsigned long freemiss = atomic_read(&cachep->freemiss); + + seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", + allochit, allocmiss, freehit, freemiss); + } +#endif +} + +#define MAX_SLABINFO_WRITE 128 +/** + * slabinfo_write - Tuning for the slab allocator + * @file: unused + * @buffer: user buffer + * @count: data length + * @ppos: unused + */ +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; + int limit, batchcount, shared, res; + struct kmem_cache *cachep; + + if (count > MAX_SLABINFO_WRITE) + return -EINVAL; + if (copy_from_user(&kbuf, buffer, 
count)) + return -EFAULT; + kbuf[MAX_SLABINFO_WRITE] = '\0'; + + tmp = strchr(kbuf, ' '); + if (!tmp) + return -EINVAL; + *tmp = '\0'; + tmp++; + if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) + return -EINVAL; + + /* Find the cache in the chain of caches. */ + mutex_lock(&slab_mutex); + res = -EINVAL; + list_for_each_entry(cachep, &slab_caches, list) { + if (!strcmp(cachep->name, kbuf)) { + if (limit < 1 || batchcount < 1 || + batchcount > limit || shared < 0) { + res = 0; + } else { + res = do_tune_cpucache(cachep, limit, + batchcount, shared, + GFP_KERNEL); + } + break; + } + } + mutex_unlock(&slab_mutex); + if (res >= 0) + res = count; + return res; +} + +#ifdef CONFIG_DEBUG_SLAB_LEAK + +static inline int add_caller(unsigned long *n, unsigned long v) +{ + unsigned long *p; + int l; + if (!v) + return 1; + l = n[1]; + p = n + 2; + while (l) { + int i = l/2; + unsigned long *q = p + 2 * i; + if (*q == v) { + q[1]++; + return 1; + } + if (*q > v) { + l = i; + } else { + p = q + 2; + l -= i + 1; + } + } + if (++n[1] == n[0]) + return 0; + memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); + p[0] = v; + p[1] = 1; + return 1; +} + +static void handle_slab(unsigned long *n, struct kmem_cache *c, + struct page *page) +{ + void *p; + int i; + + if (n[0] == n[1]) + return; + for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { + if (get_obj_status(page, i) != OBJECT_ACTIVE) + continue; + + if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + return; + } +} + +static void show_symbol(struct seq_file *m, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + unsigned long offset, size; + char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN]; + + if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { + seq_printf(m, "%s+%#lx/%#lx", name, offset, size); + if (modname[0]) + seq_printf(m, " [%s]", modname); + return; + } +#endif + seq_printf(m, "%p", (void *)address); +} + +static int leaks_show(struct 
seq_file *m, void *p) +{ + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); + struct page *page; + struct kmem_cache_node *n; + const char *name; + unsigned long *x = m->private; + int node; + int i; + + if (!(cachep->flags & SLAB_STORE_USER)) + return 0; + if (!(cachep->flags & SLAB_RED_ZONE)) + return 0; + + /* OK, we can do it */ + + x[1] = 0; + + for_each_kmem_cache_node(cachep, node, n) { + + check_irq_on(); + spin_lock_irq(&n->list_lock); + + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); + spin_unlock_irq(&n->list_lock); + } + name = cachep->name; + if (x[0] == x[1]) { + /* Increase the buffer size */ + mutex_unlock(&slab_mutex); + m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL); + if (!m->private) { + /* Too bad, we are really out */ + m->private = x; + mutex_lock(&slab_mutex); + return -ENOMEM; + } + *(unsigned long *)m->private = x[0] * 2; + kfree(x); + mutex_lock(&slab_mutex); + /* Now make sure this entry will be retried */ + m->count = m->size; + return 0; + } + for (i = 0; i < x[1]; i++) { + seq_printf(m, "%s: %lu ", name, x[2*i+3]); + show_symbol(m, x[2*i+2]); + seq_putc(m, '\n'); + } + + return 0; +} + +static const struct seq_operations slabstats_op = { + .start = slab_start, + .next = slab_next, + .stop = slab_stop, + .show = leaks_show, +}; + +static int slabstats_open(struct inode *inode, struct file *file) +{ + unsigned long *n; + + n = __seq_open_private(file, &slabstats_op, PAGE_SIZE); + if (!n) + return -ENOMEM; + + *n = PAGE_SIZE / (2 * sizeof(unsigned long)); + + return 0; +} + +static const struct file_operations proc_slabstats_operations = { + .open = slabstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +static int __init slab_proc_init(void) +{ +#ifdef CONFIG_DEBUG_SLAB_LEAK + proc_create("slab_allocators", 0, NULL, 
&proc_slabstats_operations); +#endif + return 0; +} +module_init(slab_proc_init); +#endif + +/** + * ksize - get the actual amount of memory allocated for a given object + * @objp: Pointer to the object + * + * kmalloc may internally round up allocations and return more memory + * than requested. ksize() can be used to determine the actual amount of + * memory allocated. The caller may use this additional memory, even though + * a smaller amount of memory was initially specified with the kmalloc call. + * The caller must guarantee that objp points to a valid object previously + * allocated with either kmalloc() or kmem_cache_alloc(). The object + * must not be freed during the duration of the call. + */ +size_t ksize(const void *objp) +{ + BUG_ON(!objp); + if (unlikely(objp == ZERO_SIZE_PTR)) + return 0; + + return virt_to_cache(objp)->object_size; +} +EXPORT_SYMBOL(ksize); diff --git a/kernel/mm/slab.h b/kernel/mm/slab.h new file mode 100644 index 000000000..0c9bda0eb --- /dev/null +++ b/kernel/mm/slab.h @@ -0,0 +1,384 @@ +#ifndef MM_SLAB_H +#define MM_SLAB_H +/* + * Internal slab definitions + */ + +#ifdef CONFIG_SLOB +/* + * Common fields provided in kmem_cache by all slab allocators + * This struct is either used directly by the allocator (SLOB) + * or the allocator must include definitions for all fields + * provided in kmem_cache_common in their definition of kmem_cache. + * + * Once we can do anonymous structs (C11 standard) we could put a + * anonymous struct definition in these allocators so that the + * separate allocations in the kmem_cache structure of SLAB and + * SLUB is no longer needed. 
+ */ +struct kmem_cache { + unsigned int object_size;/* The original size of the object */ + unsigned int size; /* The aligned/padded/added on size */ + unsigned int align; /* Alignment as calculated */ + unsigned long flags; /* Active flags on the slab */ + const char *name; /* Slab name for sysfs */ + int refcount; /* Use counter */ + void (*ctor)(void *); /* Called on object slot creation */ + struct list_head list; /* List of all slab caches on the system */ +}; + +#endif /* CONFIG_SLOB */ + +#ifdef CONFIG_SLAB +#include +#endif + +#ifdef CONFIG_SLUB +#include +#endif + +#include + +/* + * State of the slab allocator. + * + * This is used to describe the states of the allocator during bootup. + * Allocators use this to gradually bootstrap themselves. Most allocators + * have the problem that the structures used for managing slab caches are + * allocated from slab caches themselves. + */ +enum slab_state { + DOWN, /* No slab functionality yet */ + PARTIAL, /* SLUB: kmem_cache_node available */ + PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ + UP, /* Slab caches usable but not all extras yet */ + FULL /* Everything is working */ +}; + +extern enum slab_state slab_state; + +/* The slab cache mutex protects the management structures during changes */ +extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ +extern struct list_head slab_caches; + +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size); + +#ifndef CONFIG_SLOB +/* Kmalloc array related functions */ +void create_kmalloc_caches(unsigned long); + +/* Find the kmalloc slab corresponding for a certain size */ +struct kmem_cache *kmalloc_slab(size_t, gfp_t); +#endif + + +/* Functions provided by the slab allocators */ +extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); + +extern struct kmem_cache 
*create_kmalloc_cache(const char *name, size_t size, + unsigned long flags); +extern void create_boot_cache(struct kmem_cache *, const char *name, + size_t size, unsigned long flags); + +int slab_unmergeable(struct kmem_cache *s); +struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)); +#ifndef CONFIG_SLOB +struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)); + +unsigned long kmem_cache_flags(unsigned long object_size, + unsigned long flags, const char *name, + void (*ctor)(void *)); +#else +static inline struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ return NULL; } + +static inline unsigned long kmem_cache_flags(unsigned long object_size, + unsigned long flags, const char *name, + void (*ctor)(void *)) +{ + return flags; +} +#endif + + +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DEBUG_FREE) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_NOTRACK) +#else +#define SLAB_CACHE_FLAGS (0) +#endif + +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + +int __kmem_cache_shutdown(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *, bool); +void 
slab_kmem_cache_release(struct kmem_cache *); + +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +#ifdef CONFIG_MEMCG_KMEM +/* + * Iterate over all memcg caches of the given root cache. The caller must hold + * slab_mutex. + */ +#define for_each_memcg_cache(iter, root) \ + list_for_each_entry(iter, &(root)->memcg_params.list, \ + memcg_params.list) + +#define for_each_memcg_cache_safe(iter, tmp, root) \ + list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ + memcg_params.list) + +static inline bool is_root_cache(struct kmem_cache *s) +{ + return s->memcg_params.is_root_cache; +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return p == s || p == s->memcg_params.root_cache; +} + +/* + * We use suffixes to the name in memcg because we can't have caches + * created in the system with the same name. But when we print them + * locally, better refer to them with the base name + */ +static inline const char *cache_name(struct kmem_cache *s) +{ + if (!is_root_cache(s)) + s = s->memcg_params.root_cache; + return s->name; +} + +/* + * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. + * That said the caller must assure the memcg's cache won't go away by either + * taking a css reference to the owner cgroup, or holding the slab_mutex. 
+ */ +static inline struct kmem_cache * +cache_from_memcg_idx(struct kmem_cache *s, int idx) +{ + struct kmem_cache *cachep; + struct memcg_cache_array *arr; + + rcu_read_lock(); + arr = rcu_dereference(s->memcg_params.memcg_caches); + + /* + * Make sure we will access the up-to-date value. The code updating + * memcg_caches issues a write barrier to match this (see + * memcg_create_kmem_cache()). + */ + cachep = lockless_dereference(arr->entries[idx]); + rcu_read_unlock(); + + return cachep; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + if (is_root_cache(s)) + return s; + return s->memcg_params.root_cache; +} + +static __always_inline int memcg_charge_slab(struct kmem_cache *s, + gfp_t gfp, int order) +{ + if (!memcg_kmem_enabled()) + return 0; + if (is_root_cache(s)) + return 0; + return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); +} + +static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) +{ + if (!memcg_kmem_enabled()) + return; + if (is_root_cache(s)) + return; + memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); +} + +extern void slab_init_memcg_params(struct kmem_cache *); + +#else /* !CONFIG_MEMCG_KMEM */ + +#define for_each_memcg_cache(iter, root) \ + for ((void)(iter), (void)(root); 0; ) +#define for_each_memcg_cache_safe(iter, tmp, root) \ + for ((void)(iter), (void)(tmp), (void)(root); 0; ) + +static inline bool is_root_cache(struct kmem_cache *s) +{ + return true; +} + +static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) +{ + return true; +} + +static inline const char *cache_name(struct kmem_cache *s) +{ + return s->name; +} + +static inline struct kmem_cache * +cache_from_memcg_idx(struct kmem_cache *s, int idx) +{ + return NULL; +} + +static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) +{ + return s; +} + +static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) +{ + return 0; +} + +static 
inline void memcg_uncharge_slab(struct kmem_cache *s, int order) +{ +} + +static inline void slab_init_memcg_params(struct kmem_cache *s) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +/* + * Return the cache that object @x should be freed to, given that the caller + * believes it belongs to @s. With memcg kmem accounting off and + * SLAB_DEBUG_FREE clear, @s is trusted as is. Otherwise the cache recorded in + * the object's head page is used when slab_equal_or_root() accepts it for @s; + * any other cache is reported and @s is returned. + */ +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + struct page *page; + + /* + * When kmemcg is not being used, both assignments should return the + * same value, but we don't want to pay the assignment price in that + * case. If it is not compiled in, the compiler should be smart enough + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ + if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + return s; + + page = virt_to_head_page(x); + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + /* First %s is the cache the caller passed, second the cache the page records */ + pr_err("%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name); + WARN_ON_ONCE(1); + return s; +} + +#ifndef CONFIG_SLOB +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { +#ifdef CONFIG_SLUB + raw_spinlock_t list_lock; +#else + spinlock_t list_lock; +#endif + +#ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + struct array_cache *shared; /* shared per node */ + struct alien_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +#endif + +#ifdef CONFIG_SLUB + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +#endif + +}; + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. 
The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + +#endif + +void *slab_start(struct seq_file *m, loff_t *pos); +void *slab_next(struct seq_file *m, void *p, loff_t *pos); +void slab_stop(struct seq_file *m, void *p); +int memcg_slab_show(struct seq_file *m, void *p); + +#endif /* MM_SLAB_H */ diff --git a/kernel/mm/slab_common.c b/kernel/mm/slab_common.c new file mode 100644 index 000000000..999bb3424 --- /dev/null +++ b/kernel/mm/slab_common.c @@ -0,0 +1,1171 @@ +/* + * Slab allocator functions that are independent of the allocator strategy + * + * (C) 2012 Christoph Lameter + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include "slab.h" + +enum slab_state slab_state; +LIST_HEAD(slab_caches); +DEFINE_MUTEX(slab_mutex); +struct kmem_cache *kmem_cache; + +/* + * Set of flags that will prevent slab merging + */ +#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_FAILSLAB) + +#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_CACHE_DMA | SLAB_NOTRACK) + +/* + * Merge control. If this is set then no merging of slab caches will occur. + * (Could be removed. This was introduced to pacify the merge skeptics.) 
+ */ +static int slab_nomerge; + +static int __init setup_slab_nomerge(char *str) +{ + slab_nomerge = 1; + return 1; +} + +#ifdef CONFIG_SLUB +__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); +#endif + +__setup("slab_nomerge", setup_slab_nomerge); + +/* + * Determine the size of a slab object + */ +unsigned int kmem_cache_size(struct kmem_cache *s) +{ + return s->object_size; +} +EXPORT_SYMBOL(kmem_cache_size); + +#ifdef CONFIG_DEBUG_VM +static int kmem_cache_sanity_check(const char *name, size_t size) +{ + struct kmem_cache *s = NULL; + + if (!name || in_interrupt() || size < sizeof(void *) || + size > KMALLOC_MAX_SIZE) { + pr_err("kmem_cache_create(%s) integrity check failed\n", name); + return -EINVAL; + } + + list_for_each_entry(s, &slab_caches, list) { + char tmp; + int res; + + /* + * This happens when the module gets unloaded and doesn't + * destroy its slab cache and no-one else reuses the vmalloc + * area of the module. Print a warning. + */ + res = probe_kernel_address(s->name, tmp); + if (res) { + pr_err("Slab cache with size %d has lost its name\n", + s->object_size); + continue; + } + } + + WARN_ON(strchr(name, ' ')); /* It confuses parsers */ + return 0; +} +#else +static inline int kmem_cache_sanity_check(const char *name, size_t size) +{ + return 0; +} +#endif + +#ifdef CONFIG_MEMCG_KMEM +void slab_init_memcg_params(struct kmem_cache *s) +{ + s->memcg_params.is_root_cache = true; + INIT_LIST_HEAD(&s->memcg_params.list); + RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); +} + +static int init_memcg_params(struct kmem_cache *s, + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct memcg_cache_array *arr; + + if (memcg) { + s->memcg_params.is_root_cache = false; + s->memcg_params.memcg = memcg; + s->memcg_params.root_cache = root_cache; + return 0; + } + + slab_init_memcg_params(s); + + if (!memcg_nr_cache_ids) + return 0; + + arr = kzalloc(sizeof(struct memcg_cache_array) + + memcg_nr_cache_ids * sizeof(void 
*), + GFP_KERNEL); + if (!arr) + return -ENOMEM; + + RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr); + return 0; +} + +static void destroy_memcg_params(struct kmem_cache *s) +{ + if (is_root_cache(s)) + kfree(rcu_access_pointer(s->memcg_params.memcg_caches)); +} + +/* + * Give root cache @s a zeroed memcg_caches array of @new_array_size entries, + * copying over the first memcg_nr_cache_ids entries of the old array. The old + * array is released with kfree_rcu(). Non-root caches are left alone. + * Returns 0 on success or -ENOMEM. + */ +static int update_memcg_params(struct kmem_cache *s, int new_array_size) +{ + struct memcg_cache_array *old, *new; + + if (!is_root_cache(s)) + return 0; + + new = kzalloc(sizeof(struct memcg_cache_array) + + new_array_size * sizeof(void *), GFP_KERNEL); + if (!new) + return -ENOMEM; + + old = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + if (old) + memcpy(new->entries, old->entries, + memcg_nr_cache_ids * sizeof(void *)); + + rcu_assign_pointer(s->memcg_params.memcg_caches, new); + if (old) + kfree_rcu(old, rcu); + return 0; +} + +int memcg_update_all_caches(int num_memcgs) +{ + struct kmem_cache *s; + int ret = 0; + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + ret = update_memcg_params(s, num_memcgs); + /* + * Instead of freeing the memory, we'll just leave the caches + * up to this point in an updated state. + */ + if (ret) + break; + } + mutex_unlock(&slab_mutex); + return ret; +} +#else +static inline int init_memcg_params(struct kmem_cache *s, + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + return 0; +} + +static inline void destroy_memcg_params(struct kmem_cache *s) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +/* + * Return 1 if cache @s must not be merged with another cache: merging is + * disabled, @s carries a SLAB_NEVER_MERGE flag, is not a root cache, has a + * constructor, or has a negative refcount. Return 0 otherwise. + */ +int slab_unmergeable(struct kmem_cache *s) +{ + if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE)) + return 1; + + if (!is_root_cache(s)) + return 1; + + if (s->ctor) + return 1; + + /* + * We may have set a slab to be unmergeable during bootstrap. 
+ */ + if (s->refcount < 0) + return 1; + + return 0; +} + +/* + * Find a mergeable slab cache: walk slab_caches in reverse for an existing + * cache that a new cache with the given size, alignment, flags and + * constructor can share. Returns that cache, or NULL if there is none. + */ +struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)) +{ + struct kmem_cache *s; + + if (slab_nomerge || (flags & SLAB_NEVER_MERGE)) + return NULL; + + if (ctor) + return NULL; + + size = ALIGN(size, sizeof(void *)); + align = calculate_alignment(flags, align, size); + size = ALIGN(size, align); + flags = kmem_cache_flags(size, flags, name, NULL); + + list_for_each_entry_reverse(s, &slab_caches, list) { + if (slab_unmergeable(s)) + continue; + + if (size > s->size) + continue; + + if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME)) + continue; + /* + * Check if alignment is compatible. + * Courtesy of Adrian Drzewiecki + */ + if ((s->size & ~(align - 1)) != s->size) + continue; + + if (s->size - size >= sizeof(void *)) + continue; + + if (IS_ENABLED(CONFIG_SLAB) && align && + (align > s->align || s->align % align)) + continue; + + return s; + } + return NULL; +} + +/* + * Figure out what the alignment of the objects will be given a set of + * flags, a user specified alignment and the size of the objects. + */ +unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. + * + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. 
+ */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + if (align < ARCH_SLAB_MINALIGN) + align = ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + +static struct kmem_cache * +do_kmem_cache_create(const char *name, size_t object_size, size_t size, + size_t align, unsigned long flags, void (*ctor)(void *), + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct kmem_cache *s; + int err; + + err = -ENOMEM; + s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); + if (!s) + goto out; + + s->name = name; + s->object_size = object_size; + s->size = size; + s->align = align; + s->ctor = ctor; + + err = init_memcg_params(s, memcg, root_cache); + if (err) + goto out_free_cache; + + err = __kmem_cache_create(s, flags); + if (err) + goto out_free_cache; + + s->refcount = 1; + list_add(&s->list, &slab_caches); +out: + if (err) + return ERR_PTR(err); + return s; + +out_free_cache: + destroy_memcg_params(s); + kmem_cache_free(kmem_cache, s); + goto out; +} + +/* + * kmem_cache_create - Create a cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @ctor: A constructor for the objects. + * + * Returns a ptr to the cache on success, NULL on failure (panics instead if + * %SLAB_PANIC is set in @flags). + * Cannot be called within an interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. 
This can be beneficial if you're counting cycles as closely + * as davem. + */ +struct kmem_cache * +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + struct kmem_cache *s; + const char *cache_name; + int err; + + get_online_cpus(); + get_online_mems(); + memcg_get_cache_ids(); + + mutex_lock(&slab_mutex); + + err = kmem_cache_sanity_check(name, size); + if (err) { + s = NULL; /* suppress uninit var warning */ + goto out_unlock; + } + + /* + * Some allocators will constrain the set of valid flags to a subset + * of all flags. We expect them to define CACHE_CREATE_MASK in this + * case, and we'll just provide them with a sanitized version of the + * passed flags. + */ + flags &= CACHE_CREATE_MASK; + + s = __kmem_cache_alias(name, size, align, flags, ctor); + if (s) + goto out_unlock; + + cache_name = kstrdup_const(name, GFP_KERNEL); + if (!cache_name) { + err = -ENOMEM; + goto out_unlock; + } + + s = do_kmem_cache_create(cache_name, size, size, + calculate_alignment(flags, align, size), + flags, ctor, NULL, NULL); + if (IS_ERR(s)) { + err = PTR_ERR(s); + kfree_const(cache_name); + } + +out_unlock: + mutex_unlock(&slab_mutex); + + memcg_put_cache_ids(); + put_online_mems(); + put_online_cpus(); + + if (err) { + if (flags & SLAB_PANIC) + panic("kmem_cache_create: Failed to create slab '%s'. 
Error %d\n", + name, err); + else { + printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d\n", + name, err); + dump_stack(); + } + return NULL; + } + return s; +} +EXPORT_SYMBOL(kmem_cache_create); + +static int do_kmem_cache_shutdown(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + if (__kmem_cache_shutdown(s) != 0) { + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + return -EBUSY; + } + + if (s->flags & SLAB_DESTROY_BY_RCU) + *need_rcu_barrier = true; + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + list_del(&s->memcg_params.list); +#endif + list_move(&s->list, release); + return 0; +} + +static void do_kmem_cache_release(struct list_head *release, + bool need_rcu_barrier) +{ + struct kmem_cache *s, *s2; + + if (need_rcu_barrier) + rcu_barrier(); + + list_for_each_entry_safe(s, s2, release, list) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + slab_kmem_cache_release(s); +#endif + } +} + +#ifdef CONFIG_MEMCG_KMEM +/* + * memcg_create_kmem_cache - Create a cache for a memory cgroup. + * @memcg: The memory cgroup the new cache is for. + * @root_cache: The parent of the new cache. + * + * This function attempts to create a kmem cache that will serve allocation + * requests going from @memcg to @root_cache. The new cache inherits properties + * from its parent. + */ +void memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) +{ + static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ + struct cgroup_subsys_state *css = mem_cgroup_css(memcg); + struct memcg_cache_array *arr; + struct kmem_cache *s = NULL; + char *cache_name; + int idx; + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + + /* + * The memory cgroup could have been deactivated while the cache + * creation work was pending. 
+ */ + if (!memcg_kmem_is_active(memcg)) + goto out_unlock; + + idx = memcg_cache_id(memcg); + arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (arr->entries[idx]) + goto out_unlock; + + cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); + cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, + css->id, memcg_name_buf); + if (!cache_name) + goto out_unlock; + + s = do_kmem_cache_create(cache_name, root_cache->object_size, + root_cache->size, root_cache->align, + root_cache->flags, root_cache->ctor, + memcg, root_cache); + /* + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. + */ + if (IS_ERR(s)) { + kfree(cache_name); + goto out_unlock; + } + + list_add(&s->memcg_params.list, &root_cache->memcg_params.list); + + /* + * Since readers won't lock (see cache_from_memcg_idx()), we need a + * barrier here to ensure nobody will see the kmem_cache partially + * initialized. 
+ */ + smp_wmb(); + arr->entries[idx] = s; + +out_unlock: + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); +} + +void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_cache_array *arr; + struct kmem_cache *s, *c; + + idx = memcg_cache_id(memcg); + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + arr = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + c = arr->entries[idx]; + if (!c) + continue; + + __kmem_cache_shrink(c, true); + arr->entries[idx] = NULL; + } + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); +} + +void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) +{ + LIST_HEAD(release); + bool need_rcu_barrier = false; + struct kmem_cache *s, *s2; + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + list_for_each_entry_safe(s, s2, &slab_caches, list) { + if (is_root_cache(s) || s->memcg_params.memcg != memcg) + continue; + /* + * The cgroup is about to be freed and therefore has no charges + * left. Hence, all its caches must be empty by now. 
+ */ + BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); + } + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); +} +#endif /* CONFIG_MEMCG_KMEM */ + +void slab_kmem_cache_release(struct kmem_cache *s) +{ + destroy_memcg_params(s); + kfree_const(s->name); + kmem_cache_free(kmem_cache, s); +} + +void kmem_cache_destroy(struct kmem_cache *s) +{ + struct kmem_cache *c, *c2; + LIST_HEAD(release); + bool need_rcu_barrier = false; + bool busy = false; + + BUG_ON(!is_root_cache(s)); + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + + s->refcount--; + if (s->refcount) + goto out_unlock; + + for_each_memcg_cache_safe(c, c2, s) { + if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + busy = true; + } + + if (!busy) + do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); + +out_unlock: + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); +} +EXPORT_SYMBOL(kmem_cache_destroy); + +/** + * kmem_cache_shrink - Shrink a cache. + * @cachep: The cache to shrink. + * + * Releases as many slabs as possible for a cache. + * To help debugging, a zero exit status indicates all slabs were released. 
+ */
+int kmem_cache_shrink(struct kmem_cache *cachep)
+{
+	int ret;
+
+	get_online_cpus();
+	get_online_mems();
+	ret = __kmem_cache_shrink(cachep, false);	/* deactivate == false */
+	put_online_mems();	/* released in reverse order of acquisition */
+	put_online_cpus();
+	return ret;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+/*
+ * slab_is_available - non-zero once slab_state has reached UP.
+ */
+int slab_is_available(void)
+{
+	return slab_state >= UP;
+}
+
+#ifndef CONFIG_SLOB
+/*
+ * Create a cache during boot when no slab services are available yet
+ *
+ * Fills in the caller-provided descriptor @s instead of allocating one and
+ * panics if __kmem_cache_create() fails. The cache is left with a refcount
+ * of -1 and is not added to slab_caches here.
+ */
+void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
+		unsigned long flags)
+{
+	int err;
+
+	s->name = name;
+	s->size = s->object_size = size;
+	s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+	slab_init_memcg_params(s);
+
+	err = __kmem_cache_create(s, flags);
+
+	if (err)
+		panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n",
+					name, size, err);
+
+	s->refcount = -1;	/* Exempt from merging for now */
+}
+/*
+ * create_kmalloc_cache - allocate a descriptor and boot-create a cache in it.
+ *
+ * Panics if the descriptor cannot be allocated. On top of what
+ * create_boot_cache() does, the cache is added to slab_caches and its
+ * refcount is set to 1.
+ */
+struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
+				unsigned long flags)
+{
+	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+
+	if (!s)
+		panic("Out of memory when creating slab %s\n", name);
+
+	create_boot_cache(s, name, size, flags);
+	list_add(&s->list, &slab_caches);
+	s->refcount = 1;	/* replaces the -1 set by create_boot_cache() */
+	return s;
+}
+/* kmalloc caches, indexed by the value kmalloc_slab() computes for a size */
+struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
+EXPORT_SYMBOL(kmalloc_caches);
+
+#ifdef CONFIG_ZONE_DMA
+struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];	/* same indexing, for GFP_DMA requests */
+EXPORT_SYMBOL(kmalloc_dma_caches);
+#endif
+
+/*
+ * Conversion table for small slabs sizes / 8 to the index in the
+ * kmalloc array. This is necessary for slabs < 192 since we have non power
+ * of two cache sizes there. The size of larger slabs can be determined using
+ * fls.
+ */
+static s8 size_index[24] = {	/* entry n serves sizes 8n+1 .. 8n+8 */
+	3,	/* 8 */
+	4,	/* 16 */
+	5,	/* 24 */
+	5,	/* 32 */
+	6,	/* 40 */
+	6,	/* 48 */
+	6,	/* 56 */
+	6,	/* 64 */
+	1,	/* 72 */
+	1,	/* 80 */
+	1,	/* 88 */
+	1,	/* 96 */
+	7,	/* 104 */
+	7,	/* 112 */
+	7,	/* 120 */
+	7,	/* 128 */
+	2,	/* 136 */
+	2,	/* 144 */
+	2,	/* 152 */
+	2,	/* 160 */
+	2,	/* 168 */
+	2,	/* 176 */
+	2,	/* 184 */
+	2	/* 192 */
+};
+/*
+ * Map a size in bytes to its slot in size_index[]. The callers in this file
+ * only pass sizes from 1 to 192.
+ */
+static inline int size_index_elem(size_t bytes)
+{
+	return (bytes - 1) / 8;
+}
+
+/*
+ * Find the kmem_cache structure that serves a given size of
+ * allocation
+ *
+ * Returns NULL for sizes above KMALLOC_MAX_SIZE (warning once unless
+ * __GFP_NOWARN is set) and ZERO_SIZE_PTR for a size of zero. With
+ * CONFIG_ZONE_DMA, requests carrying GFP_DMA are served from
+ * kmalloc_dma_caches instead of kmalloc_caches.
+ */
+struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
+{
+	int index;
+
+	if (unlikely(size > KMALLOC_MAX_SIZE)) {
+		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
+		return NULL;
+	}
+
+	if (size <= 192) {
+		if (!size)
+			return ZERO_SIZE_PTR;
+
+		index = size_index[size_index_elem(size)];
+	} else
+		index = fls(size - 1);	/* power-of-two caches above 192 bytes */
+
+#ifdef CONFIG_ZONE_DMA
+	if (unlikely((flags & GFP_DMA)))
+		return kmalloc_dma_caches[index];
+
+#endif
+	return kmalloc_caches[index];
+}
+
+/*
+ * Create the kmalloc array. Some of the regular kmalloc arrays
+ * may already have been created because they were needed to
+ * enable allocations for slab creation.
+ *
+ * Caches are first created without a name; names are assigned once
+ * slab_state is UP, since building them needs kasprintf().
+ */
+void __init create_kmalloc_caches(unsigned long flags)
+{
+	int i;
+
+	/*
+	 * Patch up the size_index table if we have strange large alignment
+	 * requirements for the kmalloc array. This is only the case for
+	 * MIPS it seems. The standard arches will not generate any code here.
+	 *
+	 * Largest permitted alignment is 256 bytes due to the way we
+	 * handle the index determination for the smaller caches.
+	 *
+	 * Make sure that nothing crazy happens if someone starts tinkering
+	 * around with ARCH_KMALLOC_MINALIGN
+	 */
+	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
+		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
+
+	/* Sizes below the minimum all go to the smallest cache */
+	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
+		int elem = size_index_elem(i);
+
+		if (elem >= ARRAY_SIZE(size_index))
+			break;
+		size_index[elem] = KMALLOC_SHIFT_LOW;
+	}
+
+	if (KMALLOC_MIN_SIZE >= 64) {
+		/*
+		 * The 96 byte size cache is not used if the alignment
+		 * is 64 byte.
+		 */
+		for (i = 64 + 8; i <= 96; i += 8)
+			size_index[size_index_elem(i)] = 7;	/* index 7: 1 << 7 = 128 bytes */
+
+	}
+
+	if (KMALLOC_MIN_SIZE >= 128) {
+		/*
+		 * The 192 byte sized cache is not used if the alignment
+		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
+		 * instead.
+		 */
+		for (i = 128 + 8; i <= 192; i += 8)
+			size_index[size_index_elem(i)] = 8;
+	}
+	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+		if (!kmalloc_caches[i]) {
+			kmalloc_caches[i] = create_kmalloc_cache(NULL,
+							1 << i, flags);
+		}
+
+		/*
+		 * Caches that are not of the two-to-the-power-of size.
+		 * These have to be created immediately after the
+		 * earlier power of two caches
+		 */
+		if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
+			kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);	/* slot 1: 96 bytes */
+
+		if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
+			kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);	/* slot 2: 192 bytes */
+	}
+
+	/* Kmalloc array is now usable */
+	slab_state = UP;
+
+	/* Now that allocation works, give every cache its "kmalloc-<size>" name */
+	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
+		struct kmem_cache *s = kmalloc_caches[i];
+		char *n;
+
+		if (s) {
+			n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
+
+			BUG_ON(!n);
+			s->name = n;
+		}
+	}
+
+#ifdef CONFIG_ZONE_DMA
+	/* One DMA cache per existing regular cache, with the same object size */
+	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
+		struct kmem_cache *s = kmalloc_caches[i];
+
+		if (s) {
+			int size = kmalloc_size(i);
+			char *n = kasprintf(GFP_NOWAIT,
+				 "dma-kmalloc-%d", size);
+
+			BUG_ON(!n);
+			kmalloc_dma_caches[i] = create_kmalloc_cache(n,
+				size, SLAB_CACHE_DMA | flags);
+		}
+	}
+#endif
+}
+#endif /* !CONFIG_SLOB */
+
+/*
+ * To avoid unnecessary overhead, we pass through large allocation requests
+ * directly to the page allocator. We use __GFP_COMP, because we will need to
+ * know the allocation order to free the pages properly in kfree.
+ *
+ * Returns the address of the pages, or NULL if the allocation failed.
+ */
+void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+{
+	void *ret;
+	struct page *page;
+
+	flags |= __GFP_COMP;
+	page = alloc_kmem_pages(flags, order);
+	ret = page ? page_address(page) : NULL;
+	kmemleak_alloc(ret, size, 1, flags);
+	kasan_kmalloc_large(ret, size);
+	return ret;
+}
+EXPORT_SYMBOL(kmalloc_order);
+
+#ifdef CONFIG_TRACING
+void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+{
+	void *ret = kmalloc_order(size, flags, order);
+	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);	/* requested size, then allocated size */
+	return ret;
+}
+EXPORT_SYMBOL(kmalloc_order_trace);
+#endif
+
+#ifdef CONFIG_SLABINFO
+
+#ifdef CONFIG_SLAB
+#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)	/* owner may read and write */
+#else
+#define SLABINFO_RIGHTS S_IRUSR	/* owner may only read */
+#endif
+/*
+ * Emit the /proc/slabinfo banner: a version line followed by a column
+ * header line, with extra columns when CONFIG_DEBUG_SLAB is set.
+ */
+static void print_slabinfo_header(struct seq_file *m)
+{
+	/*
+	 * Output format version, so at least we can change it
+	 * without _too_ many complaints.
+	 *
+	 * NOTE(review): the column-header literals below carry no column
+	 * names; they look truncated and should be checked against the
+	 * upstream source before this text is relied on.
+	 */
+#ifdef CONFIG_DEBUG_SLAB
+	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
+#else
+	seq_puts(m, "slabinfo - version: 2.1\n");
+#endif
+	seq_puts(m, "# name "
+		 " ");
+	seq_puts(m, " : tunables ");
+	seq_puts(m, " : slabdata ");
+#ifdef CONFIG_DEBUG_SLAB
+	seq_puts(m, " : globalstat "
+		 " ");
+	seq_puts(m, " : cpustat ");
+#endif
+	seq_putc(m, '\n');
+}
+/*
+ * seq_file iterator callbacks over slab_caches. slab_start() takes
+ * slab_mutex and slab_stop() drops it, so the list is walked with the
+ * mutex held.
+ */
+void *slab_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&slab_mutex);
+	return seq_list_start(&slab_caches, *pos);
+}
+
+void *slab_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	return seq_list_next(p, &slab_caches, pos);
+}
+
+void slab_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&slab_mutex);
+}
+/*
+ * Add the slab and object counters of every per-memcg child of root cache
+ * @s to @info. Does nothing when @s is not a root cache.
+ */
+static void
+memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
+{
+	struct kmem_cache *c;
+	struct slabinfo sinfo;
+
+	if (!is_root_cache(s))
+		return;
+
+	for_each_memcg_cache(c, s) {
+		memset(&sinfo, 0, sizeof(sinfo));
+		get_slabinfo(c, &sinfo);
+
+		info->active_slabs += sinfo.active_slabs;
+		info->num_slabs += sinfo.num_slabs;
+		info->shared_avail += sinfo.shared_avail;
+		info->active_objs += sinfo.active_objs;
+		info->num_objs += sinfo.num_objs;
+	}
+}
+/*
+ * Print one slabinfo line for @s. For a root cache the counters include
+ * those of its per-memcg children.
+ */
+static void cache_show(struct kmem_cache *s, struct seq_file *m)
+{
+	struct slabinfo sinfo;
+
+	memset(&sinfo, 0, sizeof(sinfo));
+	get_slabinfo(s, &sinfo);
+
+	memcg_accumulate_slabinfo(s, &sinfo);
+
+	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
+		   cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
+		   sinfo.objects_per_slab, (1 << sinfo.cache_order));
+
+	seq_printf(m, " : tunables %4u %4u %4u",
+		   sinfo.limit, sinfo.batchcount, sinfo.shared);
+	seq_printf(m, " : slabdata %6lu %6lu %6lu",
+		   sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
+	slabinfo_show_stats(m, s);
+	seq_putc(m, '\n');
+}
+/*
+ * seq_file show callback for /proc/slabinfo: prints the header before the
+ * first list entry and shows root caches only.
+ */
+static int slab_show(struct seq_file *m, void *p)
+{
+	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+
+	if (p == slab_caches.next)
+		print_slabinfo_header(m);
+	if (is_root_cache(s))
+		cache_show(s, m);
+	return 0;
+}
+/*
+ * Per-cgroup variant of slab_show(): shows only the non-root caches that
+ * belong to the memory cgroup the seq_file was opened for.
+ */
+#ifdef CONFIG_MEMCG_KMEM
+int memcg_slab_show(struct seq_file *m, void *p)
+{
+	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+	if (p == slab_caches.next)
+		print_slabinfo_header(m);
+	if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
+		cache_show(s, m);
+	return 0;
+}
+#endif
+
+/*
+ * slabinfo_op - iterator that generates /proc/slabinfo
+ *
+ * Output layout:
+ * cache-name
+ * num-active-objs
+ * total-objs
+ * object size
+ * num-active-slabs
+ * total-slabs
+ * num-pages-per-slab
+ * + further values on SMP and with statistics enabled
+ */
+static const struct seq_operations slabinfo_op = {
+	.start = slab_start,
+	.next = slab_next,
+	.stop = slab_stop,
+	.show = slab_show,
+};
+
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &slabinfo_op);
+}
+
+static const struct file_operations proc_slabinfo_operations = {
+	.open = slabinfo_open,
+	.read = seq_read,
+	.write = slabinfo_write,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+/* Register the "slabinfo" proc entry with SLABINFO_RIGHTS permissions. */
+static int __init slab_proc_init(void)
+{
+	proc_create("slabinfo", SLABINFO_RIGHTS, NULL,
+						&proc_slabinfo_operations);
+	return 0;
+}
+module_init(slab_proc_init);
+#endif /* CONFIG_SLABINFO */
+/*
+ * __do_krealloc - common body of __krealloc() and krealloc().
+ *
+ * If the existing buffer is already large enough according to ksize(), it
+ * is returned unchanged. Otherwise a new buffer is allocated and the old
+ * buffer's ksize() bytes are copied into it. @p is never freed here.
+ *
+ * Returns NULL if a new buffer was needed and could not be allocated; @p is
+ * left untouched in that case.
+ */
+static __always_inline void *__do_krealloc(const void *p, size_t new_size,
+					   gfp_t flags)
+{
+	void *ret;
+	size_t ks = 0;	/* stays 0 for a NULL @p, forcing a fresh allocation */
+
+	if (p)
+		ks = ksize(p);
+
+	if (ks >= new_size) {
+		kasan_krealloc((void *)p, new_size);
+		return (void *)p;
+	}
+
+	ret = kmalloc_track_caller(new_size, flags);
+	if (ret && p)
+		memcpy(ret, p, ks);	/* ks < new_size here, so the copy fits */
+
+	return ret;
+}
+
+/**
+ * __krealloc - like krealloc() but don't free @p.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * This function is like krealloc() except it never frees the originally
+ * allocated buffer. Use this if you don't want to free the buffer immediately
+ * like, for example, with RCU.
+ *
+ * Returns ZERO_SIZE_PTR when @new_size is 0.
+ */
+void *__krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+	if (unlikely(!new_size))
+		return ZERO_SIZE_PTR;
+
+	return __do_krealloc(p, new_size, flags);
+
+}
+EXPORT_SYMBOL(__krealloc);
+
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes. If @p is %NULL, krealloc()
+ * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
+ * %NULL pointer, the object pointed to is freed.
+ *
+ * If a larger buffer is needed but cannot be allocated, NULL is returned
+ * and @p is not freed.
+ */
+void *krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+	void *ret;
+
+	if (unlikely(!new_size)) {
+		kfree(p);
+		return ZERO_SIZE_PTR;
+	}
+
+	ret = __do_krealloc(p, new_size, flags);
+	if (ret && p != ret)
+		kfree(p);	/* data moved to a new buffer: drop the old one */
+
+	return ret;
+}
+EXPORT_SYMBOL(krealloc);
+
+/**
+ * kzfree - like kfree but zero memory
+ * @p: object to free memory of
+ *
+ * The memory of the object @p points to is zeroed before freed.
+ * If @p is %NULL, kzfree() does nothing.
+ *
+ * Note: this function zeroes the whole allocated buffer which can be a good
+ * deal bigger than the requested buffer size passed to kmalloc(). So be
+ * careful when using this function in performance sensitive code.
+ */
+void kzfree(const void *p)
+{
+	size_t ks;
+	void *mem = (void *)p;	/* drop const: the buffer is about to be overwritten */
+
+	if (unlikely(ZERO_OR_NULL_PTR(mem)))
+		return;	/* nothing to wipe or free */
+	ks = ksize(mem);	/* size reported by the allocator, not the size requested */
+	memset(mem, 0, ks);
+	kfree(mem);
+}
+EXPORT_SYMBOL(kzfree);
+
+/*
+ * Tracepoints definitions.
+ *
+ * Export the allocator tracepoints named below.
+ */
+EXPORT_TRACEPOINT_SYMBOL(kmalloc);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
+EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kfree);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
diff --git a/kernel/mm/slob.c b/kernel/mm/slob.c
new file mode 100644
index 000000000..4765f6501
--- /dev/null
+++ b/kernel/mm/slob.c
@@ -0,0 +1,641 @@
+/*
+ * SLOB Allocator: Simple List Of Blocks
+ *
+ * Matt Mackall 12/30/03
+ *
+ * NUMA support by Paul Mundt, 2007.
+ *
+ * How SLOB works:
+ *
+ * The core of SLOB is a traditional K&R style heap allocator, with
+ * support for returning aligned objects. The granularity of this
+ * allocator is as little as 2 bytes, however typically most architectures
+ * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
+ *
+ * The slob heap is a set of linked list of pages from alloc_pages(),
+ * and within each page, there is a singly-linked list of free blocks
+ * (slob_t). The heap is grown on demand. To reduce fragmentation,
+ * heap pages are segregated into three lists, with objects less than
+ * 256 bytes, objects less than 1024 bytes, and all other objects.
+ *
+ * Allocation from heap involves first searching for a page with
+ * sufficient free blocks (using a next-fit-like approach) followed by
+ * a first-fit scan of the page. Deallocation inserts objects back
+ * into the free list in address order, so this is effectively an
+ * address-ordered first fit.
+ * + * Above this is an implementation of kmalloc/kfree. Blocks returned + * from kmalloc are prepended with a 4-byte header with the kmalloc size. + * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls + * alloc_pages() directly, allocating compound pages so the page order + * does not have to be separately tracked. + * These objects are detected in kfree() because PageSlab() + * is false for them. + * + * SLAB is emulated on top of SLOB by simply calling constructors and + * destructors for every SLAB allocation. Objects are returned with the + * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which + * case the low-level allocator will fragment blocks to create the proper + * alignment. Again, objects of page-size or greater are allocated by + * calling alloc_pages(). As SLAB objects know their size, no separate + * size bookkeeping is necessary and there is essentially no allocation + * space overhead, and compound pages aren't needed for multi-page + * allocations. + * + * NUMA support in SLOB is fairly simplistic, pushing most of the real + * logic down to the page allocator, and simply doing the node accounting + * on the upper levels. In the event that a node id is explicitly + * provided, alloc_pages_exact_node() with the specified node id is used + * instead. The common case (or when the node id isn't explicitly provided) + * will default to the current node, as per numa_node_id(). + * + * Node aware pages are still inserted in to the global freelist, and + * these are scanned for by matching against the node id encoded in the + * page flags. As a result, block allocations that can be satisfied from + * the freelist will only be done so on pages residing on the same node, + * in order to prevent random node placement. 
+ */
+
+#include	/* NOTE(review): every bare #include in this list names no header; the names appear to have been lost and must be restored before this can build */
+#include
+
+#include
+#include /* struct reclaim_state */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include "slab.h"
+/*
+ * slob_block has a field 'units', which indicates size of block if +ve,
+ * or offset of next block if -ve (in SLOB_UNITs).
+ *
+ * Free blocks of size 1 unit simply contain the offset of the next block.
+ * Those with larger size contain their size in the first SLOB_UNIT of
+ * memory, and the offset of the next free block in the second SLOB_UNIT.
+ *
+ * The index type is 16 bits wide when PAGE_SIZE <= 32767 * 2 and 32 bits
+ * wide otherwise.
+ */
+#if PAGE_SIZE <= (32767 * 2)
+typedef s16 slobidx_t;
+#else
+typedef s32 slobidx_t;
+#endif
+
+struct slob_block {
+	slobidx_t units;	/* size, or negated next offset; see above */
+};
+typedef struct slob_block slob_t;
+
+/*
+ * All partially free slob pages go on these lists.
+ *
+ * The list is picked from the request size: below SLOB_BREAK1 bytes is
+ * "small", below SLOB_BREAK2 bytes is "medium", anything else is "large".
+ */
+#define SLOB_BREAK1 256
+#define SLOB_BREAK2 1024
+static LIST_HEAD(free_slob_small);
+static LIST_HEAD(free_slob_medium);
+static LIST_HEAD(free_slob_large);
+
+/*
+ * slob_page_free: true for pages on free_slob_pages list.
+ *
+ * The test is the page's SlobFree flag, which set_slob_page_free() and
+ * clear_slob_page_free() keep in step with list membership.
+ */
+static inline int slob_page_free(struct page *sp)
+{
+	return PageSlobFree(sp);
+}
+/* Put @sp on @list and mark it as being on a free list. */
+static void set_slob_page_free(struct page *sp, struct list_head *list)
+{
+	list_add(&sp->lru, list);
+	__SetPageSlobFree(sp);
+}
+/* Take @sp off its free list and clear the mark. */
+static inline void clear_slob_page_free(struct page *sp)
+{
+	list_del(&sp->lru);
+	__ClearPageSlobFree(sp);
+}
+
+#define SLOB_UNIT sizeof(slob_t)	/* allocation granule, in bytes */
+#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT)	/* bytes to granules, rounding up */
+
+/*
+ * struct slob_rcu is inserted at the tail of allocated slob blocks, which
+ * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free
+ * the block using call_rcu.
+ */
+struct slob_rcu {
+	struct rcu_head head;
+	int size;	/* full object size, this footer included */
+};
+
+/*
+ * slob_lock protects all slob allocator structures.
+ */
+static DEFINE_SPINLOCK(slob_lock);
+
+/*
+ * Encode the given size and next info into a free slob block s.
+ */
+static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
+{
+	slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);	/* start of s's page */
+	slobidx_t offset = next - base;	/* next, as a unit offset from the page start */
+
+	if (size > 1) {
+		s[0].units = size;
+		s[1].units = offset;
+	} else
+		s[0].units = -offset;	/* one-unit block: only room for the negated offset */
+}
+
+/*
+ * Return the size of a slob block.
+ *
+ * A units field that is not positive marks a one-unit block.
+ */
+static slobidx_t slob_units(slob_t *s)
+{
+	if (s->units > 0)
+		return s->units;
+	return 1;
+}
+
+/*
+ * Return the next free slob block pointer after this one.
+ *
+ * Decodes what set_slob() stored: the negated offset for a one-unit block,
+ * otherwise the offset kept in the second unit.
+ */
+static slob_t *slob_next(slob_t *s)
+{
+	slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
+	slobidx_t next;
+
+	if (s[0].units < 0)
+		next = -s[0].units;
+	else
+		next = s[1].units;
+	return base+next;
+}
+
+/*
+ * Returns true if s is the last free block in its page.
+ *
+ * The last block is recognised by a page-aligned "next" pointer.
+ */
+static int slob_last(slob_t *s)
+{
+	return !((unsigned long)slob_next(s) & ~PAGE_MASK);
+}
+/*
+ * Allocate pages of the given order and return their address, or NULL on
+ * failure. With CONFIG_NUMA, a node other than NUMA_NO_NODE is passed to
+ * alloc_pages_exact_node(); otherwise alloc_pages() is used.
+ */
+static void *slob_new_pages(gfp_t gfp, int order, int node)
+{
+	void *page;
+
+#ifdef CONFIG_NUMA
+	if (node != NUMA_NO_NODE)
+		page = alloc_pages_exact_node(node, gfp, order);
+	else
+#endif
+		page = alloc_pages(gfp, order);
+
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+/*
+ * Free pages obtained from slob_new_pages(). If the current task has a
+ * reclaim_state, 1 << order is added to its reclaimed_slab count first.
+ */
+static void slob_free_pages(void *b, int order)
+{
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += 1 << order;
+	free_pages((unsigned long)b, order);
+}
+
+/*
+ * Allocate a slob block within a given slob_page sp.
+ *
+ * Walks the page's free list and takes the first block that can hold @size
+ * bytes at the requested alignment, splitting off unused head and tail
+ * space. Returns the block, or NULL if nothing on the page fits. The page
+ * is removed from its free list once it has no free units left. Both call
+ * sites in this file hold slob_lock.
+ */
+static void *slob_page_alloc(struct page *sp, size_t size, int align)
+{
+	slob_t *prev, *cur, *aligned = NULL;
+	int delta = 0, units = SLOB_UNITS(size);
+
+	for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
+		slobidx_t avail = slob_units(cur);
+
+		if (align) {
+			aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+			delta = aligned - cur;	/* units skipped to reach alignment */
+		}
+		if (avail >= units + delta) { /* room enough? */
+			slob_t *next;
+
+			if (delta) { /* need to fragment head to align? */
+				next = slob_next(cur);
+				set_slob(aligned, avail - delta, next);
+				set_slob(cur, delta, aligned);
+				prev = cur;	/* the unaligned head stays on the free list */
+				cur = aligned;
+				avail = slob_units(cur);
+			}
+
+			next = slob_next(cur);
+			if (avail == units) { /* exact fit? unlink. */
+				if (prev)
+					set_slob(prev, slob_units(prev), next);
+				else
+					sp->freelist = next;
+			} else { /* fragment */
+				if (prev)
+					set_slob(prev, slob_units(prev), cur + units);
+				else
+					sp->freelist = cur + units;
+				set_slob(cur + units, avail - units, next);
+			}
+
+			sp->units -= units;
+			if (!sp->units)
+				clear_slob_page_free(sp);
+			return cur;
+		}
+		if (slob_last(cur))
+			return NULL;
+	}
+}
+
+/*
+ * slob_alloc: entry point into the slob allocator.
+ *
+ * Searches the free list that matches @size for a page with room, and
+ * allocates a fresh page when none has. Returns the block, or NULL if a new
+ * page was needed and could not be allocated. __GFP_ZERO is handled here by
+ * clearing @size bytes of the block, after slob_lock has been released.
+ */
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+{
+	struct page *sp;
+	struct list_head *prev;
+	struct list_head *slob_list;
+	slob_t *b = NULL;
+	unsigned long flags;
+
+	if (size < SLOB_BREAK1)
+		slob_list = &free_slob_small;
+	else if (size < SLOB_BREAK2)
+		slob_list = &free_slob_medium;
+	else
+		slob_list = &free_slob_large;
+
+	spin_lock_irqsave(&slob_lock, flags);
+	/* Iterate through each partially free page, try to find room */
+	list_for_each_entry(sp, slob_list, lru) {
+#ifdef CONFIG_NUMA
+		/*
+		 * If there's a node specification, search for a partial
+		 * page with a matching node id in the freelist.
+		 */
+		if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
+			continue;
+#endif
+		/* Enough room on this page? */
+		if (sp->units < SLOB_UNITS(size))
+			continue;
+
+		/* Attempt to alloc */
+		prev = sp->lru.prev;	/* saved first: sp may leave the list below */
+		b = slob_page_alloc(sp, size, align);
+		if (!b)
+			continue;
+
+		/* Improve fragment distribution and reduce our average
+		 * search time by starting our next search here. (see
+		 * Knuth vol 1, sec 2.5, pg 449) */
+		if (prev != slob_list->prev &&
+				slob_list->next != prev->next)
+			list_move_tail(slob_list, prev->next);
+		break;
+	}
+	spin_unlock_irqrestore(&slob_lock, flags);
+
+	/* Not enough space: must allocate a new page */
+	if (!b) {
+		b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);	/* zeroing is done below instead */
+		if (!b)
+			return NULL;
+		sp = virt_to_page(b);
+		__SetPageSlab(sp);
+
+		spin_lock_irqsave(&slob_lock, flags);
+		sp->units = SLOB_UNITS(PAGE_SIZE);
+		sp->freelist = b;
+		INIT_LIST_HEAD(&sp->lru);
+		/* one free block spanning the page, ending at the next page boundary */
+		set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
+		set_slob_page_free(sp, slob_list);
+		b = slob_page_alloc(sp, size, align);
+		BUG_ON(!b);
+		spin_unlock_irqrestore(&slob_lock, flags);
+	}
+	if (unlikely((gfp & __GFP_ZERO) && b))
+		memset(b, 0, size);
+	return b;
+}
+
+/*
+ * slob_free: entry point into the slob allocator.
+ *
+ * Returns the @size bytes at @block to the block's page. Zero-size and NULL
+ * pointers are ignored; a @size of 0 for a real block is a bug. If the free
+ * empties the page, the page goes back to the page allocator. Otherwise the
+ * block is put on the page's free list in address order and merged with
+ * adjacent free neighbours.
+ */
+static void slob_free(void *block, int size)
+{
+	struct page *sp;
+	slob_t *prev, *next, *b = (slob_t *)block;
+	slobidx_t units;
+	unsigned long flags;
+	struct list_head *slob_list;
+
+	if (unlikely(ZERO_OR_NULL_PTR(block)))
+		return;
+	BUG_ON(!size);
+
+	sp = virt_to_page(block);
+	units = SLOB_UNITS(size);
+
+	spin_lock_irqsave(&slob_lock, flags);
+
+	if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
+		/* Go directly to page allocator. Do not pass slob allocator */
+		if (slob_page_free(sp))
+			clear_slob_page_free(sp);
+		spin_unlock_irqrestore(&slob_lock, flags);
+		__ClearPageSlab(sp);
+		page_mapcount_reset(sp);
+		slob_free_pages(b, 0);
+		return;
+	}
+
+	if (!slob_page_free(sp)) {
+		/* This slob page is about to become partially free. Easy! */
+		sp->units = units;
+		sp->freelist = b;
+		/* b is the only free block; its next is the following page boundary */
+		set_slob(b, units,
+			(void *)((unsigned long)(b +
+					SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
+		if (size < SLOB_BREAK1)
+			slob_list = &free_slob_small;
+		else if (size < SLOB_BREAK2)
+			slob_list = &free_slob_medium;
+		else
+			slob_list = &free_slob_large;
+		set_slob_page_free(sp, slob_list);
+		goto out;
+	}
+
+	/*
+	 * Otherwise the page is already partially free, so find reinsertion
+	 * point.
+	 */
+	sp->units += units;
+
+	if (b < (slob_t *)sp->freelist) {
+		/* b becomes the new list head; absorb the old head if adjacent */
+		if (b + units == sp->freelist) {
+			units += slob_units(sp->freelist);
+			sp->freelist = slob_next(sp->freelist);
+		}
+		set_slob(b, units, sp->freelist);
+		sp->freelist = b;
+	} else {
+		prev = sp->freelist;
+		next = slob_next(prev);
+		while (b > next) {
+			prev = next;
+			next = slob_next(prev);
+		}
+
+		/* merge with the following free block if it is adjacent */
+		if (!slob_last(prev) && b + units == next) {
+			units += slob_units(next);
+			set_slob(b, units, slob_next(next));
+		} else
+			set_slob(b, units, next);
+
+		/* merge with the preceding free block if it is adjacent */
+		if (prev + slob_units(prev) == b) {
+			units = slob_units(b) + slob_units(prev);
+			set_slob(prev, units, slob_next(b));
+		} else
+			set_slob(prev, slob_units(prev), b);
+	}
+out:
+	spin_unlock_irqrestore(&slob_lock, flags);
+}
+
+/*
+ * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
+ */ + +static __always_inline void * +__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) +{ + unsigned int *m; + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + void *ret; + + gfp &= gfp_allowed_mask; + + lockdep_trace_alloc(gfp); + + if (size < PAGE_SIZE - align) { + if (!size) + return ZERO_SIZE_PTR; + + m = slob_alloc(size + align, gfp, align, node); + + if (!m) + return NULL; + *m = size; + ret = (void *)m + align; + + trace_kmalloc_node(caller, ret, + size, size + align, gfp, node); + } else { + unsigned int order = get_order(size); + + if (likely(order)) + gfp |= __GFP_COMP; + ret = slob_new_pages(gfp, order, node); + + trace_kmalloc_node(caller, ret, + size, PAGE_SIZE << order, gfp, node); + } + + kmemleak_alloc(ret, size, 1, gfp); + return ret; +} + +void *__kmalloc(size_t size, gfp_t gfp) +{ + return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + +void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) +{ + return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); +} + +#ifdef CONFIG_NUMA +void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, gfp, node, caller); +} +#endif + +void kfree(const void *block) +{ + struct page *sp; + + trace_kfree(_RET_IP_, block); + + if (unlikely(ZERO_OR_NULL_PTR(block))) + return; + kmemleak_free(block); + + sp = virt_to_page(block); + if (PageSlab(sp)) { + int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + unsigned int *m = (unsigned int *)(block - align); + slob_free(m, *m + align); + } else + __free_pages(sp, compound_order(sp)); +} +EXPORT_SYMBOL(kfree); + +/* can't use ksize for kmem_cache_alloc memory, only kmalloc */ +size_t ksize(const void *block) +{ + struct page *sp; + int align; + unsigned int *m; + + BUG_ON(!block); + if (unlikely(block == ZERO_SIZE_PTR)) + return 0; + + sp = virt_to_page(block); + if 
(unlikely(!PageSlab(sp))) + return PAGE_SIZE << compound_order(sp); + + align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + m = (unsigned int *)(block - align); + return SLOB_UNITS(*m) * SLOB_UNIT; +} +EXPORT_SYMBOL(ksize); + +int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) +{ + if (flags & SLAB_DESTROY_BY_RCU) { + /* leave room for rcu footer at the end of object */ + c->size += sizeof(struct slob_rcu); + } + c->flags = flags; + return 0; +} + +static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) +{ + void *b; + + flags &= gfp_allowed_mask; + + lockdep_trace_alloc(flags); + + if (c->size < PAGE_SIZE) { + b = slob_alloc(c->size, flags, c->align, node); + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); + } else { + b = slob_new_pages(flags, get_order(c->size), node); + trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, + PAGE_SIZE << get_order(c->size), + flags, node); + } + + if (b && c->ctor) + c->ctor(b); + + kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); + return b; +} + +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + return slob_alloc_node(cachep, flags, NUMA_NO_NODE); +} +EXPORT_SYMBOL(kmem_cache_alloc); + +#ifdef CONFIG_NUMA +void *__kmalloc_node(size_t size, gfp_t gfp, int node) +{ + return __do_kmalloc_node(size, gfp, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node) +{ + return slob_alloc_node(cachep, gfp, node); +} +EXPORT_SYMBOL(kmem_cache_alloc_node); +#endif + +static void __kmem_cache_free(void *b, int size) +{ + if (size < PAGE_SIZE) + slob_free(b, size); + else + slob_free_pages(b, get_order(size)); +} + +static void kmem_rcu_free(struct rcu_head *head) +{ + struct slob_rcu *slob_rcu = (struct slob_rcu *)head; + void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu)); + + __kmem_cache_free(b, 
slob_rcu->size); +} + +void kmem_cache_free(struct kmem_cache *c, void *b) +{ + kmemleak_free_recursive(b, c->flags); + if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { + struct slob_rcu *slob_rcu; + slob_rcu = b + (c->size - sizeof(struct slob_rcu)); /* RCU footer occupies the last bytes of the object */ + slob_rcu->size = c->size; /* kmem_rcu_free() uses this to step back to the object start */ + call_rcu(&slob_rcu->head, kmem_rcu_free); + } else { + __kmem_cache_free(b, c->size); + } + + trace_kmem_cache_free(_RET_IP_, b); +} +EXPORT_SYMBOL(kmem_cache_free); + +int __kmem_cache_shutdown(struct kmem_cache *c) +{ + /* No way to check for remaining objects */ + return 0; +} + +int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) +{ + return 0; +} + +struct kmem_cache kmem_cache_boot = { + .name = "kmem_cache", + .size = sizeof(struct kmem_cache), + .flags = SLAB_PANIC, + .align = ARCH_KMALLOC_MINALIGN, +}; + +void __init kmem_cache_init(void) +{ + kmem_cache = &kmem_cache_boot; + slab_state = UP; +} + +void __init kmem_cache_init_late(void) +{ + slab_state = FULL; +} diff --git a/kernel/mm/slub.c b/kernel/mm/slub.c new file mode 100644 index 000000000..f657453ad --- /dev/null +++ b/kernel/mm/slub.c @@ -0,0 +1,5400 @@ +/* + * SLUB: A slab allocator that limits cache line use instead of queuing + * objects in per cpu and per node lists. + * + * The allocator synchronizes using per slab locks or atomic operations + * and only uses a centralized lock to manage a pool of partial slabs. + * + * (C) 2007 SGI, Christoph Lameter + * (C) 2011 Linux Foundation, Christoph Lameter + */ + +#include +#include /* struct reclaim_state */ +#include +#include +#include +#include +#include +#include "slab.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +/* + * Lock order: + * 1. slab_mutex (Global Mutex) + * 2. node->list_lock + * 3. 
slab_lock(page) (Only on some arches and for debugging) + * + * slab_mutex + * + * The role of the slab_mutex is to protect the list of all the slabs + * and to synchronize major metadata changes to slab cache structures. + * + * The slab_lock is only used for debugging and on arches that do not + * have the ability to do a cmpxchg_double. It only protects the second + * double word in the page struct, meaning: + * A. page->freelist -> List of free objects in a page + * B. page->counters -> Counters of objects + * C. page->frozen -> frozen state + * + * If a slab is frozen then it is exempt from list management. It is not + * on any list. The processor that froze the slab is the one who can + * perform list operations on the page. Other processors may put objects + * onto the freelist but the processor that froze the slab is the only + * one that can retrieve the objects from the page's freelist. + * + * The list_lock protects the partial and full list on each node and + * the partial slab counter. If taken then no new slabs may be added or + * removed from the lists, nor may the number of partial slabs be modified. + * (Note that the total number of slabs is an atomic value that may be + * modified without taking the list lock). + * + * The list_lock is a centralized lock and thus we avoid taking it as + * much as possible. As long as SLUB does not have to handle partial + * slabs, operations can continue without any centralized lock. For example, + * allocating a long series of objects that fill up slabs does not require + * the list lock. + * Interrupts are disabled during allocation and deallocation in order to + * make the slab allocator safe to use in the context of an irq. In addition + * interrupts are disabled to ensure that the processor does not change + * while handling per_cpu slabs, due to kernel preemption. + * + * SLUB assigns one slab for allocation to each processor. + * Allocations only occur from these slabs called cpu slabs. 
+ * + * Slabs with free elements are kept on a partial list and during regular + * operations no list for full slabs is used. If an object in a full slab is + * freed then the slab will show up again on the partial lists. + * We track full slabs for debugging purposes though because otherwise we + * cannot scan all objects. + * + * Slabs are freed when they become empty. Teardown and setup is + * minimal so we rely on the page allocators per cpu caches for + * fast frees and allocs. + * + * Overloading of page flags that are otherwise used for LRU management. + * + * PageActive The slab is frozen and exempt from list processing. + * This means that the slab is dedicated to a purpose + * such as satisfying allocations for a specific + * processor. Objects may be freed in the slab while + * it is frozen but slab_free will then skip the usual + * list operations. It is up to the processor holding + * the slab to integrate the slab into the slab lists + * when the slab is no longer needed. + * + * One use of this flag is to mark slabs that are + * used for allocations. Then such a slab becomes a cpu + * slab. The cpu slab may be equipped with an additional + * freelist that allows lockless access to + * free objects in addition to the regular freelist + * that requires the slab lock. + * + * PageError Slab requires special handling due to debug + * options set. This moves slab handling out of + * the fast path and disables lockless freelists. + */ + +static inline int kmem_cache_debug(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + return unlikely(s->flags & SLAB_DEBUG_FLAGS); +#else + return 0; +#endif +} + +static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + return !kmem_cache_debug(s); +#else + return false; +#endif +} + +/* + * Issues still to be resolved: + * + * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 
+ * + * - Variable sizing of the per node arrays + */ + +/* Enable to test recovery from slab corruption on boot */ +#undef SLUB_RESILIENCY_TEST + +/* Enable to log cmpxchg failures */ +#undef SLUB_DEBUG_CMPXCHG + +/* + * Minimum number of partial slabs. These will be left on the partial + * lists even if they are empty. kmem_cache_shrink may reclaim them. + */ +#define MIN_PARTIAL 5 + +/* + * Maximum number of desirable partial slabs. + * The existence of more partial slabs makes kmem_cache_shrink + * sort the partial list by the number of objects in use. + */ +#define MAX_PARTIAL 10 + +#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ + SLAB_POISON | SLAB_STORE_USER) + +/* + * Debugging flags that require metadata to be stored in the slab. These get + * disabled when slub_debug=O is used and a cache's min order increases with + * metadata. + */ +#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) + +#define OO_SHIFT 16 /* oo_make() shifts the order left by this; oo_order() shifts it back */ +#define OO_MASK ((1 << OO_SHIFT) - 1) /* oo_objects() masks with this to extract the object count */ +#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ + +/* Internal SLUB flags */ +#define __OBJECT_POISON 0x80000000UL /* Poison object */ +#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ + +#ifdef CONFIG_SMP +static struct notifier_block slab_notifier; +#endif + +/* + * Tracking user of a slab. 
+ */ +#define TRACK_ADDRS_COUNT 16 +struct track { + unsigned long addr; /* Called from address */ +#ifdef CONFIG_STACKTRACE + unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#endif + int cpu; /* Was running on cpu */ + int pid; /* Pid context */ + unsigned long when; /* When did the operation occur */ +}; + +enum track_item { TRACK_ALLOC, TRACK_FREE }; + +#ifdef CONFIG_SYSFS +static int sysfs_slab_add(struct kmem_cache *); +static int sysfs_slab_alias(struct kmem_cache *, const char *); +static void memcg_propagate_slab_attrs(struct kmem_cache *s); +#else +static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } +static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) + { return 0; } +static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } +#endif + +static inline void stat(const struct kmem_cache *s, enum stat_item si) +{ +#ifdef CONFIG_SLUB_STATS + /* + * The rmw is racy on a preemptible kernel but this is acceptable, so + * avoid this_cpu_add()'s irq-disable overhead. 
+ */ + raw_cpu_inc(s->cpu_slab->stat[si]); +#endif +} + +/******************************************************************** + * Core slab cache functions + *******************************************************************/ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, const void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + +static inline void *get_freepointer(struct kmem_cache *s, void *object) +{ + return *(void **)(object + s->offset); +} + +static void prefetch_freepointer(const struct kmem_cache *s, void *object) +{ + prefetch(object + s->offset); +} + +static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) +{ + void *p; + +#ifdef CONFIG_DEBUG_PAGEALLOC + probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); +#else + p = get_freepointer(s, object); +#endif + return p; +} + +static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) +{ + *(void **)(object + s->offset) = fp; +} + +/* Loop over all objects in a slab; the _idx variant also counts them in __idx, starting at 1 */ +#define for_each_object(__p, __s, __addr, __objects) \ + for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ + __p += (__s)->size) + +#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ + for (__p = (__addr), __idx = 1; __idx <= (__objects);\ + __p += (__s)->size, __idx++) + +/* Determine object index from a given position */ +static inline int slab_index(void *p, struct kmem_cache *s, void *addr) +{ + return (p - addr) / s->size; +} + +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. 
+ */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; + +#endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + +static inline int order_objects(int order, unsigned long size, int reserved) +{ + return ((PAGE_SIZE << order) - reserved) / size; +} + +static inline struct kmem_cache_order_objects oo_make(int order, + unsigned long size, int reserved) +{ + struct kmem_cache_order_objects x = { + (order << OO_SHIFT) + order_objects(order, size, reserved) + }; + + return x; +} + +static inline int oo_order(struct kmem_cache_order_objects x) +{ + return x.x >> OO_SHIFT; +} + +static inline int oo_objects(struct kmem_cache_order_objects x) +{ + return x.x & OO_MASK; +} + +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + __bit_spin_unlock(PG_locked, &page->flags); +} + +static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) +{ + struct page tmp; + tmp.counters = counters_new; + /* + * page->counters can cover frozen/inuse/objects as well + * as page->_count. If we assign to ->counters directly + * we run the risk of losing updates to page->_count, so + * be careful and only assign to the fields we need. 
+ */ + page->frozen = tmp.frozen; + page->inuse = tmp.inuse; + page->objects = tmp.objects; +} + +/* Interrupts must be disabled (for the fallback code to work right) */ +static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ + VM_BUG_ON(!irqs_disabled()); +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, &page->counters, + freelist_old, counters_old, + freelist_new, counters_new)) + return true; + } else +#endif + { + slab_lock(page); + if (page->freelist == freelist_old && + page->counters == counters_old) { + page->freelist = freelist_new; + set_page_slub_counters(page, counters_new); + slab_unlock(page); + return true; + } + slab_unlock(page); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + pr_info("%s %s: cmpxchg double redo ", n, s->name); +#endif + + return false; +} + +static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, &page->counters, + freelist_old, counters_old, + freelist_new, counters_new)) + return true; + } else +#endif + { + unsigned long flags; + + local_irq_save(flags); + slab_lock(page); + if (page->freelist == freelist_old && + page->counters == counters_old) { + page->freelist = freelist_new; + set_page_slub_counters(page, counters_new); + slab_unlock(page); + local_irq_restore(flags); + return true; + } + slab_unlock(page); + local_irq_restore(flags); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + pr_info("%s 
%s: cmpxchg double redo ", n, s->name); +#endif + + return false; +} + +#ifdef CONFIG_SLUB_DEBUG +/* + * Determine a map of object in use on a page. + * + * Node listlock must be held to guarantee that the page does + * not vanish from under us. + */ +static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) +{ + void *p; + void *addr = page_address(page); + + for (p = page->freelist; p; p = get_freepointer(s, p)) + set_bit(slab_index(p, s, addr), map); +} + +/* + * Debug settings: + */ +#ifdef CONFIG_SLUB_DEBUG_ON +static int slub_debug = DEBUG_DEFAULT_FLAGS; +#else +static int slub_debug; +#endif + +static char *slub_debug_slabs; +static int disable_higher_order_debug; + +/* + * slub is about to manipulate internal object metadata. This memory lies + * outside the range of the allocated object, so accessing it would normally + * be reported by kasan as a bounds error. metadata_access_enable() is used + * to tell kasan that these accesses are OK. + */ +static inline void metadata_access_enable(void) +{ + kasan_disable_current(); +} + +static inline void metadata_access_disable(void) +{ + kasan_enable_current(); +} + +/* + * Object debugging + */ +static void print_section(char *text, u8 *addr, unsigned int length) +{ + metadata_access_enable(); + print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, + length, 1); + metadata_access_disable(); +} + +static struct track *get_track(struct kmem_cache *s, void *object, + enum track_item alloc) +{ + struct track *p; + + if (s->offset) + p = object + s->offset + sizeof(void *); + else + p = object + s->inuse; + + return p + alloc; +} + +static void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr) +{ + struct track *p = get_track(s, object, alloc); + + if (addr) { +#ifdef CONFIG_STACKTRACE + struct stack_trace trace; + int i; + + trace.nr_entries = 0; + trace.max_entries = TRACK_ADDRS_COUNT; + trace.entries = p->addrs; + trace.skip = 3; + 
metadata_access_enable(); + save_stack_trace(&trace); + metadata_access_disable(); + + /* See rant in lockdep.c */ + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries - 1] == ULONG_MAX) + trace.nr_entries--; + + for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) + p->addrs[i] = 0; +#endif + p->addr = addr; + p->cpu = smp_processor_id(); + p->pid = current->pid; + p->when = jiffies; + } else + memset(p, 0, sizeof(struct track)); +} + +static void init_tracking(struct kmem_cache *s, void *object) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + set_track(s, object, TRACK_FREE, 0UL); + set_track(s, object, TRACK_ALLOC, 0UL); +} + +static void print_track(const char *s, struct track *t) +{ + if (!t->addr) + return; + + pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); +#ifdef CONFIG_STACKTRACE + { + int i; + for (i = 0; i < TRACK_ADDRS_COUNT; i++) + if (t->addrs[i]) + pr_err("\t%pS\n", (void *)t->addrs[i]); + else + break; + } +#endif +} + +static void print_tracking(struct kmem_cache *s, void *object) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + print_track("Allocated", get_track(s, object, TRACK_ALLOC)); + print_track("Freed", get_track(s, object, TRACK_FREE)); +} + +static void print_page_info(struct page *page) +{ + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + page, page->objects, page->inuse, page->freelist, page->flags); + +} + +static void slab_bug(struct kmem_cache *s, char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("=============================================================================\n"); + pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("-----------------------------------------------------------------------------\n\n"); + + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + va_end(args); +} + +static void slab_fix(struct kmem_cache *s, char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("FIX %s: %pV\n", s->name, &vaf); + va_end(args); +} + +static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) +{ + unsigned int off; /* Offset of last byte */ + u8 *addr = page_address(page); + + print_tracking(s, p); + + print_page_info(page); + + pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); + + if (p > addr + 16) + print_section("Bytes b4 ", p - 16, 16); + + print_section("Object ", p, min_t(unsigned long, s->object_size, + PAGE_SIZE)); + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p + s->object_size, + s->inuse - s->object_size); + + if (s->offset) + off = s->offset + sizeof(void *); + else + off = s->inuse; + + if (s->flags & SLAB_STORE_USER) + off += 2 * sizeof(struct track); + + if (off != s->size) + /* Beginning of the filler is the free pointer */ + print_section("Padding ", p + off, s->size - off); + + dump_stack(); +} + +void object_err(struct kmem_cache *s, struct page *page, + u8 *object, char *reason) +{ + slab_bug(s, "%s", reason); + print_trailer(s, page, object); +} + +static void slab_err(struct kmem_cache *s, struct page *page, + const char *fmt, ...) 
+{ + va_list args; + char buf[100]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + slab_bug(s, "%s", buf); + print_page_info(page); + dump_stack(); +} + +static void init_object(struct kmem_cache *s, void *object, u8 val) +{ + u8 *p = object; + + if (s->flags & __OBJECT_POISON) { + memset(p, POISON_FREE, s->object_size - 1); + p[s->object_size - 1] = POISON_END; + } + + if (s->flags & SLAB_RED_ZONE) + memset(p + s->object_size, val, s->inuse - s->object_size); +} + +static void restore_bytes(struct kmem_cache *s, char *message, u8 data, + void *from, void *to) +{ + slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); + memset(from, data, to - from); +} + +static int check_bytes_and_report(struct kmem_cache *s, struct page *page, + u8 *object, char *what, + u8 *start, unsigned int value, unsigned int bytes) +{ + u8 *fault; + u8 *end; + + metadata_access_enable(); + fault = memchr_inv(start, value, bytes); + metadata_access_disable(); + if (!fault) + return 1; + + end = start + bytes; + while (end > fault && end[-1] == value) + end--; + + slab_bug(s, "%s overwritten", what); + pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + fault, end - 1, fault[0], value); + print_trailer(s, page, object); + + restore_bytes(s, what, value, fault, end); + return 0; +} + +/* + * Object layout: + * + * object address + * Bytes of the object to be managed. + * If the freepointer may overlay the object then the free + * pointer is the first word of the object. + * + * Poisoning uses 0x6b (POISON_FREE) and the last byte is + * 0xa5 (POISON_END) + * + * object + s->object_size + * Padding to reach word boundary. This is also used for Redzoning. + * Padding is extended by another word if Redzoning is enabled and + * object_size == inuse. + * + * We fill with 0xbb (RED_INACTIVE) for inactive objects and with + * 0xcc (RED_ACTIVE) for objects in use. + * + * object + s->inuse + * Meta data starts here. + * + * A. 
Free pointer (if we cannot overwrite object on free) + * B. Tracking data for SLAB_STORE_USER + * C. Padding to reach required alignment boundary or at mininum + * one word if debugging is on to be able to detect writes + * before the word boundary. + * + * Padding is done using 0x5a (POISON_INUSE) + * + * object + s->size + * Nothing is used beyond s->size. + * + * If slabcaches are merged then the object_size and inuse boundaries are mostly + * ignored. And therefore no slab options that rely on these boundaries + * may be used with merged slabcaches. + */ + +static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) +{ + unsigned long off = s->inuse; /* The end of info */ + + if (s->offset) + /* Freepointer is placed after the object. */ + off += sizeof(void *); + + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); + + if (s->size == off) + return 1; + + return check_bytes_and_report(s, page, p, "Object padding", + p + off, POISON_INUSE, s->size - off); +} + +/* Check the pad bytes at the end of a slab page */ +static int slab_pad_check(struct kmem_cache *s, struct page *page) +{ + u8 *start; + u8 *fault; + u8 *end; + int length; + int remainder; + + if (!(s->flags & SLAB_POISON)) + return 1; + + start = page_address(page); + length = (PAGE_SIZE << compound_order(page)) - s->reserved; + end = start + length; + remainder = length % s->size; + if (!remainder) + return 1; + + metadata_access_enable(); + fault = memchr_inv(end - remainder, POISON_INUSE, remainder); + metadata_access_disable(); + if (!fault) + return 1; + while (end > fault && end[-1] == POISON_INUSE) + end--; + + slab_err(s, page, "Padding overwritten. 
0x%p-0x%p", fault, end - 1); + print_section("Padding ", end - remainder, remainder); + + restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); + return 0; +} + +static int check_object(struct kmem_cache *s, struct page *page, + void *object, u8 val) +{ + u8 *p = object; + u8 *endobject = object + s->object_size; + + if (s->flags & SLAB_RED_ZONE) { + if (!check_bytes_and_report(s, page, object, "Redzone", + endobject, val, s->inuse - s->object_size)) + return 0; + } else { + if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { + check_bytes_and_report(s, page, p, "Alignment padding", + endobject, POISON_INUSE, + s->inuse - s->object_size); + } + } + + if (s->flags & SLAB_POISON) { + if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && + (!check_bytes_and_report(s, page, p, "Poison", p, + POISON_FREE, s->object_size - 1) || + !check_bytes_and_report(s, page, p, "Poison", + p + s->object_size - 1, POISON_END, 1))) + return 0; + /* + * check_pad_bytes cleans up on its own. + */ + check_pad_bytes(s, page, p); + } + + if (!s->offset && val == SLUB_RED_ACTIVE) + /* + * Object and freepointer overlap. Cannot check + * freepointer while object is allocated. + */ + return 1; + + /* Check free pointer validity */ + if (!check_valid_pointer(s, page, get_freepointer(s, p))) { + object_err(s, page, p, "Freepointer corrupt"); + /* + * No choice but to zap it and thus lose the remainder + * of the free objects in this slab. May cause + * another error because the object count is now wrong. 
+ */ + set_freepointer(s, p, NULL); + return 0; + } + return 1; +} + +static int check_slab(struct kmem_cache *s, struct page *page) +{ + int maxobj; + + VM_BUG_ON(!irqs_disabled()); + + if (!PageSlab(page)) { + slab_err(s, page, "Not a valid slab page"); + return 0; + } + + maxobj = order_objects(compound_order(page), s->size, s->reserved); + if (page->objects > maxobj) { + slab_err(s, page, "objects %u > max %u", + page->objects, maxobj); + return 0; + } + if (page->inuse > page->objects) { + slab_err(s, page, "inuse %u > max %u", + page->inuse, page->objects); + return 0; + } + /* Slab_pad_check fixes things up after itself */ + slab_pad_check(s, page); + return 1; +} + +/* + * Determine if a certain object on a page is on the freelist. Must hold the + * slab lock to guarantee that the chains are in a consistent state. + */ +static int on_freelist(struct kmem_cache *s, struct page *page, void *search) +{ + int nr = 0; + void *fp; + void *object = NULL; + int max_objects; + + fp = page->freelist; + while (fp && nr <= page->objects) { + if (fp == search) + return 1; + if (!check_valid_pointer(s, page, fp)) { + if (object) { + object_err(s, page, object, + "Freechain corrupt"); + set_freepointer(s, object, NULL); + } else { + slab_err(s, page, "Freepointer corrupt"); + page->freelist = NULL; + page->inuse = page->objects; + slab_fix(s, "Freelist cleared"); + return 0; + } + break; + } + object = fp; + fp = get_freepointer(s, object); + nr++; + } + + max_objects = order_objects(compound_order(page), s->size, s->reserved); + if (max_objects > MAX_OBJS_PER_PAGE) + max_objects = MAX_OBJS_PER_PAGE; + + if (page->objects != max_objects) { + slab_err(s, page, "Wrong number of objects. Found %d but " + "should be %d", page->objects, max_objects); + page->objects = max_objects; + slab_fix(s, "Number of objects adjusted."); + } + if (page->inuse != page->objects - nr) { + slab_err(s, page, "Wrong object count. 
Counter is %d but " + "counted were %d", page->inuse, page->objects - nr); + page->inuse = page->objects - nr; + slab_fix(s, "Object count adjusted."); + } + return search == NULL; +} + +static void trace(struct kmem_cache *s, struct page *page, void *object, + int alloc) +{ + if (s->flags & SLAB_TRACE) { + pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + s->name, + alloc ? "alloc" : "free", + object, page->inuse, + page->freelist); + + if (!alloc) + print_section("Object ", (void *)object, + s->object_size); + + dump_stack(); + } +} + +/* + * Tracking of fully allocated slabs for debugging purposes. + */ +static void add_full(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + lockdep_assert_held(&n->list_lock); + list_add(&page->lru, &n->full); +} + +static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + lockdep_assert_held(&n->list_lock); + list_del(&page->lru); +} + +/* Tracking of the number of slabs for debugging purposes */ +static inline unsigned long slabs_node(struct kmem_cache *s, int node) +{ + struct kmem_cache_node *n = get_node(s, node); + + return atomic_long_read(&n->nr_slabs); +} + +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->nr_slabs); +} + +static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + /* + * May be called early in order to allocate a slab for the + * kmem_cache_node structure. Solve the chicken-egg + * dilemma by deferring the increment of the count during + * bootstrap (see early_kmem_cache_node_alloc). 
+ */ + if (likely(n)) { + atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); + } +} +static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + atomic_long_dec(&n->nr_slabs); + atomic_long_sub(objects, &n->total_objects); +} + +/* Object debug checks for alloc/free paths */ +static void setup_object_debug(struct kmem_cache *s, struct page *page, + void *object) +{ + if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) + return; + + init_object(s, object, SLUB_RED_INACTIVE); + init_tracking(s, object); +} + +static noinline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, + void *object, unsigned long addr) +{ + if (!check_slab(s, page)) + goto bad; + + if (!check_valid_pointer(s, page, object)) { + object_err(s, page, object, "Freelist Pointer check fails"); + goto bad; + } + + if (!check_object(s, page, object, SLUB_RED_INACTIVE)) + goto bad; + + /* Success perform special debug activities for allocs */ + if (s->flags & SLAB_STORE_USER) + set_track(s, object, TRACK_ALLOC, addr); + trace(s, page, object, 1); + init_object(s, object, SLUB_RED_ACTIVE); + return 1; + +bad: + if (PageSlab(page)) { + /* + * If this is a slab page then lets do the best we can + * to avoid issues in the future. Marking all objects + * as used avoids touching the remaining objects. 
+ */ + slab_fix(s, "Marking all objects used"); + page->inuse = page->objects; + page->freelist = NULL; + } + return 0; +} + +static noinline struct kmem_cache_node *free_debug_processing( + struct kmem_cache *s, struct page *page, void *object, + unsigned long addr, unsigned long *flags) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + raw_spin_lock_irqsave(&n->list_lock, *flags); + slab_lock(page); + + if (!check_slab(s, page)) + goto fail; + + if (!check_valid_pointer(s, page, object)) { + slab_err(s, page, "Invalid object pointer 0x%p", object); + goto fail; + } + + if (on_freelist(s, page, object)) { + object_err(s, page, object, "Object already free"); + goto fail; + } + + if (!check_object(s, page, object, SLUB_RED_ACTIVE)) + goto out; + + if (unlikely(s != page->slab_cache)) { + if (!PageSlab(page)) { + slab_err(s, page, "Attempt to free object(0x%p) " + "outside of slab", object); + } else if (!page->slab_cache) { + pr_err("SLUB : no slab for object 0x%p.\n", + object); + dump_stack(); + } else + object_err(s, page, object, + "page slab pointer corrupt."); + goto fail; + } + + if (s->flags & SLAB_STORE_USER) + set_track(s, object, TRACK_FREE, addr); + trace(s, page, object, 0); + init_object(s, object, SLUB_RED_INACTIVE); +out: + slab_unlock(page); + /* + * Keep node_lock to preserve integrity + * until the object is actually freed + */ + return n; + +fail: + slab_unlock(page); + raw_spin_unlock_irqrestore(&n->list_lock, *flags); + slab_fix(s, "Object at 0x%p not freed", object); + return NULL; +} + +static int __init setup_slub_debug(char *str) +{ + slub_debug = DEBUG_DEFAULT_FLAGS; + if (*str++ != '=' || !*str) + /* + * No options specified. Switch on full debugging. + */ + goto out; + + if (*str == ',') + /* + * No options but restriction on slabs. This means full + * debugging for slabs matching a pattern. + */ + goto check_slabs; + + slub_debug = 0; + if (*str == '-') + /* + * Switch off all debugging measures. 
+		 */
+		goto out;
+
+	/*
+	 * Determine which debug features should be switched on.
+	 * Letters are matched case-insensitively (tolower()); scanning stops
+	 * at the end of the string or at the ',' that introduces the slab
+	 * name filter.
+	 */
+	for (; *str && *str != ','; str++) {
+		switch (tolower(*str)) {
+		case 'f':
+			slub_debug |= SLAB_DEBUG_FREE;
+			break;
+		case 'z':
+			slub_debug |= SLAB_RED_ZONE;
+			break;
+		case 'p':
+			slub_debug |= SLAB_POISON;
+			break;
+		case 'u':
+			slub_debug |= SLAB_STORE_USER;
+			break;
+		case 't':
+			slub_debug |= SLAB_TRACE;
+			break;
+		case 'a':
+			slub_debug |= SLAB_FAILSLAB;
+			break;
+		case 'o':
+			/*
+			 * Avoid enabling debugging on caches if its minimum
+			 * order would increase as a result.
+			 */
+			disable_higher_order_debug = 1;
+			break;
+		default:
+			/* Unknown letters are reported and otherwise ignored */
+			pr_err("slub_debug option '%c' unknown. skipped\n",
+			       *str);
+		}
+	}
+
+check_slabs:
+	/* Everything after the ',' is remembered as the slab name filter */
+	if (*str == ',')
+		slub_debug_slabs = str + 1;
+out:
+	return 1;
+}
+
+__setup("slub_debug", setup_slub_debug);
+
+/*
+ * Return @flags, with the boot-time slub_debug flags OR-ed in when they
+ * apply to the cache called @name. @object_size and @ctor are not used
+ * by this implementation.
+ */
+unsigned long kmem_cache_flags(unsigned long object_size,
+	unsigned long flags, const char *name,
+	void (*ctor)(void *))
+{
+	/*
+	 * Enable debugging if selected on the kernel commandline.
+	 *
+	 * With a slab filter set, the filter is compared as a prefix of
+	 * @name (strncmp() over the filter's length); a NULL @name never
+	 * matches a filter.
+	 */
+	if (slub_debug && (!slub_debug_slabs || (name &&
+		!strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
+		flags |= slub_debug;
+
+	return flags;
+}
+#else
+/*
+ * Debugging compiled out (see the CONFIG_SLUB_DEBUG #endif below): the
+ * debug entry points become empty stubs returning fixed values.
+ */
+static inline void setup_object_debug(struct kmem_cache *s,
+			struct page *page, void *object) {}
+
+static inline int alloc_debug_processing(struct kmem_cache *s,
+	struct page *page, void *object, unsigned long addr) { return 0; }
+
+static inline struct kmem_cache_node *free_debug_processing(
+	struct kmem_cache *s, struct page *page, void *object,
+	unsigned long addr, unsigned long *flags) { return NULL; }
+
+static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
+			{ return 1; }
+static inline int check_object(struct kmem_cache *s, struct page *page,
+			void *object, u8 val) { return 1; }
+static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
+					struct page *page) {}
+static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
+					struct page *page) {}
+unsigned long kmem_cache_flags(unsigned long object_size,
+	unsigned long flags, const char *name,
+	void (*ctor)(void *))
+{
+	return flags;
+}
+#define slub_debug 0
+
+#define disable_higher_order_debug 0
+
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+							{ return 0; }
+static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
+							{ return 0; }
+static inline void inc_slabs_node(struct kmem_cache *s, int node,
+							int objects) {}
+static inline void dec_slabs_node(struct kmem_cache *s, int node,
+							int objects) {}
+
+#endif /* CONFIG_SLUB_DEBUG */
+
+/*
+ * Per-cpu list of slab pages whose release has been deferred. free_slab()
+ * queues a page here when it is called with interrupts disabled; the list
+ * is later spliced off under @lock and handed to free_delayed().
+ */
+struct slub_free_list {
+	raw_spinlock_t		lock;
+	struct list_head	list;
+};
+static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
+
+/*
+ * Hooks for other subsystems that check memory allocations. In a typical
+ * production configuration these hooks all should produce no code at all.
+ */
+/* Report a large kmalloc to kmemleak and KASAN */
+static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
+{
+	kmemleak_alloc(ptr, size, 1, flags);
+	kasan_kmalloc_large(ptr, size);
+}
+
+/* Report a large kfree to kmemleak and KASAN */
+static inline void kfree_hook(const void *x)
+{
+	kmemleak_free(x);
+	kasan_kfree_large(x);
+}
+
+/*
+ * Run before every slab allocation.
+ *
+ * Masks @flags with gfp_allowed_mask, informs lockdep, and asserts that
+ * sleeping is legal when __GFP_WAIT is set. Returns NULL when
+ * should_failslab() asks for the allocation to fail; otherwise returns
+ * the cache chosen by memcg_kmem_get_cache(), which the caller must use
+ * in place of @s.
+ */
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+						     gfp_t flags)
+{
+	flags &= gfp_allowed_mask;
+	lockdep_trace_alloc(flags);
+	might_sleep_if(flags & __GFP_WAIT);
+
+	if (should_failslab(s->object_size, flags, s->flags))
+		return NULL;
+
+	return memcg_kmem_get_cache(s, flags);
+}
+
+/*
+ * Run after every slab allocation attempt: notifies kmemcheck, kmemleak
+ * and KASAN about @object and drops the cache reference taken through
+ * memcg_kmem_get_cache() in the pre-alloc hook.
+ */
+static inline void slab_post_alloc_hook(struct kmem_cache *s,
+					gfp_t flags, void *object)
+{
+	flags &= gfp_allowed_mask;
+	kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
+	kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
+	memcg_kmem_put_cache(s);
+	kasan_slab_alloc(s, object);
+}
+
+/* Run before an object is returned to its slab */
+static inline void slab_free_hook(struct kmem_cache *s, void *x)
+{
+	kmemleak_free_recursive(x, s->flags);
+
+	/*
+	 * Trouble is that we may no longer disable interrupts in the fast path.
+	 * So in order to make the debug calls that expect irqs to be
+	 * disabled we need to disable interrupts temporarily.
+	 */
+#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+	{
+		unsigned long flags;
+
+		local_irq_save(flags);
+		kmemcheck_slab_free(s, x, s->object_size);
+		debug_check_no_locks_freed(x, s->object_size);
+		local_irq_restore(flags);
+	}
+#endif
+	if (!(s->flags & SLAB_DEBUG_OBJECTS))
+		debug_check_no_obj_freed(x, s->object_size);
+
+	kasan_slab_free(s, x);
+}
+
+/*
+ * Prepare one object of a freshly allocated slab: debug initialisation,
+ * then the cache constructor (if any). The object is unpoisoned for KASAN
+ * only for the duration of the constructor call.
+ */
+static void setup_object(struct kmem_cache *s, struct page *page,
+				void *object)
+{
+	setup_object_debug(s, page, object);
+	if (unlikely(s->ctor)) {
+		kasan_unpoison_object_data(s, object);
+		s->ctor(object);
+		kasan_poison_object_data(s, object);
+	}
+}
+
+/*
+ * Slab allocation and freeing
+ */
+
+/*
+ * Charge the memcg and allocate the pages for one slab of the order
+ * encoded in @oo, on @node or on any node for NUMA_NO_NODE. The charge is
+ * undone if the page allocation fails. Returns the page or NULL.
+ */
+static inline struct page *alloc_slab_page(struct kmem_cache *s,
+		gfp_t flags, int node, struct kmem_cache_order_objects oo)
+{
+	struct page *page;
+	int order = oo_order(oo);
+
+	flags |= __GFP_NOTRACK;
+
+	if (memcg_charge_slab(s, flags, order))
+		return NULL;
+
+	if (node == NUMA_NO_NODE)
+		page = alloc_pages(flags, order);
+	else
+		page = alloc_pages_exact_node(node, flags, order);
+
+	if (!page)
+		memcg_uncharge_slab(s, order);
+
+	return page;
+}
+
+/*
+ * Allocate a slab page and initialise every object in it.
+ *
+ * Interrupts are enabled around the page allocation when __GFP_WAIT is
+ * set and, with CONFIG_PREEMPT_RT_FULL, also whenever system_state is
+ * SYSTEM_RUNNING; they are disabled again at "out". The returned page is
+ * frozen, has all objects accounted as in use, and its freelist points at
+ * the first object. Returns NULL if no page could be allocated.
+ */
+static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+	struct page *page;
+	struct kmem_cache_order_objects oo = s->oo;
+	gfp_t alloc_gfp;
+	void *start, *p;
+	int idx, order;
+	bool enableirqs;
+
+	flags &= gfp_allowed_mask;
+
+	enableirqs = (flags & __GFP_WAIT) != 0;
+#ifdef CONFIG_PREEMPT_RT_FULL
+	enableirqs |= system_state == SYSTEM_RUNNING;
+#endif
+	if (enableirqs)
+		local_irq_enable();
+
+	flags |= s->allocflags;
+
+	/*
+	 * Let the initial higher-order allocation fail under memory pressure
+	 * so we fall-back to the minimum order allocation.
+	 */
+	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
+
+	page = alloc_slab_page(s, alloc_gfp, node, oo);
+	if (unlikely(!page)) {
+		oo = s->min;
+		alloc_gfp = flags;
+		/*
+		 * Allocation may have failed due to fragmentation.
+		 * Try a lower order alloc if possible
+		 */
+		page = alloc_slab_page(s, alloc_gfp, node, oo);
+		if (unlikely(!page))
+			goto out;
+		stat(s, ORDER_FALLBACK);
+	}
+
+	if (kmemcheck_enabled &&
+	    !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
+		int pages = 1 << oo_order(oo);
+
+		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
+
+		/*
+		 * Objects from caches that have a constructor don't get
+		 * cleared when they're allocated, so we need to do it here.
+		 */
+		if (s->ctor)
+			kmemcheck_mark_uninitialized_pages(page, pages);
+		else
+			kmemcheck_mark_unallocated_pages(page, pages);
+	}
+
+	page->objects = oo_objects(oo);
+
+	order = compound_order(page);
+	page->slab_cache = s;
+	__SetPageSlab(page);
+	if (page->pfmemalloc)
+		SetPageSlabPfmemalloc(page);
+
+	start = page_address(page);
+
+	if (unlikely(s->flags & SLAB_POISON))
+		memset(start, POISON_INUSE, PAGE_SIZE << order);
+
+	kasan_poison_slab(page);
+
+	/*
+	 * Chain the objects into a freelist: each object points to the one
+	 * s->size bytes further on, the last one to NULL.
+	 * NOTE(review): this relies on idx reaching page->objects on the last
+	 * iteration; for_each_object_idx() is defined elsewhere - confirm.
+	 */
+	for_each_object_idx(p, idx, s, start, page->objects) {
+		setup_object(s, page, p);
+		if (likely(idx < page->objects))
+			set_freepointer(s, p, p + s->size);
+		else
+			set_freepointer(s, p, NULL);
+	}
+
+	/* Hand the slab out frozen, with every object counted as in use */
+	page->freelist = start;
+	page->inuse = page->objects;
+	page->frozen = 1;
+
+out:
+	/* Undo the local_irq_enable() done on entry */
+	if (enableirqs)
+		local_irq_disable();
+	if (!page)
+		return NULL;
+
+	mod_zone_page_state(page_zone(page),
+		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
+		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+		1 << oo_order(oo));
+
+	inc_slabs_node(s, page_to_nid(page), page->objects);
+
+	return page;
+}
+
+/*
+ * Allocate a new slab for @s. BUG()s if @flags contains any bit of
+ * GFP_SLAB_BUG_MASK; only the reclaim and constraint bits of @flags are
+ * passed on to allocate_slab().
+ */
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+	if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
+		pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
+		BUG();
+	}
+
+	return allocate_slab(s,
+		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
+}
+
+/*
+ * Give the pages of a slab back to the page allocator: run the debug
+ * checks (debug caches only), undo the zone and reclaim accounting,
+ * clear the slab page flags, free the pages and uncharge the memcg.
+ */
+static void __free_slab(struct kmem_cache *s, struct page *page)
+{
+	int order = compound_order(page);
+	int pages = 1 << order;
+
+	if (kmem_cache_debug(s)) {
+		void *p;
+
+		slab_pad_check(s, page);
+		for_each_object(p, s, page_address(page),
+						page->objects)
+			check_object(s, page, p, SLUB_RED_INACTIVE);
+	}
+
+	kmemcheck_free_shadow(page, compound_order(page));
+
+	mod_zone_page_state(page_zone(page),
+		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
+		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+		-pages);
+
+	__ClearPageSlabPfmemalloc(page);
+	__ClearPageSlab(page);
+
+	page_mapcount_reset(page);
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += pages;
+	__free_pages(page, order);
+	memcg_uncharge_slab(s, order);
+}
+
+/*
+ * Release every slab page queued on @h (linked through page->lru). Each
+ * page is freed on behalf of the cache recorded in page->slab_cache.
+ */
+static void free_delayed(struct list_head *h)
+{
+	while(!list_empty(h)) {
+		struct page *page = list_first_entry(h, struct page, lru);
+
+		list_del(&page->lru);
+		__free_slab(page->slab_cache, page);
+	}
+}
+
+/* True when struct rcu_head does not fit into page->lru */
+#define need_reserve_slab_rcu						\
+	(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
+
+/*
+ * RCU callback for free_slab(): find the page that owns @h (either the
+ * page containing it, or the page whose lru field it overlays) and free it.
+ */
+static void rcu_free_slab(struct rcu_head *h)
+{
+	struct page *page;
+
+	if (need_reserve_slab_rcu)
+		page = virt_to_head_page(h);
+	else
+		page = container_of((struct list_head *)h, struct page, lru);
+
+	__free_slab(page->slab_cache, page);
+}
+
+/*
+ * Free a slab page, choosing between three paths:
+ *  - SLAB_DESTROY_BY_RCU caches: defer through call_rcu();
+ *  - interrupts disabled: queue the page on this cpu's slub_free_list;
+ *  - otherwise: free immediately with __free_slab().
+ */
+static void free_slab(struct kmem_cache *s, struct page *page)
+{
+	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
+		struct rcu_head *head;
+
+		if (need_reserve_slab_rcu) {
+			int order = compound_order(page);
+			int offset = (PAGE_SIZE << order) - s->reserved;
+
+			/* The rcu_head occupies the reserved tail of the slab */
+			VM_BUG_ON(s->reserved != sizeof(*head));
+			head = page_address(page) + offset;
+		} else {
+			/*
+			 * RCU free overloads the RCU head over the LRU
+			 */
+			head = (void *)&page->lru;
+		}
+
+		call_rcu(head, rcu_free_slab);
+	} else if (irqs_disabled()) {
+		/* Defer: queue the page on this cpu's delayed-free list */
+		struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
+
+		raw_spin_lock(&f->lock);
+		list_add(&page->lru, &f->list);
+		raw_spin_unlock(&f->lock);
+	} else
+		__free_slab(s, page);
+}
+
+/* Drop the per-node accounting for @page, then free it */
+static void discard_slab(struct kmem_cache *s, struct page *page)
+{
+	dec_slabs_node(s, page_to_nid(page), page->objects);
+	free_slab(s, page);
+}
+
+/*
+ * Management of partially allocated slabs.
+ *
+ * The double-underscore variants only update the list and nr_partial;
+ * add_partial()/remove_partial() additionally assert (lockdep) that the
+ * node's list_lock is held.
+ */
+static inline void
+__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
+{
+	n->nr_partial++;
+	if (tail == DEACTIVATE_TO_TAIL)
+		list_add_tail(&page->lru, &n->partial);
+	else
+		list_add(&page->lru, &n->partial);
+}
+
+static inline void add_partial(struct kmem_cache_node *n,
+				struct page *page, int tail)
+{
+	lockdep_assert_held(&n->list_lock);
+	__add_partial(n, page, tail);
+}
+
+static inline void
+__remove_partial(struct kmem_cache_node *n, struct page *page)
+{
+	list_del(&page->lru);
+	n->nr_partial--;
+}
+
+static inline void remove_partial(struct kmem_cache_node *n,
+					struct page *page)
+{
+	lockdep_assert_held(&n->list_lock);
+	__remove_partial(n, page);
+}
+
+/*
+ * Remove slab from the partial list, freeze it and
+ * return the pointer to the freelist.
+ *
+ * @mode non-zero takes the whole freelist (the page is left with a NULL
+ * freelist and all objects in use); @mode zero freezes the page but leaves
+ * its freelist in place. *@objects receives the number of free objects
+ * seen in the page.
+ *
+ * Returns a list of objects or NULL if it fails.
+ */
+static inline void *acquire_slab(struct kmem_cache *s,
+		struct kmem_cache_node *n, struct page *page,
+		int mode, int *objects)
+{
+	void *freelist;
+	unsigned long counters;
+	struct page new;
+
+	lockdep_assert_held(&n->list_lock);
+
+	/*
+	 * Zap the freelist and set the frozen bit.
+	 * The old freelist is the list of objects for the
+	 * per cpu allocation list.
+	 */
+	freelist = page->freelist;
+	counters = page->counters;
+	new.counters = counters;
+	*objects = new.objects - new.inuse;
+	if (mode) {
+		new.inuse = page->objects;
+		new.freelist = NULL;
+	} else {
+		new.freelist = freelist;
+	}
+
+	VM_BUG_ON(new.frozen);
+	new.frozen = 1;
+
+	/* Lost the race against a concurrent update of the page: give up */
+	if (!__cmpxchg_double_slab(s, page,
+			freelist, counters,
+			new.freelist, new.counters,
+			"acquire_slab"))
+		return NULL;
+
+	remove_partial(n, page);
+	WARN_ON(!freelist);
+	return freelist;
+}
+
+static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
+
+/*
+ * Try to allocate a partial slab from a specific node.
+ *
+ * The first slab acquired becomes the cpu slab (c->page) and its freelist
+ * is returned; further slabs are parked on the per cpu partial list until
+ * more than s->cpu_partial / 2 free objects have been collected, or
+ * immediately if the cache has no cpu partial support.
+ * Returns NULL if nothing could be acquired.
+ */
+static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
+				struct kmem_cache_cpu *c, gfp_t flags)
+{
+	struct page *page, *page2;
+	void *object = NULL;
+	int available = 0;
+	int objects;
+
+	/*
+	 * Racy check. If we mistakenly see no partial slabs then we
+	 * just allocate an empty slab. If we mistakenly try to get a
+	 * partial slab and there is none available then the loop below
+	 * finds nothing and we return NULL.
+	 */
+	if (!n || !n->nr_partial)
+		return NULL;
+
+	raw_spin_lock(&n->list_lock);
+	list_for_each_entry_safe(page, page2, &n->partial, lru) {
+		void *t;
+
+		if (!pfmemalloc_match(page, flags))
+			continue;
+
+		t = acquire_slab(s, n, page, object == NULL, &objects);
+		if (!t)
+			break;
+
+		available += objects;
+		if (!object) {
+			c->page = page;
+			stat(s, ALLOC_FROM_PARTIAL);
+			object = t;
+		} else {
+			put_cpu_partial(s, page, 0);
+			stat(s, CPU_PARTIAL_NODE);
+		}
+		if (!kmem_cache_has_cpu_partial(s)
+			|| available > s->cpu_partial / 2)
+			break;
+
+	}
+	raw_spin_unlock(&n->list_lock);
+	return object;
+}
+
+/*
+ * Get a page from somewhere. Search in increasing NUMA distances.
+ *
+ * Without CONFIG_NUMA the whole body is compiled out and this function
+ * always returns NULL.
+ */
+static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
+		struct kmem_cache_cpu *c)
+{
+#ifdef CONFIG_NUMA
+	struct zonelist *zonelist;
+	struct zoneref *z;
+	struct zone *zone;
+	enum zone_type high_zoneidx = gfp_zone(flags);
+	void *object;
+	unsigned int cpuset_mems_cookie;
+
+	/*
+	 * The defrag ratio allows a configuration of the tradeoffs between
+	 * inter node defragmentation and node local allocations. A lower
+	 * defrag_ratio increases the tendency to do local allocations
+	 * instead of attempting to obtain partial slabs from other nodes.
+	 *
+	 * If the defrag_ratio is set to 0 then kmalloc() always
+	 * returns node local objects. If the ratio is higher then kmalloc()
+	 * may return off node objects because partial slabs are obtained
+	 * from other nodes and filled up.
+	 *
+	 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
+	 * defrag_ratio = 1000) then almost every allocation will
+	 * first attempt to defrag slab caches on other nodes. This means
+	 * scanning over all nodes to look for partial slabs which may be
+	 * expensive if we do it every time we are trying to find a slab
+	 * with available objects.
+	 */
+	if (!s->remote_node_defrag_ratio ||
+			get_cycles() % 1024 > s->remote_node_defrag_ratio)
+		return NULL;
+
+	do {
+		cpuset_mems_cookie = read_mems_allowed_begin();
+		zonelist = node_zonelist(mempolicy_slab_node(), flags);
+		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+			struct kmem_cache_node *n;
+
+			n = get_node(s, zone_to_nid(zone));
+
+			/* Only raid nodes holding more than min_partial slabs */
+			if (n && cpuset_zone_allowed(zone, flags) &&
+					n->nr_partial > s->min_partial) {
+				object = get_partial_node(s, n, c, flags);
+				if (object) {
+					/*
+					 * Don't check read_mems_allowed_retry()
+					 * here - if mems_allowed was updated in
+					 * parallel, that was a harmless race
+					 * between allocation and the cpuset
+					 * update
+					 */
+					return object;
+				}
+			}
+		}
+	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+#endif
+	return NULL;
+}
+
+/*
+ * Get a partial slab and return its freelist.
+ *
+ * The preferred node is tried first: the local memory node for
+ * NUMA_NO_NODE, or node_to_mem_node(@node) when @node has no present
+ * pages. Other nodes are searched (get_any_partial()) only when the caller
+ * did not ask for a specific node.
+ */
+static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
+		struct kmem_cache_cpu *c)
+{
+	void *object;
+	int searchnode = node;
+
+	if (node == NUMA_NO_NODE)
+		searchnode = numa_mem_id();
+	else if (!node_present_pages(node))
+		searchnode = node_to_mem_node(node);
+
+	object = get_partial_node(s, get_node(s, searchnode), c, flags);
+	if (object || node != NUMA_NO_NODE)
+		return object;
+
+	return get_any_partial(s, flags, c);
+}
+
+#ifdef CONFIG_PREEMPT
+/*
+ * Calculate the next globally unique transaction for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by TID_STEP, i.e. CONFIG_NR_CPUS rounded up to a power of two.
+ */
+#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
+ */
+#define TID_STEP 1
+#endif
+
+/* Transaction id that follows @tid on the same cpu */
+static inline unsigned long next_tid(unsigned long tid)
+{
+	return tid + TID_STEP;
+}
+
+/* Cpu part of a transaction id (always 0 when TID_STEP is 1) */
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+	return tid % TID_STEP;
+}
+
+/* Event counter part of a transaction id */
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+	return tid / TID_STEP;
+}
+
+/* Initial transaction id of @cpu: the cpu number itself */
+static inline unsigned int init_tid(int cpu)
+{
+	return cpu;
+}
+
+/*
+ * Account a failed fastpath cmpxchg. The diagnosis of why it failed (cpu
+ * change, other activity on this cpu, or unknown) is only printed when
+ * SLUB_DEBUG_CMPXCHG is defined; the CMPXCHG_DOUBLE_CPU_FAIL statistic is
+ * always updated.
+ */
+static inline void note_cmpxchg_failure(const char *n,
+		const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef SLUB_DEBUG_CMPXCHG
+	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+	pr_info("%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPT
+	if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+		pr_warn("due to cpu change %d -> %d\n",
+			tid_to_cpu(tid), tid_to_cpu(actual_tid));
+	else
+#endif
+	if (tid_to_event(tid) != tid_to_event(actual_tid))
+		pr_warn("due to cpu running other code. Event %ld->%ld\n",
+			tid_to_event(tid), tid_to_event(actual_tid));
+	else
+		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
+			actual_tid, tid, next_tid(tid));
+#endif
+	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
+}
+
+/* Seed the transaction id of every possible cpu's kmem_cache_cpu */
+static void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+}
+
+/*
+ * Remove the cpu slab
+ *
+ * @freelist is the remaining per cpu list of objects of @page. The
+ * objects are given back to the page and the page is unfrozen; it ends up
+ * on the partial list, on the full list (debug caches only) or is
+ * discarded when it is empty and the node already holds at least
+ * min_partial partial slabs.
+ */
+static void deactivate_slab(struct kmem_cache *s, struct page *page,
+				void *freelist)
+{
+	enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
+	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+	int lock = 0;
+	enum slab_modes l = M_NONE, m = M_NONE;
+	void *nextfree;
+	int tail = DEACTIVATE_TO_HEAD;
+	struct page new;
+	struct page old;
+
+	if (page->freelist) {
+		stat(s, DEACTIVATE_REMOTE_FREES);
+		tail = DEACTIVATE_TO_TAIL;
+	}
+
+	/*
+	 * Stage one: Free all available per cpu objects back
+	 * to the page freelist while it is still frozen. Leave the
+	 * last one.
+	 *
+	 * There is no need to take the list->lock because the page
+	 * is still frozen.
+	 */
+	while (freelist && (nextfree = get_freepointer(s, freelist))) {
+		void *prior;
+		unsigned long counters;
+
+		/* Push one object onto the page freelist, retrying on races */
+		do {
+			prior = page->freelist;
+			counters = page->counters;
+			set_freepointer(s, freelist, prior);
+			new.counters = counters;
+			new.inuse--;
+			VM_BUG_ON(!new.frozen);
+
+		} while (!__cmpxchg_double_slab(s, page,
+			prior, counters,
+			freelist, new.counters,
+			"drain percpu freelist"));
+
+		freelist = nextfree;
+	}
+
+	/*
+	 * Stage two: Ensure that the page is unfrozen while the
+	 * list presence reflects the actual number of objects
+	 * during unfreeze.
+	 *
+	 * We setup the list membership and then perform a cmpxchg
+	 * with the count. If there is a mismatch then the page
+	 * is not unfrozen but the page is on the wrong list.
+	 *
+	 * Then we restart the process which may have to remove
+	 * the page from the list that we just put it on again
+	 * because the number of objects in the slab may have
+	 * changed.
+	 */
+redo:
+
+	old.freelist = page->freelist;
+	old.counters = page->counters;
+	VM_BUG_ON(!old.frozen);
+
+	/* Determine target state of the slab */
+	new.counters = old.counters;
+	if (freelist) {
+		/* Return the last per cpu object left over from stage one */
+		new.inuse--;
+		set_freepointer(s, freelist, old.freelist);
+		new.freelist = freelist;
+	} else
+		new.freelist = old.freelist;
+
+	new.frozen = 0;
+
+	if (!new.inuse && n->nr_partial >= s->min_partial)
+		m = M_FREE;
+	else if (new.freelist) {
+		m = M_PARTIAL;
+		if (!lock) {
+			lock = 1;
+			/*
+			 * Taking the spinlock removes the possibility
+			 * that acquire_slab() will see a slab page that
+			 * is frozen
+			 */
+			raw_spin_lock(&n->list_lock);
+		}
+	} else {
+		m = M_FULL;
+		if (kmem_cache_debug(s) && !lock) {
+			lock = 1;
+			/*
+			 * This also ensures that the scanning of full
+			 * slabs from diagnostic functions will not see
+			 * any frozen slabs.
+			 */
+			raw_spin_lock(&n->list_lock);
+		}
+	}
+
+	/*
+	 * l is the list the page was put on by the previous attempt (M_NONE
+	 * the first time), m the list it should be on now: move it if the
+	 * two differ.
+	 */
+	if (l != m) {
+
+		if (l == M_PARTIAL)
+
+			remove_partial(n, page);
+
+		else if (l == M_FULL)
+
+			remove_full(s, n, page);
+
+		if (m == M_PARTIAL) {
+
+			add_partial(n, page, tail);
+			stat(s, tail);
+
+		} else if (m == M_FULL) {
+
+			stat(s, DEACTIVATE_FULL);
+			add_full(s, n, page);
+
+		}
+	}
+
+	l = m;
+	if (!__cmpxchg_double_slab(s, page,
+				old.freelist, old.counters,
+				new.freelist, new.counters,
+				"unfreezing slab"))
+		goto redo;
+
+	if (lock)
+		raw_spin_unlock(&n->list_lock);
+
+	if (m == M_FREE) {
+		stat(s, DEACTIVATE_EMPTY);
+		discard_slab(s, page);
+		stat(s, FREE_SLAB);
+	}
+}
+
+/*
+ * Unfreeze all the cpu partial slabs.
+ *
+ * Each page is unfrozen and either moved to the tail of its node's
+ * partial list or, when it is empty and the node already has at least
+ * min_partial partial slabs, collected and discarded after the node lock
+ * has been dropped. Without CONFIG_SLUB_CPU_PARTIAL this is a no-op.
+ *
+ * This function must be called with interrupts disabled
+ * for the cpu using c (or some other guarantee must be there
+ * to guarantee no concurrent accesses).
+ */
+static void unfreeze_partials(struct kmem_cache *s,
+		struct kmem_cache_cpu *c)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+	struct kmem_cache_node *n = NULL, *n2 = NULL;
+	struct page *page, *discard_page = NULL;
+
+	while ((page = c->partial)) {
+		struct page new;
+		struct page old;
+
+		c->partial = page->next;
+
+		/* Switch list_lock only when the page's node changes */
+		n2 = get_node(s, page_to_nid(page));
+		if (n != n2) {
+			if (n)
+				raw_spin_unlock(&n->list_lock);
+
+			n = n2;
+			raw_spin_lock(&n->list_lock);
+		}
+
+		do {
+
+			old.freelist = page->freelist;
+			old.counters = page->counters;
+			VM_BUG_ON(!old.frozen);
+
+			new.counters = old.counters;
+			new.freelist = old.freelist;
+
+			new.frozen = 0;
+
+		} while (!__cmpxchg_double_slab(s, page,
+				old.freelist, old.counters,
+				new.freelist, new.counters,
+				"unfreezing slab"));
+
+		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
+			page->next = discard_page;
+			discard_page = page;
+		} else {
+			add_partial(n, page, DEACTIVATE_TO_TAIL);
+			stat(s, FREE_ADD_PARTIAL);
+		}
+	}
+
+	if (n)
+		raw_spin_unlock(&n->list_lock);
+
+	while (discard_page) {
+		page = discard_page;
+		discard_page = discard_page->next;
+
+		stat(s, DEACTIVATE_EMPTY);
+		discard_slab(s, page);
+		stat(s, FREE_SLAB);
+	}
+#endif
+}
+
+/*
+ * Put a page that was just frozen (in __slab_free) into a partial page
+ * slot if available. Interrupts are not disabled around the cmpxchg loop.
+ * NOTE(review): the original wording here said this runs "without
+ * preemption disabled" and that the cmpxchg "may put the partial page onto
+ * a random cpus partial slot", but this implementation brackets the whole
+ * body with preempt_disable()/preempt_enable().
+ *
+ * If @drain is set and the per cpu partial list already holds more than
+ * s->cpu_partial free objects, all the partials are first moved to the
+ * per node partial list.
+ */
+static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+	struct page *oldpage;
+	int pages;
+	int pobjects;
+
+	preempt_disable();
+	do {
+		pages = 0;
+		pobjects = 0;
+		oldpage = this_cpu_read(s->cpu_slab->partial);
+
+		if (oldpage) {
+			pobjects = oldpage->pobjects;
+			pages = oldpage->pages;
+			if (drain && pobjects > s->cpu_partial) {
+				struct slub_free_list *f;
+				unsigned long flags;
+				LIST_HEAD(tofree);
+				/*
+				 * partial array is full. Move the existing
+				 * set to the per node partial list.
+				 *
+				 * Slabs discarded while interrupts are off
+				 * land on this cpu's slub_free_list; splice
+				 * them off and free them once interrupts are
+				 * enabled again.
+				 */
+				local_irq_save(flags);
+				unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+				f = this_cpu_ptr(&slub_free_list);
+				raw_spin_lock(&f->lock);
+				list_splice_init(&f->list, &tofree);
+				raw_spin_unlock(&f->lock);
+				local_irq_restore(flags);
+				free_delayed(&tofree);
+				oldpage = NULL;
+				pobjects = 0;
+				pages = 0;
+				stat(s, CPU_PARTIAL_DRAIN);
+			}
+		}
+
+		/* The new head carries the running totals of the list */
+		pages++;
+		pobjects += page->objects - page->inuse;
+
+		page->pages = pages;
+		page->pobjects = pobjects;
+		page->next = oldpage;
+
+	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
+								!= oldpage);
+	/* Caches with cpu_partial == 0 do not keep per cpu partials at all */
+	if (unlikely(!s->cpu_partial)) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+		local_irq_restore(flags);
+	}
+	preempt_enable();
+#endif
+}
+
+/*
+ * Deactivate the cpu slab of @c and reset @c to the "no slab" state,
+ * advancing its transaction id.
+ */
+static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+{
+	stat(s, CPUSLAB_FLUSH);
+	deactivate_slab(s, c->page, c->freelist);
+
+	c->tid = next_tid(c->tid);
+	c->page = NULL;
+	c->freelist = NULL;
+}
+
+/*
+ * Flush cpu slab.
+ *
+ * Called from IPI handler with interrupts disabled.
+ */
+static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
+{
+	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+
+	if (likely(c)) {
+		if (c->page)
+			flush_slab(s, c);
+
+		unfreeze_partials(s, c);
+	}
+}
+
+/* IPI callback used by flush_all(): flush the cache @d on this cpu */
+static void flush_cpu_slab(void *d)
+{
+	struct kmem_cache *s = d;
+
+	__flush_cpu_slab(s, smp_processor_id());
+}
+
+/* on_each_cpu_cond() predicate: does @cpu hold a cpu slab or partials? */
+static bool has_cpu_slab(int cpu, void *info)
+{
+	struct kmem_cache *s = info;
+	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+
+	return c->page || c->partial;
+}
+
+/*
+ * Flush the cpu slabs and per cpu partial lists of @s on every cpu that
+ * has any, then release the slab pages whose freeing was deferred.
+ */
+static void flush_all(struct kmem_cache *s)
+{
+	LIST_HEAD(tofree);
+	int cpu;
+
+	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
+	/*
+	 * flush_cpu_slab() runs with interrupts disabled, so every slab it
+	 * discards is parked on that cpu's slub_free_list by free_slab()
+	 * rather than freed. Drain the list of every online cpu
+	 * unconditionally: a cpu that was just flushed no longer satisfies
+	 * has_cpu_slab(), so filtering on it here would skip exactly the
+	 * cpus that queued pages and leave them pointing at a cache that the
+	 * caller may be about to destroy.
+	 */
+	for_each_online_cpu(cpu) {
+		struct slub_free_list *f;
+
+		f = &per_cpu(slub_free_list, cpu);
+		raw_spin_lock_irq(&f->lock);
+		list_splice_init(&f->list, &tofree);
+		raw_spin_unlock_irq(&f->lock);
+		free_delayed(&tofree);
+	}
+}
+
+/*
+ * Check if the objects in a per cpu structure fit numa
+ * locality expectations.
+ *
+ * With CONFIG_NUMA: returns 0 if @page is NULL or if a specific @node was
+ * requested and @page lives on a different node. Always returns 1 without
+ * CONFIG_NUMA.
+ */
+static inline int node_match(struct page *page, int node)
+{
+#ifdef CONFIG_NUMA
+	if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
+		return 0;
+#endif
+	return 1;
+}
+
+#ifdef CONFIG_SLUB_DEBUG
+/* Number of free objects in @page */
+static int count_free(struct page *page)
+{
+	return page->objects - page->inuse;
+}
+
+/* Total number of objects accounted to node @n */
+static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
+{
+	return atomic_long_read(&n->total_objects);
+}
+#endif /* CONFIG_SLUB_DEBUG */
+
+#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
+/*
+ * Sum @get_count over every page on the partial list of @n, holding the
+ * node's list_lock with interrupts disabled for the walk.
+ */
+static unsigned long count_partial(struct kmem_cache_node *n,
+					int (*get_count)(struct page *))
+{
+	unsigned long flags;
+	unsigned long x = 0;
+	struct page *page;
+
+	raw_spin_lock_irqsave(&n->list_lock, flags);
+	list_for_each_entry(page, &n->partial, lru)
+		x += get_count(page);
+	raw_spin_unlock_irqrestore(&n->list_lock, flags);
+	return x;
+}
+#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
+
+/*
+ * Print a rate-limited report about a failed slab allocation: the cache
+ * geometry followed by one line of slab/object/free counts per node.
+ * Nothing is printed for __GFP_NOWARN allocations, and the function is
+ * empty without CONFIG_SLUB_DEBUG.
+ */
+static noinline void
+slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
+{
+#ifdef CONFIG_SLUB_DEBUG
+	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	int node;
+	struct kmem_cache_node *n;
+
+	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
+		return;
+
+	pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
+		nid, gfpflags);
+	pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
+		s->name, s->object_size, s->size, oo_order(s->oo),
+		oo_order(s->min));
+
+	if (oo_order(s->min) > get_order(s->object_size))
+		pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
+			s->name);
+
+	for_each_kmem_cache_node(s, node, n) {
+		unsigned long nr_slabs;
+		unsigned long nr_objs;
+		unsigned long nr_free;
+
+		nr_free  = count_partial(n, count_free);
+		nr_slabs = node_nr_slabs(n);
+		nr_objs  = node_nr_objs(n);
+
+		pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
+			node, nr_slabs, nr_objs, nr_free);
+	}
+#endif
+}
+
+/*
+ * Obtain a freelist for the current cpu: first from a partial slab, and
+ * if none is available from a newly allocated slab.
+ *
+ * When a new slab is allocated the kmem_cache_cpu pointer is re-read (the
+ * updated pointer is passed back through *@pc), any cpu slab installed in
+ * the meantime is flushed, and the new page becomes the cpu slab with its
+ * whole freelist handed to the caller. Returns NULL if no slab could be
+ * obtained.
+ */
+static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
+			int node, struct kmem_cache_cpu **pc)
+{
+	void *freelist;
+	struct kmem_cache_cpu *c = *pc;
+	struct page *page;
+
+	freelist = get_partial(s, flags, node, c);
+
+	if (freelist)
+		return freelist;
+
+	page = new_slab(s, flags, node);
+	if (page) {
+		c = raw_cpu_ptr(s->cpu_slab);
+		if (c->page)
+			flush_slab(s, c);
+
+		/*
+		 * No other reference to the page yet so we can
+		 * muck around with it freely without cmpxchg
+		 */
+		freelist = page->freelist;
+		page->freelist = NULL;
+
+		stat(s, ALLOC_SLAB);
+		c->page = page;
+		*pc = c;
+	} else
+		freelist = NULL;
+
+	return freelist;
+}
+
+/*
+ * A page flagged PageSlabPfmemalloc may only serve allocations for which
+ * gfp_pfmemalloc_allowed() is true; any other page matches every request.
+ */
+static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
+{
+	if (unlikely(PageSlabPfmemalloc(page)))
+		return gfp_pfmemalloc_allowed(gfpflags);
+
+	return true;
+}
+
+/*
+ * Check the page->freelist of a page and either transfer the freelist to the
+ * per cpu freelist or deactivate the page.
+ *
+ * The page is still frozen if the return value is not NULL.
+ *
+ * If this function returns NULL then the page has been unfrozen.
+ *
+ * This function must be called with interrupt disabled.
+ */
+static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+{
+	struct page new;
+	unsigned long counters;
+	void *freelist;
+
+	do {
+		freelist = page->freelist;
+		counters = page->counters;
+
+		new.counters = counters;
+		VM_BUG_ON(!new.frozen);
+
+		/* All objects now count as in use; stay frozen only if we got some */
+		new.inuse = page->objects;
+		new.frozen = freelist != NULL;
+
+	} while (!__cmpxchg_double_slab(s, page,
+		freelist, counters,
+		NULL, new.counters,
+		"get_freelist"));
+
+	return freelist;
+}
+
+/*
+ * Slow path. The lockless freelist is empty or we need to perform
+ * debugging duties.
+ *
+ * Processing is still very fast if new objects have been freed to the
+ * regular freelist. In that case we simply take over the regular freelist
+ * as the lockless freelist and zap the regular freelist.
+ *
+ * If that is not working then we fall back to the partial lists. We take the
+ * first element of the freelist as the object to allocate now and move the
+ * rest of the freelist to the lockless freelist.
+ *
+ * And if we were unable to get a new slab from the partial slab lists then
+ * we need to allocate a new slab. This is the slowest path since it involves
+ * a call to the page allocator and the setup of a new slab.
+ *
+ * Runs with interrupts disabled. Every exit goes through "out", where the
+ * slab pages queued on this cpu's slub_free_list are spliced off and freed
+ * once interrupts have been restored.
+ */
+static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+			  unsigned long addr, struct kmem_cache_cpu *c)
+{
+	struct slub_free_list *f;
+	void *freelist;
+	struct page *page;
+	unsigned long flags;
+	LIST_HEAD(tofree);
+
+	local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+	/*
+	 * We may have been preempted and rescheduled on a different
+	 * cpu before disabling interrupts. Need to reload cpu area
+	 * pointer.
+	 */
+	c = this_cpu_ptr(s->cpu_slab);
+#endif
+
+	page = c->page;
+	if (!page)
+		goto new_slab;
+redo:
+
+	if (unlikely(!node_match(page, node))) {
+		int searchnode = node;
+
+		/* A memoryless node is served from its nearest memory node */
+		if (node != NUMA_NO_NODE && !node_present_pages(node))
+			searchnode = node_to_mem_node(node);
+
+		if (unlikely(!node_match(page, searchnode))) {
+			stat(s, ALLOC_NODE_MISMATCH);
+			deactivate_slab(s, page, c->freelist);
+			c->page = NULL;
+			c->freelist = NULL;
+			goto new_slab;
+		}
+	}
+
+	/*
+	 * By rights, we should be searching for a slab page that was
+	 * PFMEMALLOC but right now, we are losing the pfmemalloc
+	 * information when the page leaves the per-cpu allocator
+	 */
+	if (unlikely(!pfmemalloc_match(page, gfpflags))) {
+		deactivate_slab(s, page, c->freelist);
+		c->page = NULL;
+		c->freelist = NULL;
+		goto new_slab;
+	}
+
+	/* must check again c->freelist in case of cpu migration or IRQ */
+	freelist = c->freelist;
+	if (freelist)
+		goto load_freelist;
+
+	freelist = get_freelist(s, page);
+
+	if (!freelist) {
+		c->page = NULL;
+		stat(s, DEACTIVATE_BYPASS);
+		goto new_slab;
+	}
+
+	stat(s, ALLOC_REFILL);
+
+load_freelist:
+	/*
+	 * freelist is pointing to the list of objects to be used.
+	 * page is pointing to the page from which the objects are obtained.
+	 * That page must be frozen for per cpu allocations to work.
+	 */
+	VM_BUG_ON(!c->page->frozen);
+	c->freelist = get_freepointer(s, freelist);
+	c->tid = next_tid(c->tid);
+out:
+	/* Take over the deferred frees; release them with interrupts on */
+	f = this_cpu_ptr(&slub_free_list);
+	raw_spin_lock(&f->lock);
+	list_splice_init(&f->list, &tofree);
+	raw_spin_unlock(&f->lock);
+	local_irq_restore(flags);
+	free_delayed(&tofree);
+	return freelist;
+
+new_slab:
+
+	if (c->partial) {
+		page = c->page = c->partial;
+		c->partial = page->next;
+		stat(s, CPU_PARTIAL_ALLOC);
+		c->freelist = NULL;
+		goto redo;
+	}
+
+	freelist = new_slab_objects(s, gfpflags, node, &c);
+
+	if (unlikely(!freelist)) {
+		slab_out_of_memory(s, gfpflags, node);
+		goto out;
+	}
+
+	page = c->page;
+	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
+		goto load_freelist;
+
+	/*
+	 * Reached for debug caches and for pages that fail
+	 * pfmemalloc_match(): hand out a single object and deactivate the
+	 * slab instead of installing it as the cpu slab.
+	 */
+	if (kmem_cache_debug(s) &&
+			!alloc_debug_processing(s, page, freelist, addr))
+		goto new_slab;	/* Slab failed checks. Next slab needed */
+
+	deactivate_slab(s, page, get_freepointer(s, freelist));
+	c->page = NULL;
+	c->freelist = NULL;
+	goto out;
+}
+
+/*
+ * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
+ * have the fastpath folded into their functions. So no function call
+ * overhead for requests that can be satisfied on the fastpath.
+ *
+ * The fastpath works by first checking if the lockless freelist can be used.
+ * If not then __slab_alloc is called for slow processing.
+ *
+ * Otherwise we can simply pick the next object from the lockless free list.
+ */ +static __always_inline void *slab_alloc_node(struct kmem_cache *s, + gfp_t gfpflags, int node, unsigned long addr) +{ + void **object; + struct kmem_cache_cpu *c; + struct page *page; + unsigned long tid; + + s = slab_pre_alloc_hook(s, gfpflags); + if (!s) + return NULL; +redo: + /* + * Must read kmem_cache cpu data via this cpu ptr. Preemption is + * enabled. We may switch back and forth between cpus while + * reading from one cpu area. That does not matter as long + * as we end up on the original cpu again when doing the cmpxchg. + * + * We should guarantee that tid and kmem_cache are retrieved on + * the same cpu. It could be different if CONFIG_PREEMPT so we need + * to check if it is matched or not. + */ + do { + tid = this_cpu_read(s->cpu_slab->tid); + c = raw_cpu_ptr(s->cpu_slab); + } while (IS_ENABLED(CONFIG_PREEMPT) && + unlikely(tid != READ_ONCE(c->tid))); + + /* + * Irqless object alloc/free algorithm used here depends on sequence + * of fetching cpu_slab's data. tid should be fetched before anything + * on c to guarantee that object and page associated with previous tid + * won't be used with current tid. If we fetch tid first, object and + * page could be one associated with next tid and our alloc/free + * request will be failed. In this case, we will retry. So, no problem. + */ + barrier(); + + /* + * The transaction ids are globally unique per cpu and per operation on + * a per cpu queue. Thus they can be guarantee that the cmpxchg_double + * occurs on the right processor and that there was no operation on the + * linked list in between. + */ + + object = c->freelist; + page = c->page; + if (unlikely(!object || !node_match(page, node))) { + object = __slab_alloc(s, gfpflags, node, addr, c); + stat(s, ALLOC_SLOWPATH); + } else { + void *next_object = get_freepointer_safe(s, object); + + /* + * The cmpxchg will only match if there was no additional + * operation and if we are on the right processor. 
+ * + * The cmpxchg does the following atomically (without lock + * semantics!) + * 1. Relocate first pointer to the current per cpu area. + * 2. Verify that tid and freelist have not been changed + * 3. If they were not changed replace tid and freelist + * + * Since this is without lock semantics the protection is only + * against code executing on this cpu *not* from access by + * other cpus. + */ + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + object, tid, + next_object, next_tid(tid)))) { + + note_cmpxchg_failure("slab_alloc", s, tid); + goto redo; + } + prefetch_freepointer(s, next_object); + stat(s, ALLOC_FASTPATH); + } + + if (unlikely(gfpflags & __GFP_ZERO) && object) + memset(object, 0, s->object_size); + + slab_post_alloc_hook(s, gfpflags, object); + + return object; +} + +static __always_inline void *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, unsigned long addr) +{ + return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); +} + +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +{ + void *ret = slab_alloc(s, gfpflags, _RET_IP_); + + trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, + s->size, gfpflags); + + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc); + +#ifdef CONFIG_TRACING +void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = slab_alloc(s, gfpflags, _RET_IP_); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + kasan_kmalloc(s, ret, size); + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_trace); +#endif + +#ifdef CONFIG_NUMA +void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +{ + void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + + trace_kmem_cache_alloc_node(_RET_IP_, ret, + s->object_size, s->size, gfpflags, node); + + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_node); + +#ifdef CONFIG_TRACING +void *kmem_cache_alloc_node_trace(struct kmem_cache *s, + gfp_t gfpflags, + int node, size_t size) +{ + 
void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, + size, s->size, gfpflags, node); + + kasan_kmalloc(s, ret, size); + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); +#endif +#endif + +/* + * Slow path handling. This may still be called frequently since objects + * have a longer lifetime than the cpu slabs in most processing loads. + * + * So we still attempt to reduce cache line usage. Just take the slab + * lock and free the item. If there is no additional partial page + * handling required then we can return immediately. + */ +static void __slab_free(struct kmem_cache *s, struct page *page, + void *x, unsigned long addr) +{ + void *prior; + void **object = (void *)x; + int was_frozen; + struct page new; + unsigned long counters; + struct kmem_cache_node *n = NULL; + unsigned long uninitialized_var(flags); + + stat(s, FREE_SLOWPATH); + + if (kmem_cache_debug(s) && + !(n = free_debug_processing(s, page, x, addr, &flags))) + return; + + do { + if (unlikely(n)) { + raw_spin_unlock_irqrestore(&n->list_lock, flags); + n = NULL; + } + prior = page->freelist; + counters = page->counters; + set_freepointer(s, object, prior); + new.counters = counters; + was_frozen = new.frozen; + new.inuse--; + if ((!new.inuse || !prior) && !was_frozen) { + + if (kmem_cache_has_cpu_partial(s) && !prior) { + + /* + * Slab was on no list before and will be + * partially empty + * We can defer the list move and instead + * freeze it. + */ + new.frozen = 1; + + } else { /* Needs to be taken off a list */ + + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. 
+ */ + raw_spin_lock_irqsave(&n->list_lock, flags); + + } + } + + } while (!cmpxchg_double_slab(s, page, + prior, counters, + object, new.counters, + "__slab_free")); + + if (likely(!n)) { + + /* + * If we just froze the page then put it onto the + * per cpu partial list. + */ + if (new.frozen && !was_frozen) { + put_cpu_partial(s, page, 1); + stat(s, CPU_PARTIAL_FREE); + } + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + if (was_frozen) + stat(s, FREE_FROZEN); + return; + } + + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) + goto slab_empty; + + /* + * Objects left in the slab. If it was not on the partial list before + * then add it. + */ + if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { + if (kmem_cache_debug(s)) + remove_full(s, n, page); + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + raw_spin_unlock_irqrestore(&n->list_lock, flags); + return; + +slab_empty: + if (prior) { + /* + * Slab on the partial list. + */ + remove_partial(n, page); + stat(s, FREE_REMOVE_PARTIAL); + } else { + /* Slab must be on the full list */ + remove_full(s, n, page); + } + + raw_spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, FREE_SLAB); + discard_slab(s, page); +} + +/* + * Fastpath with forced inlining to produce a kfree and kmem_cache_free that + * can perform fastpath freeing without additional function calls. + * + * The fastpath is only possible if we are freeing to the current cpu slab + * of this processor. This typically the case if we have just allocated + * the item before. + * + * If fastpath is not possible then fall back to __slab_free where we deal + * with all sorts of special processing. 
+ */ +static __always_inline void slab_free(struct kmem_cache *s, + struct page *page, void *x, unsigned long addr) +{ + void **object = (void *)x; + struct kmem_cache_cpu *c; + unsigned long tid; + + slab_free_hook(s, x); + +redo: + /* + * Determine the currently cpus per cpu slab. + * The cpu may change afterward. However that does not matter since + * data is retrieved via this pointer. If we are on the same cpu + * during the cmpxchg then the free will succedd. + */ + do { + tid = this_cpu_read(s->cpu_slab->tid); + c = raw_cpu_ptr(s->cpu_slab); + } while (IS_ENABLED(CONFIG_PREEMPT) && + unlikely(tid != READ_ONCE(c->tid))); + + /* Same with comment on barrier() in slab_alloc_node() */ + barrier(); + + if (likely(page == c->page)) { + set_freepointer(s, object, c->freelist); + + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + c->freelist, tid, + object, next_tid(tid)))) { + + note_cmpxchg_failure("slab_free", s, tid); + goto redo; + } + stat(s, FREE_FASTPATH); + } else + __slab_free(s, page, x, addr); + +} + +void kmem_cache_free(struct kmem_cache *s, void *x) +{ + s = cache_from_obj(s, x); + if (!s) + return; + slab_free(s, virt_to_head_page(x), x, _RET_IP_); + trace_kmem_cache_free(_RET_IP_, x); +} +EXPORT_SYMBOL(kmem_cache_free); + +/* + * Object placement in a slab is made very easy because we always start at + * offset 0. If we tune the size of the object to the alignment then we can + * get the required alignment by putting one properly sized object after + * another. + * + * Notice that the allocation order determines the sizes of the per cpu + * caches. Each processor has always one slab available for allocations. + * Increasing the allocation order reduces the number of times that slabs + * must be moved on and off the partial lists and is therefore a factor in + * locking overhead. + */ + +/* + * Mininum / Maximum order of slab pages. This influences locking overhead + * and slab fragmentation. 
A higher order reduces the number of partial slabs + * and increases the number of allocations possible without having to + * take the list_lock. + */ +static int slub_min_order; +static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static int slub_min_objects; + +/* + * Calculate the order of allocation given a slab object size. + * + * The order of allocation has significant impact on performance and other + * system components. Generally order 0 allocations should be preferred since + * order 0 does not cause fragmentation in the page allocator. Larger objects + * may be problematic to put into order 0 slabs because there may be too much + * unused space left. We go to a higher order if more than 1/16th of the slab + * would be wasted. + * + * In order to reach satisfactory performance we must ensure that a minimum + * number of objects is in one slab. Otherwise we may generate too much + * activity on the partial lists which requires taking the list_lock. This is + * less a concern for large slabs though which are rarely used. + * + * slub_max_order specifies the order where we begin to stop considering the + * number of objects in a slab as critical. If we reach slub_max_order then + * we try to keep the page order as low as possible. So we accept more waste + * of space in favor of a small page order. + * + * Higher order allocations also allow the placement of more objects in a + * slab and thereby reduce object handling overhead. If the user has + * requested a higher minimum order then we start with that one instead of + * the smallest order which will fit the object. 
+ */ +static inline int slab_order(int size, int min_objects, + int max_order, int fract_leftover, int reserved) +{ + int order; + int rem; + int min_order = slub_min_order; + + if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) + return get_order(size * MAX_OBJS_PER_PAGE) - 1; + + for (order = max(min_order, + fls(min_objects * size - 1) - PAGE_SHIFT); + order <= max_order; order++) { + + unsigned long slab_size = PAGE_SIZE << order; + + if (slab_size < min_objects * size + reserved) + continue; + + rem = (slab_size - reserved) % size; + + if (rem <= slab_size / fract_leftover) + break; + + } + + return order; +} + +static inline int calculate_order(int size, int reserved) +{ + int order; + int min_objects; + int fraction; + int max_objects; + + /* + * Attempt to find best configuration for a slab. This + * works by first attempting to generate a layout with + * the best configuration and backing off gradually. + * + * First we reduce the acceptable waste in a slab. Then + * we reduce the minimum objects required in a slab. + */ + min_objects = slub_min_objects; + if (!min_objects) + min_objects = 4 * (fls(nr_cpu_ids) + 1); + max_objects = order_objects(slub_max_order, size, reserved); + min_objects = min(min_objects, max_objects); + + while (min_objects > 1) { + fraction = 16; + while (fraction >= 4) { + order = slab_order(size, min_objects, + slub_max_order, fraction, reserved); + if (order <= slub_max_order) + return order; + fraction /= 2; + } + min_objects--; + } + + /* + * We were unable to place multiple objects in a slab. Now + * lets see if we can place a single object there. + */ + order = slab_order(size, 1, slub_max_order, 1, reserved); + if (order <= slub_max_order) + return order; + + /* + * Doh this slab cannot be placed using slub_max_order. 
+ */ + order = slab_order(size, 1, MAX_ORDER, 1, reserved); + if (order < MAX_ORDER) + return order; + return -ENOSYS; +} + +static void +init_kmem_cache_node(struct kmem_cache_node *n) +{ + n->nr_partial = 0; + raw_spin_lock_init(&n->list_lock); + INIT_LIST_HEAD(&n->partial); +#ifdef CONFIG_SLUB_DEBUG + atomic_long_set(&n->nr_slabs, 0); + atomic_long_set(&n->total_objects, 0); + INIT_LIST_HEAD(&n->full); +#endif +} + +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) +{ + BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < + KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); + + /* + * Must align to double word boundary for the double cmpxchg + * instructions to work; see __pcpu_double_call_return_bool(). + */ + s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), + 2 * sizeof(void *)); + + if (!s->cpu_slab) + return 0; + + init_kmem_cache_cpus(s); + + return 1; +} + +static struct kmem_cache *kmem_cache_node; + +/* + * No kmalloc_node yet so do it by hand. We know that this is the first + * slab on the node for this slabcache. There are no concurrent accesses + * possible. + * + * Note that this function only works on the kmem_cache_node + * when allocating for the kmem_cache_node. This is used for bootstrapping + * memory on a fresh node that has no slab structures yet. 
+ */ +static void early_kmem_cache_node_alloc(int node) +{ + struct page *page; + struct kmem_cache_node *n; + + BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); + + page = new_slab(kmem_cache_node, GFP_NOWAIT, node); + + BUG_ON(!page); + if (page_to_nid(page) != node) { + pr_err("SLUB: Unable to allocate memory from node %d\n", node); + pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); + } + + n = page->freelist; + BUG_ON(!n); + page->freelist = get_freepointer(kmem_cache_node, n); + page->inuse = 1; + page->frozen = 0; + kmem_cache_node->node[node] = n; +#ifdef CONFIG_SLUB_DEBUG + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); +#endif + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); + init_kmem_cache_node(n); + inc_slabs_node(kmem_cache_node, node, page->objects); + + /* + * No locks need to be taken here as it has just been + * initialized and there is no concurrent access. 
+ */ + __add_partial(n, page, DEACTIVATE_TO_HEAD); +} + +static void free_kmem_cache_nodes(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { + kmem_cache_free(kmem_cache_node, n); + s->node[node] = NULL; + } +} + +static int init_kmem_cache_nodes(struct kmem_cache *s) +{ + int node; + + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n; + + if (slab_state == DOWN) { + early_kmem_cache_node_alloc(node); + continue; + } + n = kmem_cache_alloc_node(kmem_cache_node, + GFP_KERNEL, node); + + if (!n) { + free_kmem_cache_nodes(s); + return 0; + } + + s->node[node] = n; + init_kmem_cache_node(n); + } + return 1; +} + +static void set_min_partial(struct kmem_cache *s, unsigned long min) +{ + if (min < MIN_PARTIAL) + min = MIN_PARTIAL; + else if (min > MAX_PARTIAL) + min = MAX_PARTIAL; + s->min_partial = min; +} + +/* + * calculate_sizes() determines the order and the distribution of data within + * a slab object. + */ +static int calculate_sizes(struct kmem_cache *s, int forced_order) +{ + unsigned long flags = s->flags; + unsigned long size = s->object_size; + int order; + + /* + * Round up object size to the next word boundary. We can only + * place the free pointer at word boundaries and this determines + * the possible location of the free pointer. + */ + size = ALIGN(size, sizeof(void *)); + +#ifdef CONFIG_SLUB_DEBUG + /* + * Determine if we can poison the object itself. If the user of + * the slab may touch the object after free or before allocation + * then we should never poison the object itself. + */ + if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && + !s->ctor) + s->flags |= __OBJECT_POISON; + else + s->flags &= ~__OBJECT_POISON; + + + /* + * If we are Redzoning then check if there is some space between the + * end of the object and the free pointer. If not then add an + * additional word to have some bytes to store Redzone information. 
+ */ + if ((flags & SLAB_RED_ZONE) && size == s->object_size) + size += sizeof(void *); +#endif + + /* + * With that we have determined the number of bytes in actual use + * by the object. This is the potential offset to the free pointer. + */ + s->inuse = size; + + if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || + s->ctor)) { + /* + * Relocate free pointer after the object if it is not + * permitted to overwrite the first word of the object on + * kmem_cache_free. + * + * This is the case if we do RCU, have a constructor or + * destructor or are poisoning the objects. + */ + s->offset = size; + size += sizeof(void *); + } + +#ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_STORE_USER) + /* + * Need to store information about allocs and frees after + * the object. + */ + size += 2 * sizeof(struct track); + + if (flags & SLAB_RED_ZONE) + /* + * Add some empty padding so that we can catch + * overwrites from earlier objects rather than let + * tracking information or the free pointer be + * corrupted if a user writes before the start + * of the object. + */ + size += sizeof(void *); +#endif + + /* + * SLUB stores one object immediately after another beginning from + * offset 0. In order to align the objects we have to simply size + * each object to conform to the alignment. 
+ */ + size = ALIGN(size, s->align); + s->size = size; + if (forced_order >= 0) + order = forced_order; + else + order = calculate_order(size, s->reserved); + + if (order < 0) + return 0; + + s->allocflags = 0; + if (order) + s->allocflags |= __GFP_COMP; + + if (s->flags & SLAB_CACHE_DMA) + s->allocflags |= GFP_DMA; + + if (s->flags & SLAB_RECLAIM_ACCOUNT) + s->allocflags |= __GFP_RECLAIMABLE; + + /* + * Determine the number of objects per slab + */ + s->oo = oo_make(order, size, s->reserved); + s->min = oo_make(get_order(size), size, s->reserved); + if (oo_objects(s->oo) > oo_objects(s->max)) + s->max = s->oo; + + return !!oo_objects(s->oo); +} + +static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) +{ + s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); + s->reserved = 0; + + if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + s->reserved = sizeof(struct rcu_head); + + if (!calculate_sizes(s, -1)) + goto error; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->object_size)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(s, -1)) + goto error; + } + } + +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; +#endif + + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. + */ + set_min_partial(s, ilog2(s->size) / 2); + + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. 
/*
 * Report a slab error and print every object of @page whose bit is not
 * set in the map filled by get_map().
 *
 * @s:    cache owning the slab
 * @page: slab page to inspect
 * @text: format string for slab_err(); s->name is its only argument
 *
 * Does nothing without CONFIG_SLUB_DEBUG, or when the bitmap cannot be
 * allocated.
 */
static void list_slab_objects(struct kmem_cache *s, struct page *page,
							const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
	void *base = page_address(page);
	void *obj;
	unsigned long *map;

	map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long),
		      GFP_ATOMIC);
	if (!map)
		return;

	slab_err(s, page, text, s->name);
	slab_lock(page);

	get_map(s, page, map);
	for_each_object(obj, s, base, page->objects) {
		if (test_bit(slab_index(obj, s, base), map))
			continue;

		pr_err("INFO: Object 0x%p @offset=%tu\n", obj, obj - base);
		print_tracking(s, obj);
	}

	slab_unlock(page);
	kfree(map);
#endif
}
+ */ +static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) +{ + struct page *page, *h; + + list_for_each_entry_safe(page, h, &n->partial, lru) { + if (!page->inuse) { + __remove_partial(n, page); + discard_slab(s, page); + } else { + list_slab_objects(s, page, + "Objects remaining in %s on kmem_cache_close()"); + } + } +} + +/* + * Release all resources used by a slab cache. + */ +static inline int kmem_cache_close(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + flush_all(s); + /* Attempt to free all objects */ + for_each_kmem_cache_node(s, node, n) { + free_partial(s, n); + if (n->nr_partial || slabs_node(s, node)) + return 1; + } + free_percpu(s->cpu_slab); + free_kmem_cache_nodes(s); + return 0; +} + +int __kmem_cache_shutdown(struct kmem_cache *s) +{ + return kmem_cache_close(s); +} + +/******************************************************************** + * Kmalloc subsystem + *******************************************************************/ + +static int __init setup_slub_min_order(char *str) +{ + get_option(&str, &slub_min_order); + + return 1; +} + +__setup("slub_min_order=", setup_slub_min_order); + +static int __init setup_slub_max_order(char *str) +{ + get_option(&str, &slub_max_order); + slub_max_order = min(slub_max_order, MAX_ORDER - 1); + + return 1; +} + +__setup("slub_max_order=", setup_slub_max_order); + +static int __init setup_slub_min_objects(char *str) +{ + get_option(&str, &slub_min_objects); + + return 1; +} + +__setup("slub_min_objects=", setup_slub_min_objects); + +void *__kmalloc(size_t size, gfp_t flags) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return kmalloc_large(size, flags); + + s = kmalloc_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + ret = slab_alloc(s, flags, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + + kasan_kmalloc(s, ret, size); + + return ret; +} +EXPORT_SYMBOL(__kmalloc); + 
+#ifdef CONFIG_NUMA +static void *kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + struct page *page; + void *ptr = NULL; + + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); + if (page) + ptr = page_address(page); + + kmalloc_large_node_hook(ptr, size, flags); + return ptr; +} + +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = kmalloc_large_node(size, flags, node); + + trace_kmalloc_node(_RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); + + return ret; + } + + s = kmalloc_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + ret = slab_alloc_node(s, flags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); + + kasan_kmalloc(s, ret, size); + + return ret; +} +EXPORT_SYMBOL(__kmalloc_node); +#endif + +static size_t __ksize(const void *object) +{ + struct page *page; + + if (unlikely(object == ZERO_SIZE_PTR)) + return 0; + + page = virt_to_head_page(object); + + if (unlikely(!PageSlab(page))) { + WARN_ON(!PageCompound(page)); + return PAGE_SIZE << compound_order(page); + } + + return slab_ksize(page->slab_cache); +} + +size_t ksize(const void *object) +{ + size_t size = __ksize(object); + /* We assume that ksize callers could use whole allocated area, + so we need unpoison this area. 
*/ + kasan_krealloc(object, size); + return size; +} +EXPORT_SYMBOL(ksize); + +void kfree(const void *x) +{ + struct page *page; + void *object = (void *)x; + + trace_kfree(_RET_IP_, x); + + if (unlikely(ZERO_OR_NULL_PTR(x))) + return; + + page = virt_to_head_page(x); + if (unlikely(!PageSlab(page))) { + BUG_ON(!PageCompound(page)); + kfree_hook(x); + __free_kmem_pages(page, compound_order(page)); + return; + } + slab_free(page->slab_cache, page, object, _RET_IP_); +} +EXPORT_SYMBOL(kfree); + +#define SHRINK_PROMOTE_MAX 32 + +/* + * kmem_cache_shrink discards empty slabs and promotes the slabs filled + * up most to the head of the partial lists. New allocations will then + * fill those up and thus they can be removed from the partial lists. + * + * The slabs with the least items are placed last. This results in them + * being allocated from last increasing the chance that the last objects + * are freed in them. + */ +int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) +{ + int node; + int i; + struct kmem_cache_node *n; + struct page *page; + struct page *t; + struct list_head discard; + struct list_head promote[SHRINK_PROMOTE_MAX]; + unsigned long flags; + int ret = 0; + + if (deactivate) { + /* + * Disable empty slabs caching. Used to avoid pinning offline + * memory cgroups by kmem pages that can be freed. + */ + s->cpu_partial = 0; + s->min_partial = 0; + + /* + * s->cpu_partial is checked locklessly (see put_cpu_partial), + * so we have to make sure the change is visible. + */ + kick_all_cpus_sync(); + } + + flush_all(s); + for_each_kmem_cache_node(s, node, n) { + INIT_LIST_HEAD(&discard); + for (i = 0; i < SHRINK_PROMOTE_MAX; i++) + INIT_LIST_HEAD(promote + i); + + raw_spin_lock_irqsave(&n->list_lock, flags); + + /* + * Build lists of slabs to discard or promote. + * + * Note that concurrent frees may occur while we hold the + * list_lock. page->inuse here is the upper limit. 
+ */ + list_for_each_entry_safe(page, t, &n->partial, lru) { + int free = page->objects - page->inuse; + + /* Do not reread page->inuse */ + barrier(); + + /* We do not keep full slabs on the list */ + BUG_ON(free <= 0); + + if (free == page->objects) { + list_move(&page->lru, &discard); + n->nr_partial--; + } else if (free <= SHRINK_PROMOTE_MAX) + list_move(&page->lru, promote + free - 1); + } + + /* + * Promote the slabs filled up most to the head of the + * partial list. + */ + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) + list_splice(promote + i, &n->partial); + + raw_spin_unlock_irqrestore(&n->list_lock, flags); + + /* Release empty slabs */ + list_for_each_entry_safe(page, t, &discard, lru) + discard_slab(s, page); + + if (slabs_node(s, node)) + ret = 1; + } + + return ret; +} + +static int slab_mem_going_offline_callback(void *arg) +{ + struct kmem_cache *s; + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) + __kmem_cache_shrink(s, false); + mutex_unlock(&slab_mutex); + + return 0; +} + +static void slab_mem_offline_callback(void *arg) +{ + struct kmem_cache_node *n; + struct kmem_cache *s; + struct memory_notify *marg = arg; + int offline_node; + + offline_node = marg->status_change_nid_normal; + + /* + * If the node still has available memory. we need kmem_cache_node + * for it yet. + */ + if (offline_node < 0) + return; + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + n = get_node(s, offline_node); + if (n) { + /* + * if n->nr_slabs > 0, slabs still exist on the node + * that is going down. We were unable to free them, + * and offline_pages() function shouldn't call this + * callback. So, we must fail. 
+ */ + BUG_ON(slabs_node(s, offline_node)); + + s->node[offline_node] = NULL; + kmem_cache_free(kmem_cache_node, n); + } + } + mutex_unlock(&slab_mutex); +} + /* MEM_GOING_ONLINE handler: under slab_mutex, allocates and installs a kmem_cache_node for node marg->status_change_nid_normal in every cache on slab_caches. Returns 0, or -ENOMEM when an allocation fails; caches already processed keep their new node structure. */ +static int slab_mem_going_online_callback(void *arg) +{ + struct kmem_cache_node *n; + struct kmem_cache *s; + struct memory_notify *marg = arg; + int nid = marg->status_change_nid_normal; + int ret = 0; + + /* + * If the node's memory is already available, then kmem_cache_node is + * already created. Nothing to do. + */ + if (nid < 0) + return 0; + + /* + * We are bringing a node online. No memory is available yet. We must + * allocate a kmem_cache_node structure in order to bring the node + * online. + */ + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + /* + * XXX: kmem_cache_alloc_node will fallback to other nodes + * since memory is not yet available from the node that + * is brought up. + */ + n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + init_kmem_cache_node(n); + s->node[nid] = n; + } +out: + mutex_unlock(&slab_mutex); + return ret; +} + /* Memory hotplug notifier: dispatches on action; a non-zero errno from a handler is converted with notifier_from_errno(), otherwise NOTIFY_OK is returned. */ +static int slab_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + ret = slab_mem_going_online_callback(arg); + break; + case MEM_GOING_OFFLINE: + ret = slab_mem_going_offline_callback(arg); + break; + case MEM_OFFLINE: + case MEM_CANCEL_ONLINE: + slab_mem_offline_callback(arg); + break; + case MEM_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; + return ret; +} + +static struct notifier_block slab_memory_callback_nb = { + .notifier_call = slab_memory_callback, + .priority = SLAB_CALLBACK_PRI, +}; + +/******************************************************************** + * Basic setup of slabs + *******************************************************************/ + +/* + * Used for early kmem_cache structures that were 
allocated using + * the page allocator. Allocate them properly then fix up the pointers + * that may be pointing to the wrong kmem_cache structure. + */ + +static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) +{ + int node; + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n; + + memcpy(s, static_cache, kmem_cache->object_size); + + /* + * This runs very early, and only the boot processor is supposed to be + * up. Even if it weren't true, IRQs are not up so we couldn't fire + * IPIs around. + */ + __flush_cpu_slab(s, smp_processor_id()); + for_each_kmem_cache_node(s, node, n) { + struct page *p; + + list_for_each_entry(p, &n->partial, lru) + p->slab_cache = s; + +#ifdef CONFIG_SLUB_DEBUG + list_for_each_entry(p, &n->full, lru) + p->slab_cache = s; +#endif + } + slab_init_memcg_params(s); + list_add(&s->list, &slab_caches); + return s; +} + /* Boot-time setup: initializes the per-cpu slub_free_list, creates the kmem_cache_node and kmem_cache caches from static __initdata storage, registers the memory hotplug notifier, replaces both static caches through bootstrap(), then creates the kmalloc caches and (on SMP) registers the cpu notifier. */ +void __init kmem_cache_init(void) +{ + static __initdata struct kmem_cache boot_kmem_cache, + boot_kmem_cache_node; + int cpu; + + for_each_possible_cpu(cpu) { + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock); + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list); + } + + if (debug_guardpage_minorder()) + slub_max_order = 0; + + kmem_cache_node = &boot_kmem_cache_node; + kmem_cache = &boot_kmem_cache; + + create_boot_cache(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); + + register_hotmemory_notifier(&slab_memory_callback_nb); + + /* Able to allocate the per node structures */ + slab_state = PARTIAL; + + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), + SLAB_HWCACHE_ALIGN); + + kmem_cache = bootstrap(&boot_kmem_cache); + + /* + * Allocate kmem_cache_node properly from the kmem_cache slab. + * kmem_cache_node is separately allocated so no need to + * update any list pointers. 
+ */ + kmem_cache_node = bootstrap(&boot_kmem_cache_node); + + /* Now we can use the kmem_cache to allocate kmalloc slabs */ + create_kmalloc_caches(0); + +#ifdef CONFIG_SMP + register_cpu_notifier(&slab_notifier); +#endif + + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", + cache_line_size(), + slub_min_order, slub_max_order, slub_min_objects, + nr_cpu_ids, nr_node_ids); +} + +void __init kmem_cache_init_late(void) +{ +} + /* Reuse an existing mergeable cache for a new cache request: on a match the refcount is raised, object_size and inuse are widened (also in the memcg child caches) and a sysfs alias is registered. Returns the merged cache, or NULL when find_mergeable() finds nothing or the alias cannot be created (the refcount is dropped again in that case). */ +struct kmem_cache * +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) +{ + struct kmem_cache *s, *c; + + s = find_mergeable(size, align, flags, name, ctor); + if (s) { + s->refcount++; + + /* + * Adjust the object sizes so that we clear + * the complete object on kzalloc. + */ + s->object_size = max(s->object_size, (int)size); + s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + + for_each_memcg_cache(c, s) { + c->object_size = s->object_size; + c->inuse = max_t(int, c->inuse, + ALIGN(size, sizeof(void *))); + } + + if (sysfs_slab_alias(s, name)) { + s->refcount--; + s = NULL; + } + } + + return s; +} + /* Opens the cache and, once slab_state is past UP, propagates memcg attributes and adds the sysfs entry; the cache is closed again if sysfs_slab_add() fails. Returns 0 or a negative errno. */ +int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) +{ + int err; + + err = kmem_cache_open(s, flags); + if (err) + return err; + + /* Mutex is not taken during early boot */ + if (slab_state <= UP) + return 0; + + memcg_propagate_slab_attrs(s); + err = sysfs_slab_add(s); + if (err) + kmem_cache_close(s); + + return err; +} + +#ifdef CONFIG_SMP +/* + * Use the cpu notifier to ensure that the cpu slabs are flushed when + * necessary. 
+ */ +static int slab_cpuup_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + struct kmem_cache *s; + unsigned long flags; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + local_irq_save(flags); + __flush_cpu_slab(s, cpu); + local_irq_restore(flags); + } + mutex_unlock(&slab_mutex); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block slab_notifier = { + .notifier_call = slab_cpuup_callback +}; + +#endif + /* kmalloc variant that reports the supplied caller address in the kmalloc trace event. Sizes above KMALLOC_MAX_CACHE_SIZE are handed to kmalloc_large(); a ZERO_OR_NULL_PTR cache lookup result is returned unchanged. */ +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return kmalloc_large(size, gfpflags); + + s = kmalloc_slab(size, gfpflags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + ret = slab_alloc(s, gfpflags, caller); + + /* Honor the call site pointer we received. */ + trace_kmalloc(caller, ret, size, s->size, gfpflags); + + return ret; +} + +#ifdef CONFIG_NUMA /* Node-aware counterpart of __kmalloc_track_caller(); the large-size path is traced here as well. */ +void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, + int node, unsigned long caller) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = kmalloc_large_node(size, gfpflags, node); + + trace_kmalloc_node(caller, ret, + size, PAGE_SIZE << get_order(size), + gfpflags, node); + + return ret; + } + + s = kmalloc_slab(size, gfpflags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + ret = slab_alloc_node(s, gfpflags, node, caller); + + /* Honor the call site pointer we received. 
*/ + trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); + + return ret; +} +#endif + +#ifdef CONFIG_SYSFS +static int count_inuse(struct page *page) +{ + return page->inuse; +} + +static int count_total(struct page *page) +{ + return page->objects; +} +#endif + +#ifdef CONFIG_SLUB_DEBUG /* validate_slab(): checks one slab page. Objects marked in the map filled by get_map() are checked as SLUB_RED_INACTIVE, all others as SLUB_RED_ACTIVE. Returns 1 when every check passes, 0 otherwise. map must have room for page->objects bits. */ +static int validate_slab(struct kmem_cache *s, struct page *page, + unsigned long *map) +{ + void *p; + void *addr = page_address(page); + + if (!check_slab(s, page) || + !on_freelist(s, page, NULL)) + return 0; + + /* Now we know that a valid freelist exists */ + bitmap_zero(map, page->objects); + + get_map(s, page, map); + for_each_object(p, s, addr, page->objects) { + if (test_bit(slab_index(p, s, addr), map)) + if (!check_object(s, page, p, SLUB_RED_INACTIVE)) + return 0; + } + + for_each_object(p, s, addr, page->objects) + if (!test_bit(slab_index(p, s, addr), map)) + if (!check_object(s, page, p, SLUB_RED_ACTIVE)) + return 0; + return 1; +} + /* Runs validate_slab() with the slab lock held; the result is discarded. */ +static void validate_slab_slab(struct kmem_cache *s, struct page *page, + unsigned long *map) +{ + slab_lock(page); + validate_slab(s, page, map); + slab_unlock(page); +} + /* Validates the partial list of a node, and the full list too when SLAB_STORE_USER is set, holding n->list_lock with interrupts disabled. Counter mismatches are reported with pr_err(). Returns the number of slabs visited. */ +static int validate_slab_node(struct kmem_cache *s, + struct kmem_cache_node *n, unsigned long *map) +{ + unsigned long count = 0; + struct page *page; + unsigned long flags; + + raw_spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry(page, &n->partial, lru) { + validate_slab_slab(s, page, map); + count++; + } + if (count != n->nr_partial) + pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", + s->name, count, n->nr_partial); + + if (!(s->flags & SLAB_STORE_USER)) + goto out; + + list_for_each_entry(page, &n->full, lru) { + validate_slab_slab(s, page, map); + count++; + } + if (count != atomic_long_read(&n->nr_slabs)) + pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", + s->name, count, atomic_long_read(&n->nr_slabs)); + +out: + raw_spin_unlock_irqrestore(&n->list_lock, flags); + return count; +} + +static long 
validate_slab_cache(struct kmem_cache *s) +{ + int node; + unsigned long count = 0; + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * + sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; + + if (!map) + return -ENOMEM; + + /* Flush the cpu slabs first, then validate every node; returns the number of slabs visited. */ + flush_all(s); + for_each_kmem_cache_node(s, node, n) + count += validate_slab_node(s, n, map); + kfree(map); + return count; +} +/* + * Generate lists of code addresses where slabcache objects are allocated + * and freed. + */ + /* One entry per distinct call-site address: hit count, age and pid ranges, and the cpus and nodes seen. */ +struct location { + unsigned long count; + unsigned long addr; + long long sum_time; + long min_time; + long max_time; + long min_pid; + long max_pid; + DECLARE_BITMAP(cpus, NR_CPUS); + nodemask_t nodes; +}; + /* Table of struct location entries: max is the capacity, count the number in use. */ +struct loc_track { + unsigned long max; + unsigned long count; + struct location *loc; +}; + /* Releases the page allocation behind t->loc; a table with max == 0 owns nothing. */ +static void free_loc_track(struct loc_track *t) +{ + if (t->max) + free_pages((unsigned long)t->loc, + get_order(sizeof(struct location) * t->max)); +} + /* Allocates a table with room for max entries, copies the existing entries over and frees the old table. Returns 1 on success, 0 if the pages could not be allocated (the old table is left untouched). */ +static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) +{ + struct location *l; + int order; + + order = get_order(sizeof(struct location) * max); + + l = (void *)__get_free_pages(flags, order); + if (!l) + return 0; + + if (t->count) { + memcpy(l, t->loc, sizeof(struct location) * t->count); + free_loc_track(t); + } + t->max = max; + t->loc = l; + return 1; +} + /* Records one track in the table. The table is kept ordered by address: a binary search either finds the entry for track->addr and updates its statistics, or yields the insert position for a new entry. The table is doubled with GFP_ATOMIC when full. Returns 1 on success, 0 if the table could not grow. */ +static int add_location(struct loc_track *t, struct kmem_cache *s, + const struct track *track) +{ + long start, end, pos; + struct location *l; + unsigned long caddr; + unsigned long age = jiffies - track->when; + + start = -1; + end = t->count; + + for ( ; ; ) { + pos = start + (end - start + 1) / 2; + + /* + * There is nothing at "end". If we end up there + * we need to add something before end. 
+ */ + if (pos == end) + break; + + caddr = t->loc[pos].addr; + if (track->addr == caddr) { + + l = &t->loc[pos]; + l->count++; + if (track->when) { + l->sum_time += age; + if (age < l->min_time) + l->min_time = age; + if (age > l->max_time) + l->max_time = age; + + if (track->pid < l->min_pid) + l->min_pid = track->pid; + if (track->pid > l->max_pid) + l->max_pid = track->pid; + + cpumask_set_cpu(track->cpu, + to_cpumask(l->cpus)); + } + node_set(page_to_nid(virt_to_page(track)), l->nodes); + return 1; + } + + if (track->addr < caddr) + end = pos; + else + start = pos; + } + + /* + * Not found. Insert new tracking element. + */ + if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) + return 0; + + l = t->loc + pos; + if (pos < t->count) + memmove(l + 1, l, + (t->count - pos) * sizeof(struct location)); + t->count++; + l->count = 1; + l->addr = track->addr; + l->sum_time = age; + l->min_time = age; + l->max_time = age; + l->min_pid = track->pid; + l->max_pid = track->pid; + cpumask_clear(to_cpumask(l->cpus)); + cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); + nodes_clear(l->nodes); + node_set(page_to_nid(virt_to_page(track)), l->nodes); + return 1; +} + /* Adds the requested track of every object of the page that is not marked in the map filled by get_map(). The return value of add_location() is ignored, so entries are silently dropped when the table cannot grow. */ +static void process_slab(struct loc_track *t, struct kmem_cache *s, + struct page *page, enum track_item alloc, + unsigned long *map) +{ + void *addr = page_address(page); + void *p; + + bitmap_zero(map, page->objects); + get_map(s, page, map); + + for_each_object(p, s, addr, page->objects) + if (!test_bit(slab_index(p, s, addr), map)) + add_location(t, s, get_track(s, p, alloc)); +} + /* Formats the call-site table of a cache into buf and returns the number of bytes written. */ +static int list_locations(struct kmem_cache *s, char *buf, + enum track_item alloc) +{ + int len = 0; + unsigned long i; + struct loc_track t = { 0, 0, NULL }; + int node; + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * + sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; + + if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), + GFP_TEMPORARY)) { + kfree(map); + 
return sprintf(buf, "Out of memory\n"); + } + /* Push back cpu slabs */ + flush_all(s); + + for_each_kmem_cache_node(s, node, n) { + unsigned long flags; + struct page *page; + + if (!atomic_long_read(&n->nr_slabs)) + continue; + + raw_spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) + process_slab(&t, s, page, alloc, map); + list_for_each_entry(page, &n->full, lru) + process_slab(&t, s, page, alloc, map); + raw_spin_unlock_irqrestore(&n->list_lock, flags); + } + + for (i = 0; i < t.count; i++) { + struct location *l = &t.loc[i]; + + /* Stop while there is still room for one complete entry. */ + if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) + break; + len += sprintf(buf + len, "%7ld ", l->count); + + /* An entry without a recorded address still gets a visible placeholder; an empty format string would leave the column blank. */ + if (l->addr) + len += sprintf(buf + len, "%pS", (void *)l->addr); + else + len += sprintf(buf + len, "<not-available>"); + + if (l->sum_time != l->min_time) { + len += sprintf(buf + len, " age=%ld/%ld/%ld", + l->min_time, + (long)div_u64(l->sum_time, l->count), + l->max_time); + } else + len += sprintf(buf + len, " age=%ld", + l->min_time); + + if (l->min_pid != l->max_pid) + len += sprintf(buf + len, " pid=%ld-%ld", + l->min_pid, l->max_pid); + else + len += sprintf(buf + len, " pid=%ld", + l->min_pid); + + if (num_online_cpus() > 1 && + !cpumask_empty(to_cpumask(l->cpus)) && + len < PAGE_SIZE - 60) + len += scnprintf(buf + len, PAGE_SIZE - len - 50, + " cpus=%*pbl", + cpumask_pr_args(to_cpumask(l->cpus))); + + if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && + len < PAGE_SIZE - 60) + len += scnprintf(buf + len, PAGE_SIZE - len - 50, + " nodes=%*pbl", + nodemask_pr_args(&l->nodes)); + + len += sprintf(buf + len, "\n"); + } + + free_loc_track(&t); + kfree(map); + if (!t.count) + len += sprintf(buf, "No data\n"); + return len; +} +#endif + /* Self test, built only when SLUB_RESILIENCY_TEST is defined: deliberately corrupts kmalloc objects (after allocation and after free) and runs validate_slab_cache() on the affected kmalloc cache after each step. */ +#ifdef SLUB_RESILIENCY_TEST +static void __init resiliency_test(void) +{ + u8 *p; + + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); + + pr_err("SLUB resiliency testing\n"); + pr_err("-----------------------\n"); + pr_err("A. 
Corruption after allocation\n"); + + p = kzalloc(16, GFP_KERNEL); + p[16] = 0x12; + pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", + p + 16); + + validate_slab_cache(kmalloc_caches[4]); + + /* Hmmm... The next two are dangerous */ + p = kzalloc(32, GFP_KERNEL); + p[32 + sizeof(void *)] = 0x34; + pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); + + validate_slab_cache(kmalloc_caches[5]); + p = kzalloc(64, GFP_KERNEL); + p += 64 + (get_cycles() & 0xff) * sizeof(void *); + *p = 0x56; + pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); + validate_slab_cache(kmalloc_caches[6]); + + pr_err("\nB. Corruption after free\n"); + p = kzalloc(128, GFP_KERNEL); + kfree(p); + *p = 0x78; + pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[7]); + + p = kzalloc(256, GFP_KERNEL); + kfree(p); + p[50] = 0x9a; + pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[8]); + + p = kzalloc(512, GFP_KERNEL); + kfree(p); + p[512] = 0xab; + pr_err("\n3. 
kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[9]); +} +#else +#ifdef CONFIG_SYSFS +static void resiliency_test(void) {}; +#endif +#endif + +#ifdef CONFIG_SYSFS +enum slab_stat_type { + SL_ALL, /* All slabs */ + SL_PARTIAL, /* Only partially allocated slabs */ + SL_CPU, /* Only slabs used for cpu caches */ + SL_OBJECTS, /* Determine allocated objects not slabs */ + SL_TOTAL /* Determine object capacity not slabs */ +}; + +#define SO_ALL (1 << SL_ALL) +#define SO_PARTIAL (1 << SL_PARTIAL) +#define SO_CPU (1 << SL_CPU) +#define SO_OBJECTS (1 << SL_OBJECTS) +#define SO_TOTAL (1 << SL_TOTAL) + /* Formats a count selected by the SO_ flags into buf: the total first, then on NUMA one Nn=value field per node with a non-zero count. Returns the number of bytes written or -ENOMEM. */ +static ssize_t show_slab_objects(struct kmem_cache *s, + char *buf, unsigned long flags) +{ + unsigned long total = 0; + int node; + int x; + unsigned long *nodes; + + nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + if (!nodes) + return -ENOMEM; + + if (flags & SO_CPU) { + int cpu; + + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, + cpu); + int node; + struct page *page; + + page = READ_ONCE(c->page); + if (!page) + continue; + + node = page_to_nid(page); + if (flags & SO_TOTAL) + x = page->objects; + else if (flags & SO_OBJECTS) + x = page->inuse; + else + x = 1; + + total += x; + nodes[node] += x; + + page = READ_ONCE(c->partial); + if (page) { + node = page_to_nid(page); + /* NOTE(review): with SO_TOTAL or SO_OBJECTS only a warning fires and x keeps the value computed for the cpu slab above - confirm that is intended. */ + if (flags & SO_TOTAL) + WARN_ON_ONCE(1); + else if (flags & SO_OBJECTS) + WARN_ON_ONCE(1); + else + x = page->pages; + total += x; + nodes[node] += x; + } + } + } + + get_online_mems(); +#ifdef CONFIG_SLUB_DEBUG + if (flags & SO_ALL) { + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { + + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); + else + x = atomic_long_read(&n->nr_slabs); + total += x; + nodes[node] += x; + } + + } else +#endif + if (flags & SO_PARTIAL) { + 
struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { + if (flags & SO_TOTAL) + x = count_partial(n, count_total); + else if (flags & SO_OBJECTS) + x = count_partial(n, count_inuse); + else + x = n->nr_partial; + total += x; + nodes[node] += x; + } + } + x = sprintf(buf, "%lu", total); +#ifdef CONFIG_NUMA + for (node = 0; node < nr_node_ids; node++) + if (nodes[node]) + x += sprintf(buf + x, " N%d=%lu", + node, nodes[node]); +#endif + put_online_mems(); + kfree(nodes); + return x + sprintf(buf + x, "\n"); +} + +#ifdef CONFIG_SLUB_DEBUG /* any_slab_objects(): returns 1 if any node of the cache reports a non-zero total_objects count, else 0. */ +static int any_slab_objects(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (atomic_long_read(&n->total_objects)) + return 1; + + return 0; +} +#endif + /* Map a generic attribute or kobject back to the enclosing slab_attribute or kmem_cache. */ +#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) +#define to_slab(n) container_of(n, struct kmem_cache, kobj) + /* A sysfs attribute of a cache together with its optional show and store handlers. */ +struct slab_attribute { + struct attribute attr; + ssize_t (*show)(struct kmem_cache *s, char *buf); + ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); +}; + /* SLAB_ATTR_RO declares name_attr with mode 0400 and only name_show; SLAB_ATTR declares it with mode 0600 and both name_show and name_store. */ +#define SLAB_ATTR_RO(_name) \ + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0400, _name##_show, NULL) + +#define SLAB_ATTR(_name) \ + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0600, _name##_show, _name##_store) + +static ssize_t slab_size_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->size); +} +SLAB_ATTR_RO(slab_size); + +static ssize_t align_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->align); +} +SLAB_ATTR_RO(align); + +static ssize_t object_size_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->object_size); +} +SLAB_ATTR_RO(object_size); + +static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", oo_objects(s->oo)); +} +SLAB_ATTR_RO(objs_per_slab); + /* order_store(): parses a decimal order, rejects values outside slub_min_order..slub_max_order with -EINVAL and otherwise recalculates the cache sizes for that order. */ +static ssize_t order_store(struct kmem_cache *s, + const char *buf, size_t length) 
+{ + unsigned long order; + int err; + + err = kstrtoul(buf, 10, &order); + if (err) + return err; + + /* Only orders inside the global slub_min_order..slub_max_order window are accepted. */ + if (order > slub_max_order || order < slub_min_order) + return -EINVAL; + + calculate_sizes(s, order); + return length; +} + +static ssize_t order_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", oo_order(s->oo)); +} +SLAB_ATTR(order); + +static ssize_t min_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%lu\n", s->min_partial); +} + /* Parses a decimal value and hands it to set_min_partial(); a parse error from kstrtoul() is returned unchanged. */ +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long min; + int err; + + err = kstrtoul(buf, 10, &min); + if (err) + return err; + + set_min_partial(s, min); + return length; +} +SLAB_ATTR(min_partial); + +static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%u\n", s->cpu_partial); +} + /* Parses a decimal value; a non-zero value is rejected with -EINVAL when kmem_cache_has_cpu_partial() is false. Otherwise the value is stored and all cpu slabs are flushed. */ +static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long objects; + int err; + + err = kstrtoul(buf, 10, &objects); + if (err) + return err; + if (objects && !kmem_cache_has_cpu_partial(s)) + return -EINVAL; + + s->cpu_partial = objects; + flush_all(s); + return length; +} +SLAB_ATTR(cpu_partial); + /* Prints the constructor symbol; produces no output (returns 0) for a cache without a constructor. */ +static ssize_t ctor_show(struct kmem_cache *s, char *buf) +{ + if (!s->ctor) + return 0; + return sprintf(buf, "%pS\n", s->ctor); +} +SLAB_ATTR_RO(ctor); + /* Prints refcount - 1, or 0 when the refcount is negative. */ +static ssize_t aliases_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->refcount < 0 ? 
0 : s->refcount - 1); +} +SLAB_ATTR_RO(aliases); + /* The next four attributes are thin wrappers that select what show_slab_objects() counts through its SO_ flags. */ +static ssize_t partial_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_PARTIAL); +} +SLAB_ATTR_RO(partial); + +static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_CPU); +} +SLAB_ATTR_RO(cpu_slabs); + +static ssize_t objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects); + +static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects_partial); + /* Prints objects(pages) summed over the per-cpu partial lists of all online cpus, followed on SMP by one Cn=objects(pages) field per cpu that has a partial list. The per-cpu fields stop once fewer than 20 bytes of the page remain. */ +static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) +{ + int objects = 0; + int pages = 0; + int cpu; + int len; + + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + + if (page) { + pages += page->pages; + objects += page->pobjects; + } + } + + len = sprintf(buf, "%d(%d)", objects, pages); + +#ifdef CONFIG_SMP + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + + if (page && len < PAGE_SIZE - 20) + len += sprintf(buf + len, " C%d=%d(%d)", cpu, + page->pobjects, page->pages); + } +#endif + return len + sprintf(buf + len, "\n"); +} +SLAB_ATTR_RO(slabs_cpu_partial); + +static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); +} + /* Sets SLAB_RECLAIM_ACCOUNT when the input starts with the character 1; any other input clears the flag. Always reports the full length as consumed. */ +static ssize_t reclaim_account_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + s->flags &= ~SLAB_RECLAIM_ACCOUNT; + if (buf[0] == '1') + s->flags |= SLAB_RECLAIM_ACCOUNT; + return length; +} +SLAB_ATTR(reclaim_account); + +static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); +} +SLAB_ATTR_RO(hwcache_align); + +#ifdef CONFIG_ZONE_DMA +static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) +{ + return 
sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); +} +SLAB_ATTR_RO(cache_dma); +#endif + +static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); +} +SLAB_ATTR_RO(destroy_by_rcu); + +static ssize_t reserved_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->reserved); +} +SLAB_ATTR_RO(reserved); + +#ifdef CONFIG_SLUB_DEBUG +static ssize_t slabs_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL); +} +SLAB_ATTR_RO(slabs); + +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); +} +SLAB_ATTR_RO(total_objects); + +static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); +} + /* Input starting with the character 1 sets SLAB_DEBUG_FREE and clears __CMPXCHG_DOUBLE; any other input only clears SLAB_DEBUG_FREE. The same pattern is used by the trace, red_zone, poison and store_user attributes below. */ +static ssize_t sanity_checks_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + s->flags &= ~SLAB_DEBUG_FREE; + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; + s->flags |= SLAB_DEBUG_FREE; + } + return length; +} +SLAB_ATTR(sanity_checks); + +static ssize_t trace_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); +} + +static ssize_t trace_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + /* + * Tracing a merged cache is going to give confusing results + * as well as cause other issues like converting a mergeable + * cache into an unmergeable one. 
+ */ + if (s->refcount > 1) + return -EINVAL; + + s->flags &= ~SLAB_TRACE; + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; + s->flags |= SLAB_TRACE; + } + return length; +} +SLAB_ATTR(trace); + +static ssize_t red_zone_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); +} + /* Returns -EBUSY while any node of the cache still holds objects; otherwise updates the flag and reruns calculate_sizes(). poison_store() and store_user_store() follow the same scheme. */ +static ssize_t red_zone_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_RED_ZONE; + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; + s->flags |= SLAB_RED_ZONE; + } + calculate_sizes(s, -1); + return length; +} +SLAB_ATTR(red_zone); + +static ssize_t poison_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); +} + +static ssize_t poison_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_POISON; + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; + s->flags |= SLAB_POISON; + } + calculate_sizes(s, -1); + return length; +} +SLAB_ATTR(poison); + +static ssize_t store_user_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); +} + +static ssize_t store_user_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_STORE_USER; + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; + s->flags |= SLAB_STORE_USER; + } + calculate_sizes(s, -1); + return length; +} +SLAB_ATTR(store_user); + /* Reading validate yields no data: 0 is returned and buf is left untouched. */ +static ssize_t validate_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + /* Input starting with the character 1 runs validate_slab_cache(); a non-negative result is reported as the full length consumed, a negative one is returned as the error. Any other input yields -EINVAL. */ +static ssize_t validate_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int ret = -EINVAL; + + if (buf[0] == '1') { + ret = validate_slab_cache(s); + if (ret >= 0) + ret = length; + } + return ret; +} +SLAB_ATTR(validate); + /* Call-site listing needs SLAB_STORE_USER; without it -ENOSYS is returned. */ +static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & 
SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_ALLOC); +} +SLAB_ATTR_RO(alloc_calls); + +static ssize_t free_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_FREE); +} +SLAB_ATTR_RO(free_calls); +#endif /* CONFIG_SLUB_DEBUG */ + +#ifdef CONFIG_FAILSLAB +static ssize_t failslab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); +} + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + if (s->refcount > 1) + return -EINVAL; + + s->flags &= ~SLAB_FAILSLAB; + if (buf[0] == '1') + s->flags |= SLAB_FAILSLAB; + return length; +} +SLAB_ATTR(failslab); +#endif + +static ssize_t shrink_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t shrink_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (buf[0] == '1') + kmem_cache_shrink(s); + else + return -EINVAL; + return length; +} +SLAB_ATTR(shrink); + +#ifdef CONFIG_NUMA +static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); +} + +static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + unsigned long ratio; + int err; + + err = kstrtoul(buf, 10, &ratio); + if (err) + return err; + + if (ratio <= 100) + s->remote_node_defrag_ratio = ratio * 10; + + return length; +} +SLAB_ATTR(remote_node_defrag_ratio); +#endif + +#ifdef CONFIG_SLUB_STATS +static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) +{ + unsigned long sum = 0; + int cpu; + int len; + int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + for_each_online_cpu(cpu) { + unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; + + data[cpu] = x; + sum += x; + } + + len = sprintf(buf, "%lu", sum); + +#ifdef CONFIG_SMP + 
for_each_online_cpu(cpu) { + /* Per-cpu fields stop once fewer than 20 bytes of the page remain. */ + if (data[cpu] && len < PAGE_SIZE - 20) + len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); + } +#endif + kfree(data); + return len + sprintf(buf + len, "\n"); +} + /* Zeroes statistic si on every online cpu. */ +static void clear_stat(struct kmem_cache *s, enum stat_item si) +{ + int cpu; + + for_each_online_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; +} + /* STAT_ATTR(si, text) generates text_show() and text_store() plus the attribute itself for statistic si. A write is accepted only when it starts with the character 0, in which case the counters are cleared; anything else yields -EINVAL. */ +#define STAT_ATTR(si, text) \ +static ssize_t text##_show(struct kmem_cache *s, char *buf) \ +{ \ + return show_stat(s, buf, si); \ +} \ +static ssize_t text##_store(struct kmem_cache *s, \ + const char *buf, size_t length) \ +{ \ + if (buf[0] != '0') \ + return -EINVAL; \ + clear_stat(s, si); \ + return length; \ +} \ +SLAB_ATTR(text); \ + +STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); +STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_FASTPATH, free_fastpath); +STAT_ATTR(FREE_SLOWPATH, free_slowpath); +STAT_ATTR(FREE_FROZEN, free_frozen); +STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); +STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); +STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); +STAT_ATTR(ALLOC_SLAB, alloc_slab); +STAT_ATTR(ALLOC_REFILL, alloc_refill); +STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); +STAT_ATTR(FREE_SLAB, free_slab); +STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); +STAT_ATTR(DEACTIVATE_FULL, deactivate_full); +STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); +STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); +STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); +STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); +STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); +STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); +STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); +STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); +STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); +STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); +STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); +#endif + /* NULL-terminated list of all attributes exposed for a cache; optional groups are compiled in by the matching config option. */ +static struct attribute *slab_attrs[] = { + 
&slab_size_attr.attr, + &object_size_attr.attr, + &objs_per_slab_attr.attr, + &order_attr.attr, + &min_partial_attr.attr, + &cpu_partial_attr.attr, + &objects_attr.attr, + &objects_partial_attr.attr, + &partial_attr.attr, + &cpu_slabs_attr.attr, + &ctor_attr.attr, + &aliases_attr.attr, + &align_attr.attr, + &hwcache_align_attr.attr, + &reclaim_account_attr.attr, + &destroy_by_rcu_attr.attr, + &shrink_attr.attr, + &reserved_attr.attr, + &slabs_cpu_partial_attr.attr, +#ifdef CONFIG_SLUB_DEBUG + &total_objects_attr.attr, + &slabs_attr.attr, + &sanity_checks_attr.attr, + &trace_attr.attr, + &red_zone_attr.attr, + &poison_attr.attr, + &store_user_attr.attr, + &validate_attr.attr, + &alloc_calls_attr.attr, + &free_calls_attr.attr, +#endif +#ifdef CONFIG_ZONE_DMA + &cache_dma_attr.attr, +#endif +#ifdef CONFIG_NUMA + &remote_node_defrag_ratio_attr.attr, +#endif +#ifdef CONFIG_SLUB_STATS + &alloc_fastpath_attr.attr, + &alloc_slowpath_attr.attr, + &free_fastpath_attr.attr, + &free_slowpath_attr.attr, + &free_frozen_attr.attr, + &free_add_partial_attr.attr, + &free_remove_partial_attr.attr, + &alloc_from_partial_attr.attr, + &alloc_slab_attr.attr, + &alloc_refill_attr.attr, + &alloc_node_mismatch_attr.attr, + &free_slab_attr.attr, + &cpuslab_flush_attr.attr, + &deactivate_full_attr.attr, + &deactivate_empty_attr.attr, + &deactivate_to_head_attr.attr, + &deactivate_to_tail_attr.attr, + &deactivate_remote_frees_attr.attr, + &deactivate_bypass_attr.attr, + &order_fallback_attr.attr, + &cmpxchg_double_fail_attr.attr, + &cmpxchg_double_cpu_fail_attr.attr, + &cpu_partial_alloc_attr.attr, + &cpu_partial_free_attr.attr, + &cpu_partial_node_attr.attr, + &cpu_partial_drain_attr.attr, +#endif +#ifdef CONFIG_FAILSLAB + &failslab_attr.attr, +#endif + + NULL /* list terminator */ +}; + /* Attribute group built from slab_attrs. */ +static struct attribute_group slab_attr_group = { + .attrs = slab_attrs, +}; + /* sysfs show hook: maps the kobject and attribute back to the cache and its slab_attribute and calls the show handler, or returns -EIO when the attribute has none. */ +static ssize_t slab_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct slab_attribute *attribute; + struct 
kmem_cache *s; + int err; + + attribute = to_slab_attr(attr); + s = to_slab(kobj); + + if (!attribute->show) + return -EIO; + + err = attribute->show(s, buf); + + return err; +} + +static ssize_t slab_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct slab_attribute *attribute; + struct kmem_cache *s; + int err; + + attribute = to_slab_attr(attr); + s = to_slab(kobj); + + if (!attribute->store) + return -EIO; + + err = attribute->store(s, buf, len); +#ifdef CONFIG_MEMCG_KMEM + if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { + struct kmem_cache *c; + + mutex_lock(&slab_mutex); + if (s->max_attr_size < len) + s->max_attr_size = len; + + /* + * This is a best effort propagation, so this function's return + * value will be determined by the parent cache only. This is + * basically because not all attributes will have a well + * defined semantics for rollbacks - most of the actions will + * have permanent effects. + * + * Returning the error value of any of the children that fail + * is not 100 % defined, in the sense that users seeing the + * error code won't be able to know anything about the state of + * the cache. + * + * Only returning the error code for the parent cache at least + * has well defined semantics. The cache being written to + * directly either failed or succeeded, in which case we loop + * through the descendants with best-effort propagation. + */ + for_each_memcg_cache(c, s) + attribute->store(c, buf, len); + mutex_unlock(&slab_mutex); + } +#endif + return err; +} + +static void memcg_propagate_slab_attrs(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + int i; + char *buffer = NULL; + struct kmem_cache *root_cache; + + if (is_root_cache(s)) + return; + + root_cache = s->memcg_params.root_cache; + + /* + * This mean this cache had no attribute written. 
Therefore, no point + * in copying default values around + */ + if (!root_cache->max_attr_size) + return; + + for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { + char mbuf[64]; + char *buf; + struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + + if (!attr || !attr->store || !attr->show) + continue; + + /* + * It is really bad that we have to allocate here, so we will + * do it only as a fallback. If we actually allocate, though, + * we can just use the allocated buffer until the end. + * + * Most of the slub attributes will tend to be very small in + * size, but sysfs allows buffers up to a page, so they can + * theoretically happen. + */ + if (buffer) + buf = buffer; + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) + buf = mbuf; + else { + buffer = (char *) get_zeroed_page(GFP_KERNEL); + if (WARN_ON(!buffer)) + continue; + buf = buffer; + } + + attr->show(root_cache, buf); + attr->store(s, buf, strlen(buf)); + } + + if (buffer) + free_page((unsigned long)buffer); +#endif +} + +static void kmem_cache_release(struct kobject *k) +{ + slab_kmem_cache_release(to_slab(k)); +} + +static const struct sysfs_ops slab_sysfs_ops = { + .show = slab_attr_show, + .store = slab_attr_store, +}; + +static struct kobj_type slab_ktype = { + .sysfs_ops = &slab_sysfs_ops, + .release = kmem_cache_release, +}; + +static int uevent_filter(struct kset *kset, struct kobject *kobj) +{ + struct kobj_type *ktype = get_ktype(kobj); + + if (ktype == &slab_ktype) + return 1; + return 0; +} + +static const struct kset_uevent_ops slab_uevent_ops = { + .filter = uevent_filter, +}; + +static struct kset *slab_kset; + +static inline struct kset *cache_kset(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + return s->memcg_params.root_cache->memcg_kset; +#endif + return slab_kset; +} + +#define ID_STR_LENGTH 64 + +/* Create a unique string id for a slab cache: + * + * Format :[flags-]size + */ +static char *create_unique_id(struct kmem_cache *s) +{ + char *name 
= kmalloc(ID_STR_LENGTH, GFP_KERNEL); + char *p = name; + + BUG_ON(!name); + + *p++ = ':'; + /* + * First flags affecting slabcache operations. We will only + * get here for aliasable slabs so we do not need to support + * too many flags. The flags here must cover all flags that + * are matched during merging to guarantee that the id is + * unique. + */ + if (s->flags & SLAB_CACHE_DMA) + *p++ = 'd'; + if (s->flags & SLAB_RECLAIM_ACCOUNT) + *p++ = 'a'; + if (s->flags & SLAB_DEBUG_FREE) + *p++ = 'F'; + if (!(s->flags & SLAB_NOTRACK)) + *p++ = 't'; + if (p != name + 1) + *p++ = '-'; + p += sprintf(p, "%07d", s->size); + + BUG_ON(p > name + ID_STR_LENGTH - 1); + return name; +} + +static int sysfs_slab_add(struct kmem_cache *s) +{ + int err; + const char *name; + int unmergeable = slab_unmergeable(s); + + if (unmergeable) { + /* + * Slabcache can never be merged so we can use the name proper. + * This is typically the case for debug situations. In that + * case we can catch duplicate names easily. + */ + sysfs_remove_link(&slab_kset->kobj, s->name); + name = s->name; + } else { + /* + * Create a unique name for the slab as a target + * for the symlinks. 
+ */ + name = create_unique_id(s); + } + + s->kobj.kset = cache_kset(s); + err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); + if (err) + goto out_put_kobj; + + err = sysfs_create_group(&s->kobj, &slab_attr_group); + if (err) + goto out_del_kobj; + +#ifdef CONFIG_MEMCG_KMEM + if (is_root_cache(s)) { + s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); + if (!s->memcg_kset) { + err = -ENOMEM; + goto out_del_kobj; + } + } +#endif + + kobject_uevent(&s->kobj, KOBJ_ADD); + if (!unmergeable) { + /* Setup first alias */ + sysfs_slab_alias(s, s->name); + } +out: + if (!unmergeable) + kfree(name); + return err; +out_del_kobj: + kobject_del(&s->kobj); +out_put_kobj: + kobject_put(&s->kobj); + goto out; +} + +void sysfs_slab_remove(struct kmem_cache *s) +{ + if (slab_state < FULL) + /* + * Sysfs has not been setup yet so no need to remove the + * cache from sysfs. + */ + return; + +#ifdef CONFIG_MEMCG_KMEM + kset_unregister(s->memcg_kset); +#endif + kobject_uevent(&s->kobj, KOBJ_REMOVE); + kobject_del(&s->kobj); + kobject_put(&s->kobj); +} + +/* + * Need to buffer aliases during bootup until sysfs becomes + * available lest we lose that information. + */ +struct saved_alias { + struct kmem_cache *s; + const char *name; + struct saved_alias *next; +}; + +static struct saved_alias *alias_list; + +static int sysfs_slab_alias(struct kmem_cache *s, const char *name) +{ + struct saved_alias *al; + + if (slab_state == FULL) { + /* + * If we have a leftover link then remove it. 
+ */ + sysfs_remove_link(&slab_kset->kobj, name); + return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); + } + + al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); + if (!al) + return -ENOMEM; + + al->s = s; + al->name = name; + al->next = alias_list; + alias_list = al; + return 0; +} + +static int __init slab_sysfs_init(void) +{ + struct kmem_cache *s; + int err; + + mutex_lock(&slab_mutex); + + slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); + if (!slab_kset) { + mutex_unlock(&slab_mutex); + pr_err("Cannot register slab subsystem.\n"); + return -ENOSYS; + } + + slab_state = FULL; + + list_for_each_entry(s, &slab_caches, list) { + err = sysfs_slab_add(s); + if (err) + pr_err("SLUB: Unable to add boot slab %s to sysfs\n", + s->name); + } + + while (alias_list) { + struct saved_alias *al = alias_list; + + alias_list = alias_list->next; + err = sysfs_slab_alias(al->s, al->name); + if (err) + pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", + al->name); + kfree(al); + } + + mutex_unlock(&slab_mutex); + resiliency_test(); + return 0; +} + +__initcall(slab_sysfs_init); +#endif /* CONFIG_SYSFS */ + +/* + * The /proc/slabinfo ABI + */ +#ifdef CONFIG_SLABINFO +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) +{ + unsigned long nr_slabs = 0; + unsigned long nr_objs = 0; + unsigned long nr_free = 0; + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { + nr_slabs += node_nr_slabs(n); + nr_objs += node_nr_objs(n); + nr_free += count_partial(n, count_free); + } + + sinfo->active_objs = nr_objs - nr_free; + sinfo->num_objs = nr_objs; + sinfo->active_slabs = nr_slabs; + sinfo->num_slabs = nr_slabs; + sinfo->objects_per_slab = oo_objects(s->oo); + sinfo->cache_order = oo_order(s->oo); +} + +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) +{ +} + +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return -EIO; +} +#endif 
/* CONFIG_SLABINFO */ diff --git a/kernel/mm/sparse-vmemmap.c b/kernel/mm/sparse-vmemmap.c new file mode 100644 index 000000000..4cba9c278 --- /dev/null +++ b/kernel/mm/sparse-vmemmap.c @@ -0,0 +1,235 @@ +/* + * Virtual Memory Map support + * + * (C) 2007 sgi. Christoph Lameter. + * + * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, + * virt_to_page, page_address() to be implemented as a base offset + * calculation without memory access. + * + * However, virtual mappings need a page table and TLBs. Many Linux + * architectures already map their physical space using 1-1 mappings + * via TLBs. For those arches the virtual memory map is essentially + * for free if we use the same page size as the 1-1 mappings. In that + * case the overhead consists of a few additional pages that are + * allocated to create a view of memory for vmemmap. + * + * The architecture is expected to provide a vmemmap_populate() function + * to instantiate the mapping. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Allocate a block of memory to be used to back the virtual memory map + * or to back the page tables that are used to create the mapping. + * Uses the main allocators if they are available, else bootmem. + */ + +static void * __init_refok __earlyonly_bootmem_alloc(int node, + unsigned long size, + unsigned long align, + unsigned long goal) +{ + return memblock_virt_alloc_try_nid(size, align, goal, + BOOTMEM_ALLOC_ACCESSIBLE, node); +} + +static void *vmemmap_buf; +static void *vmemmap_buf_end; + +void * __meminit vmemmap_alloc_block(unsigned long size, int node) +{ + /* If the main allocator is up use that, fallback to bootmem. 
*/ + if (slab_is_available()) { + struct page *page; + + if (node_state(node, N_HIGH_MEMORY)) + page = alloc_pages_node( + node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + get_order(size)); + else + page = alloc_pages( + GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + get_order(size)); + if (page) + return page_address(page); + return NULL; + } else + return __earlyonly_bootmem_alloc(node, size, size, + __pa(MAX_DMA_ADDRESS)); +} + +/* need to make sure size is all the same during early stage */ +void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) +{ + void *ptr; + + if (!vmemmap_buf) + return vmemmap_alloc_block(size, node); + + /* take the from buf */ + ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); + if (ptr + size > vmemmap_buf_end) + return vmemmap_alloc_block(size, node); + + vmemmap_buf = ptr + size; + + return ptr; +} + +void __meminit vmemmap_verify(pte_t *pte, int node, + unsigned long start, unsigned long end) +{ + unsigned long pfn = pte_pfn(*pte); + int actual_node = early_pfn_to_nid(pfn); + + if (node_distance(actual_node, node) > LOCAL_DISTANCE) + printk(KERN_WARNING "[%lx-%lx] potential offnode " + "page_structs\n", start, end - 1); +} + +pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) { + pte_t entry; + void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); + if (!p) + return NULL; + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + } + return pte; +} + +pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) +{ + pmd_t *pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return NULL; + pmd_populate_kernel(&init_mm, pmd, p); + } + return pmd; +} + +pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) +{ + pud_t *pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + void 
*p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return NULL; + pud_populate(&init_mm, pud, p); + } + return pud; +} + +pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) +{ + pgd_t *pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, node); + if (!p) + return NULL; + pgd_populate(&init_mm, pgd, p); + } + return pgd; +} + +int __meminit vmemmap_populate_basepages(unsigned long start, + unsigned long end, int node) +{ + unsigned long addr = start; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (; addr < end; addr += PAGE_SIZE) { + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + pud = vmemmap_pud_populate(pgd, addr, node); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_populate(pud, addr, node); + if (!pmd) + return -ENOMEM; + pte = vmemmap_pte_populate(pmd, addr, node); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + } + + return 0; +} + +struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) +{ + unsigned long start; + unsigned long end; + struct page *map; + + map = pfn_to_page(pnum * PAGES_PER_SECTION); + start = (unsigned long)map; + end = (unsigned long)(map + PAGES_PER_SECTION); + + if (vmemmap_populate(start, end, nid)) + return NULL; + + return map; +} + +void __init sparse_mem_maps_populate_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + unsigned long pnum; + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; + void *vmemmap_buf_start; + + size = ALIGN(size, PMD_SIZE); + vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, + PMD_SIZE, __pa(MAX_DMA_ADDRESS)); + + if (vmemmap_buf_start) { + vmemmap_buf = vmemmap_buf_start; + vmemmap_buf_end = vmemmap_buf_start + size * map_count; + } + + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + struct mem_section *ms; + + if 
(!present_section_nr(pnum)) + continue; + + map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); + if (map_map[pnum]) + continue; + ms = __nr_to_section(pnum); + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + } + + if (vmemmap_buf_start) { + /* need to free left buf */ + memblock_free_early(__pa(vmemmap_buf), + vmemmap_buf_end - vmemmap_buf); + vmemmap_buf = NULL; + vmemmap_buf_end = NULL; + } +} diff --git a/kernel/mm/sparse.c b/kernel/mm/sparse.c new file mode 100644 index 000000000..d1b48b691 --- /dev/null +++ b/kernel/mm/sparse.c @@ -0,0 +1,811 @@ +/* + * sparse memory mappings. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include +#include +#include + +/* + * Permanent SPARSEMEM data: + * + * 1) mem_section - memory sections, mem_map's for valid memory + */ +#ifdef CONFIG_SPARSEMEM_EXTREME +struct mem_section *mem_section[NR_SECTION_ROOTS] + ____cacheline_internodealigned_in_smp; +#else +struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] + ____cacheline_internodealigned_in_smp; +#endif +EXPORT_SYMBOL(mem_section); + +#ifdef NODE_NOT_IN_PAGE_FLAGS +/* + * If we did not store the node number in the page then we have to + * do a lookup in the section_to_node_table in order to find which + * node the page belongs to. 
+ */ +#if MAX_NUMNODES <= 256 /* a u8 entry can hold every node id in this case */ +static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; +#else /* more than 256 possible nodes: use u16 entries */ +static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; +#endif + +int page_to_nid(const struct page *page) +{ + return section_to_node_table[page_to_section(page)]; /* table is indexed by the page's section number */ +} +EXPORT_SYMBOL(page_to_nid); + +static void set_section_nid(unsigned long section_nr, int nid) +{ + section_to_node_table[section_nr] = nid; /* nid is narrowed to the table's u8/u16 element type */ +} +#else /* !NODE_NOT_IN_PAGE_FLAGS */ +static inline void set_section_nid(unsigned long section_nr, int nid) +{ /* no section_to_node_table in this configuration: nothing to record */ +} +#endif + +#ifdef CONFIG_SPARSEMEM_EXTREME +static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) +{ + struct mem_section *section = NULL; + unsigned long array_size = SECTIONS_PER_ROOT * + sizeof(struct mem_section); /* one root's worth of mem_section entries */ + + if (slab_is_available()) { /* zeroed slab memory; node-local when node_state() reports N_HIGH_MEMORY for nid */ + if (node_state(nid, N_HIGH_MEMORY)) + section = kzalloc_node(array_size, GFP_KERNEL, nid); + else + section = kzalloc(array_size, GFP_KERNEL); + } else { /* slab not available: allocate from memblock instead */ + section = memblock_virt_alloc_node(array_size, nid); + } + + return section; /* may be NULL; sparse_index_init() checks */ +} + +static int __meminit sparse_index_init(unsigned long section_nr, int nid) +{ + unsigned long root = SECTION_NR_TO_ROOT(section_nr); + struct mem_section *section; + + if (mem_section[root]) /* this root already has its section array */ + return -EEXIST; + + section = sparse_index_alloc(nid); + if (!section) + return -ENOMEM; + + mem_section[root] = section; /* install the new array for this root */ + + return 0; +} +#else /* !SPARSEMEM_EXTREME */ +static inline int sparse_index_init(unsigned long section_nr, int nid) +{ + return 0; /* mem_section is a static 2-D array here, nothing to allocate */ +} +#endif + +/* + * Although written for the SPARSEMEM_EXTREME case, this happens + * to also work for the flat array case because + * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
+ */ +int __section_nr(struct mem_section* ms) +{ + unsigned long root_nr; + struct mem_section* root; + + for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { + root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); /* looked up by the section number of this root's first entry */ + if (!root) /* no section array behind this root */ + continue; + + if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) /* ms points into this root's array */ + break; + } + + VM_BUG_ON(root_nr == NR_SECTION_ROOTS); /* loop ended without finding ms */ + + return (root_nr * SECTIONS_PER_ROOT) + (ms - root); /* root base plus index within the root */ +} + +/* + * During early boot, before section_mem_map is used for an actual + * mem_map, we use section_mem_map to store the section's NUMA + * node. This keeps us from having to use another data structure. The + * node information is cleared just before we store the real mem_map. + */ +static inline unsigned long sparse_encode_early_nid(int nid) +{ + return (nid << SECTION_NID_SHIFT); /* NOTE(review): shift is evaluated in int before widening to unsigned long */ +} + +static inline int sparse_early_nid(struct mem_section *section) +{ + return (section->section_mem_map >> SECTION_NID_SHIFT); /* undoes the shift applied by sparse_encode_early_nid() */ +} + +/* Validate the physical addressing limitations of the model: clamps *start_pfn and *end_pfn in place and warns once when it does */ +void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn) +{ + unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); + + /* + * Sanity checks - do not allow an architecture to pass + * in larger pfns than the maximum scope of sparsemem: + */ + if (*start_pfn > max_sparsemem_pfn) { /* range starts beyond the limit: collapse it to an empty range */ + mminit_dprintk(MMINIT_WARNING, "pfnvalidation", + "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n", + *start_pfn, *end_pfn, max_sparsemem_pfn); + WARN_ON_ONCE(1); + *start_pfn = max_sparsemem_pfn; + *end_pfn = max_sparsemem_pfn; + } else if (*end_pfn > max_sparsemem_pfn) { /* only the end is beyond the limit: truncate it */ + mminit_dprintk(MMINIT_WARNING, "pfnvalidation", + "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", + *start_pfn, *end_pfn, max_sparsemem_pfn); + WARN_ON_ONCE(1); + *end_pfn = max_sparsemem_pfn; + } +} + +/* Record a memory area against a node.
+ */ +void __init memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn; + + start &= PAGE_SECTION_MASK; /* presumably rounds start down to a section boundary */ + mminit_validate_memmodel_limits(&start, &end); /* may clamp start/end in place */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { /* one iteration per section */ + unsigned long section = pfn_to_section_nr(pfn); + struct mem_section *ms; + + sparse_index_init(section, nid); /* return value is not checked here */ + set_section_nid(section, nid); + + ms = __nr_to_section(section); + if (!ms->section_mem_map) /* leave sections that were already recorded untouched */ + ms->section_mem_map = sparse_encode_early_nid(nid) | + SECTION_MARKED_PRESENT; + } +} + +/* + * Only used by the i386 NUMA architectures, but relatively + * generic code. + */ +unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + unsigned long nr_pages = 0; + + mminit_validate_memmodel_limits(&start_pfn, &end_pfn); /* works on the local copies of the bounds */ + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + if (nid != early_pfn_to_nid(pfn)) /* this pfn is reported for a different node */ + continue; + + if (pfn_present(pfn)) /* pages are counted a whole section at a time */ + nr_pages += PAGES_PER_SECTION; + } + + return nr_pages * sizeof(struct page); /* size of the struct page array for the counted pages */ +} + +/* + * Subtle, we encode the real pfn into the mem_map such that + * the identity pfn - section_mem_map will return the actual + * physical page frame number.
+ */ +static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) +{ + return (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); +} + +/* + * Decode mem_map from the coded memmap + */ +struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) +{ + /* mask off the extra low bits of information */ + coded_mem_map &= SECTION_MAP_MASK; + return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); +} + +static int __meminit sparse_init_one_section(struct mem_section *ms, + unsigned long pnum, struct page *mem_map, + unsigned long *pageblock_bitmap) +{ + if (!present_section(ms)) + return -EINVAL; + + ms->section_mem_map &= ~SECTION_MAP_MASK; + ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | + SECTION_HAS_MEM_MAP; + ms->pageblock_flags = pageblock_bitmap; + + return 1; +} + +unsigned long usemap_size(void) +{ + unsigned long size_bytes; + size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; + size_bytes = roundup(size_bytes, sizeof(unsigned long)); + return size_bytes; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static unsigned long *__kmalloc_section_usemap(void) +{ + return kmalloc(usemap_size(), GFP_KERNEL); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +#ifdef CONFIG_MEMORY_HOTREMOVE +static unsigned long * __init +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long size) +{ + unsigned long goal, limit; + unsigned long *p; + int nid; + /* + * A page may contain usemaps for other sections preventing the + * page being freed and making a section unremovable while + * other sections referencing the usemap remain active. Similarly, + * a pgdat can prevent a section being removed. If section A + * contains a pgdat and section B contains the usemap, both + * sections become inter-dependent. This allocates usemaps + * from the same section as the pgdat where possible to avoid + * this problem. 
+ */ + goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); + limit = goal + (1UL << PA_SECTION_SHIFT); + nid = early_pfn_to_nid(goal >> PAGE_SHIFT); +again: + p = memblock_virt_alloc_try_nid_nopanic(size, + SMP_CACHE_BYTES, goal, limit, + nid); + if (!p && limit) { + limit = 0; + goto again; + } + return p; +} + +static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +{ + unsigned long usemap_snr, pgdat_snr; + static unsigned long old_usemap_snr = NR_MEM_SECTIONS; + static unsigned long old_pgdat_snr = NR_MEM_SECTIONS; + struct pglist_data *pgdat = NODE_DATA(nid); + int usemap_nid; + + usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); + pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + if (usemap_snr == pgdat_snr) + return; + + if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr) + /* skip redundant message */ + return; + + old_usemap_snr = usemap_snr; + old_pgdat_snr = pgdat_snr; + + usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); + if (usemap_nid != nid) { + printk(KERN_INFO + "node %d must be removed before remove section %ld\n", + nid, usemap_snr); + return; + } + /* + * There is a circular dependency. + * Some platforms allow un-removable section because they will just + * gather other removable sections for dynamic partitioning. + * Just notify un-removable section's number here. 
+ */ + printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr, + pgdat_snr, nid); + printk(KERN_CONT + " have a circular dependency on usemap and pgdat allocations\n"); +} +#else +static unsigned long * __init +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long size) +{ + return memblock_virt_alloc_node_nopanic(size, pgdat->node_id); +} + +static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +{ +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + +static void __init sparse_early_usemaps_alloc_node(void *data, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long usemap_count, int nodeid) +{ + void *usemap; + unsigned long pnum; + unsigned long **usemap_map = (unsigned long **)data; + int size = usemap_size(); + + usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), + size * usemap_count); + if (!usemap) { + printk(KERN_WARNING "%s: allocation failed\n", __func__); + return; + } + + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = usemap; + usemap += size; + check_usemap_section_nr(nodeid, usemap_map[pnum]); + } +} + +#ifndef CONFIG_SPARSEMEM_VMEMMAP +struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) +{ + struct page *map; + unsigned long size; + + map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); + if (map) + return map; + + size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); + map = memblock_virt_alloc_try_nid(size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nid); + return map; +} +void __init sparse_mem_maps_populate_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + void *map; + unsigned long pnum; + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; + + map = alloc_remap(nodeid, size * map_count); + if (map) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 
+ if (!present_section_nr(pnum)) + continue; + map_map[pnum] = map; + map += size; + } + return; + } + + size = PAGE_ALIGN(size); + map = memblock_virt_alloc_try_nid(size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nodeid); + if (map) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = map; + map += size; + } + return; + } + + /* fallback */ + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); + if (map_map[pnum]) + continue; + ms = __nr_to_section(pnum); + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + } +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER +static void __init sparse_early_mem_maps_alloc_node(void *data, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + struct page **map_map = (struct page **)data; + sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, + map_count, nodeid); +} +#else +static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) +{ + struct page *map; + struct mem_section *ms = __nr_to_section(pnum); + int nid = sparse_early_nid(ms); + + map = sparse_mem_map_populate(pnum, nid); + if (map) + return map; + + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + return NULL; +} +#endif + +void __weak __meminit vmemmap_populate_print_last(void) +{ +} + +/** + * alloc_usemap_and_memmap - memory allocation for pageblock flags and vmemmap + * @data: usemap_map for pageblock flags or map_map for vmemmap + */ +static void __init alloc_usemap_and_memmap(void (*alloc_func) + (void *, unsigned long, unsigned long, + unsigned long,
int), void *data) +{ + unsigned long pnum; + unsigned long map_count; + int nodeid_begin = 0; + unsigned long pnum_begin = 0; + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid_begin = sparse_early_nid(ms); + pnum_begin = pnum; + break; + } + map_count = 1; + for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + int nodeid; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid = sparse_early_nid(ms); + if (nodeid == nodeid_begin) { + map_count++; + continue; + } + /* ok, we need to take care of sections pnum_begin to pnum - 1 */ + alloc_func(data, pnum_begin, pnum, + map_count, nodeid_begin); + /* new start, update count etc. */ + nodeid_begin = nodeid; + pnum_begin = pnum; + map_count = 1; + } + /* ok, last chunk */ + alloc_func(data, pnum_begin, NR_MEM_SECTIONS, + map_count, nodeid_begin); +} + +/* + * Allocate the accumulated non-linear sections, allocate a mem_map + * for each and record the physical to section mapping. + */ +void __init sparse_init(void) +{ + unsigned long pnum; + struct page *map; + unsigned long *usemap; + unsigned long **usemap_map; + int size; +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + int size2; + struct page **map_map; +#endif + + /* see include/linux/mmzone.h 'struct mem_section' definition */ + BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); + + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ + set_pageblock_order(); + + /* + * map is using big page (aka 2M in x86 64 bit) + * usemap is less than one page (aka 24 bytes) + * so alloc 2M (with 2M align) and 24 bytes in turn will + * make next 2M slip to one more 2M later. + * then in big system, the memory will have a lot of holes... + * here try to allocate 2M pages continuously.
+ * + * powerpc need to call sparse_init_one_section right after each + * sparse_early_mem_map_alloc, so allocate usemap_map at first. + */ + size = sizeof(unsigned long *) * NR_MEM_SECTIONS; + usemap_map = memblock_virt_alloc(size, 0); + if (!usemap_map) + panic("can not allocate usemap_map\n"); + alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, + (void *)usemap_map); + +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + size2 = sizeof(struct page *) * NR_MEM_SECTIONS; + map_map = memblock_virt_alloc(size2, 0); + if (!map_map) + panic("can not allocate map_map\n"); + alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, + (void *)map_map); +#endif + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + if (!present_section_nr(pnum)) + continue; + + usemap = usemap_map[pnum]; + if (!usemap) + continue; + +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + map = map_map[pnum]; +#else + map = sparse_early_mem_map_alloc(pnum); +#endif + if (!map) + continue; + + sparse_init_one_section(__nr_to_section(pnum), pnum, map, + usemap); + } + + vmemmap_populate_print_last(); + +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + memblock_free_early(__pa(map_map), size2); +#endif + memblock_free_early(__pa(usemap_map), size); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) +{ + /* This will make the necessary allocations eventually. 
*/ + return sparse_mem_map_populate(pnum, nid); +} +static void __kfree_section_memmap(struct page *memmap) +{ + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + + vmemmap_free(start, end); +} +#ifdef CONFIG_MEMORY_HOTREMOVE +static void free_map_bootmem(struct page *memmap) +{ + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + + vmemmap_free(start, end); +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ +#else +static struct page *__kmalloc_section_memmap(void) +{ + struct page *page, *ret; + unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; + + page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); + if (page) + goto got_map_page; + + ret = vmalloc(memmap_size); + if (ret) + goto got_map_ptr; + + return NULL; +got_map_page: + ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); +got_map_ptr: + + return ret; +} + +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) +{ + return __kmalloc_section_memmap(); +} + +static void __kfree_section_memmap(struct page *memmap) +{ + if (is_vmalloc_addr(memmap)) + vfree(memmap); + else + free_pages((unsigned long)memmap, + get_order(sizeof(struct page) * PAGES_PER_SECTION)); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +static void free_map_bootmem(struct page *memmap) +{ + unsigned long maps_section_nr, removing_section_nr, i; + unsigned long magic, nr_pages; + struct page *page = virt_to_page(memmap); + + nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) + >> PAGE_SHIFT; + + for (i = 0; i < nr_pages; i++, page++) { + magic = (unsigned long) page->lru.next; + + BUG_ON(magic == NODE_INFO); + + maps_section_nr = pfn_to_section_nr(page_to_pfn(page)); + removing_section_nr = page->private; + + /* + * When this function is called, the removing section is + * logical offlined state. This means all pages are isolated + * from page allocator. 
If removing section's memmap is placed + * on the same section, it must not be freed. + * If it is freed, page allocator may allocate it which will + * be removed physically soon. + */ + if (maps_section_nr != removing_section_nr) + put_page_bootmem(page); + } +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +/* + * returns the number of sections whose mem_maps were properly + * set. If this is <=0, then that means that the passed-in + * map was not consumed and must be freed. + */ +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +{ + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct pglist_data *pgdat = zone->zone_pgdat; + struct mem_section *ms; + struct page *memmap; + unsigned long *usemap; + unsigned long flags; + int ret; + + /* + * no locking for this, because it does its own + * plus, it does a kmalloc + */ + ret = sparse_index_init(section_nr, pgdat->node_id); + if (ret < 0 && ret != -EEXIST) + return ret; + memmap = kmalloc_section_memmap(section_nr, pgdat->node_id); + if (!memmap) + return -ENOMEM; + usemap = __kmalloc_section_usemap(); + if (!usemap) { + __kfree_section_memmap(memmap); + return -ENOMEM; + } + + pgdat_resize_lock(pgdat, &flags); + + ms = __pfn_to_section(start_pfn); + if (ms->section_mem_map & SECTION_MARKED_PRESENT) { + ret = -EEXIST; + goto out; + } + + memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); + + ms->section_mem_map |= SECTION_MARKED_PRESENT; + + ret = sparse_init_one_section(ms, section_nr, memmap, usemap); + +out: + pgdat_resize_unlock(pgdat, &flags); + if (ret <= 0) { + kfree(usemap); + __kfree_section_memmap(memmap); + } + return ret; +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +#ifdef CONFIG_MEMORY_FAILURE +static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +{ + int i; + + if (!memmap) + return; + + for (i = 0; i < PAGES_PER_SECTION; i++) { + if (PageHWPoison(&memmap[i])) { + atomic_long_sub(1, &num_poisoned_pages); + 
ClearPageHWPoison(&memmap[i]); + } + } +} +#else +static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +{ +} +#endif + +static void free_section_usemap(struct page *memmap, unsigned long *usemap) +{ + struct page *usemap_page; + + if (!usemap) + return; + + usemap_page = virt_to_page(usemap); + /* + * Check to see if allocation came from hot-plug-add + */ + if (PageSlab(usemap_page) || PageCompound(usemap_page)) { + kfree(usemap); + if (memmap) + __kfree_section_memmap(memmap); + return; + } + + /* + * The usemap came from bootmem. This is packed with other usemaps + * on the section which has pgdat at boot time. Just keep it as is now. + */ + + if (memmap) + free_map_bootmem(memmap); +} + +void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) +{ + struct page *memmap = NULL; + unsigned long *usemap = NULL, flags; + struct pglist_data *pgdat = zone->zone_pgdat; + + pgdat_resize_lock(pgdat, &flags); + if (ms->section_mem_map) { + usemap = ms->pageblock_flags; + memmap = sparse_decode_mem_map(ms->section_mem_map, + __section_nr(ms)); + ms->section_mem_map = 0; + ms->pageblock_flags = NULL; + } + pgdat_resize_unlock(pgdat, &flags); + + clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); + free_section_usemap(memmap, usemap); +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/kernel/mm/swap.c b/kernel/mm/swap.c new file mode 100644 index 000000000..1785ac603 --- /dev/null +++ b/kernel/mm/swap.c @@ -0,0 +1,1168 @@ +/* + * linux/mm/swap.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * This file contains the default values for the operation of the + * Linux VM subsystem. Fine-tuning documentation can be found in + * Documentation/sysctl/vm.txt. + * Started 18.12.91 + * Swap aging added 23.2.95, Stephen Tweedie. + * Buffermem limits added 12.3.98, Rik van Riel. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define CREATE_TRACE_POINTS +#include + +/* How many pages do we try to swap or page in/out together? */ +int page_cluster; + +static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); +static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); + +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock); +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock); + +/* + * This path almost never happens for VM activity - pages are normally + * freed via pagevecs. But it gets used by networking. + */ +static void __page_cache_release(struct page *page) +{ + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + struct lruvec *lruvec; + unsigned long flags; + + spin_lock_irqsave(&zone->lru_lock, flags); + lruvec = mem_cgroup_page_lruvec(page, zone); + VM_BUG_ON_PAGE(!PageLRU(page), page); + __ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, page_off_lru(page)); + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + mem_cgroup_uncharge(page); +} + +static void __put_single_page(struct page *page) +{ + __page_cache_release(page); + free_hot_cold_page(page, false); +} + +static void __put_compound_page(struct page *page) +{ + compound_page_dtor *dtor; + + /* + * __page_cache_release() is supposed to be called for thp, not for + * hugetlb. This is because a hugetlb page never has PageLRU set + * (it's never put on any LRU list) and no memcg routines should + * be called for hugetlb (it has a separate hugetlb_cgroup.) + */ + if (!PageHuge(page)) + __page_cache_release(page); + dtor = get_compound_page_dtor(page); + (*dtor)(page); +} + +/** + * Two special cases here: we could avoid taking compound_lock_irqsave + * and could skip the tail refcounting (in _mapcount). + * + * 1. 
Hugetlbfs page: + * + * PageHeadHuge will remain true until the compound page + * is released and enters the buddy allocator, and it could + * not be split by __split_huge_page_refcount(). + * + * So if we see PageHeadHuge set, and we have the tail page pin, + * then we could safely put head page. + * + * 2. Slab THP page: + * + * PG_slab is cleared before the slab frees the head page, and + * tail pin cannot be the last reference left on the head page, + * because the slab code is free to reuse the compound page + * after a kfree/kmem_cache_free without having to check if + * there's any tail pin left. In turn all tail pins must always be + * released while the head is still pinned by the slab code + * and so we know PG_slab will be still set too. + * + * So if we see PageSlab set, and we have the tail page pin, + * then we could safely put head page. + */ +static __always_inline +void put_unrefcounted_compound_page(struct page *page_head, struct page *page) +{ + /* + * If @page is a THP tail, we must read the tail page + * flags after the head page flags. The + * __split_huge_page_refcount side enforces write memory barriers + * between clearing PageTail and before the head page + * can be freed and reallocated. + */ + smp_rmb(); + if (likely(PageTail(page))) { + /* + * __split_huge_page_refcount cannot race + * here, see the comment above this function. + */ + VM_BUG_ON_PAGE(!PageHead(page_head), page_head); + VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); + if (put_page_testzero(page_head)) { + /* + * If this is the tail of a slab THP page, + * the tail pin must not be the last reference + * held on the page, because the PG_slab cannot + * be cleared before all tail pins (which skips + * the _mapcount tail refcounting) have been + * released. + * + * If this is the tail of a hugetlbfs page, + * the tail pin may be the last reference on + * the page instead, because PageHeadHuge will + * not go away until the compound page enters + * the buddy allocator. 
+ */ + VM_BUG_ON_PAGE(PageSlab(page_head), page_head); + __put_compound_page(page_head); + } + } else + /* + * __split_huge_page_refcount ran before us, + * @page was a THP tail. The split @page_head + * has been freed and reallocated as slab or + * hugetlbfs page of smaller order (only + * possible if reallocated as slab on x86). + */ + if (put_page_testzero(page)) + __put_single_page(page); +} + +static __always_inline +void put_refcounted_compound_page(struct page *page_head, struct page *page) +{ + if (likely(page != page_head && get_page_unless_zero(page_head))) { + unsigned long flags; + + /* + * @page_head wasn't a dangling pointer but it may not + * be a head page anymore by the time we obtain the + * lock. That is ok as long as it can't be freed from + * under us. + */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount ran before us */ + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) { + /* + * The @page_head may have been freed + * and reallocated as a compound page + * of smaller order and then freed + * again. All we know is that it + * cannot have become: a THP page, a + * compound page of higher order, a + * tail page. That is because we + * still hold the refcount of the + * split THP tail and page_head was + * the THP head before the split. + */ + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } +out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; + } + VM_BUG_ON_PAGE(page_head != page->first_page, page); + /* + * We can release the refcount taken by + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on the + * compound_lock. 
+ */ + if (put_page_testzero(page_head)) + VM_BUG_ON_PAGE(1, page_head); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); + atomic_dec(&page->_mapcount); + VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); + VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); + compound_unlock_irqrestore(page_head, flags); + + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } + } else { + /* @page_head is a dangling pointer */ + VM_BUG_ON_PAGE(PageTail(page), page); + goto out_put_single; + } +} + +static void put_compound_page(struct page *page) +{ + struct page *page_head; + + /* + * We see the PageCompound set and PageTail not set, so @page may be: + * 1. hugetlbfs head page, or + * 2. THP head page. + */ + if (likely(!PageTail(page))) { + if (put_page_testzero(page)) { + /* + * By the time all refcounts have been released + * split_huge_page cannot run anymore from under us. + */ + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); + } + return; + } + + /* + * We see the PageCompound set and PageTail set, so @page may be: + * 1. a tail hugetlbfs page, or + * 2. a tail THP page, or + * 3. a split THP page. + * + * Case 3 is possible, as we may race with + * __split_huge_page_refcount tearing down a THP page. + */ + page_head = compound_head_by_tail(page); + if (!__compound_tail_refcounted(page_head)) + put_unrefcounted_compound_page(page_head, page); + else + put_refcounted_compound_page(page_head, page); +} + +void put_page(struct page *page) +{ + if (unlikely(PageCompound(page))) + put_compound_page(page); + else if (put_page_testzero(page)) + __put_single_page(page); +} +EXPORT_SYMBOL(put_page); + +/* + * This function is exported but must not be called by anything other + * than get_page(). It implements the slow path of get_page(). 
+ */ +bool __get_page_tail(struct page *page) +{ + /* + * This takes care of get_page() if run on a tail page + * returned by one of the get_user_pages/follow_page variants. + * get_user_pages/follow_page itself doesn't need the compound + * lock because it runs __get_page_tail_foll() under the + * proper PT lock that already serializes against + * split_huge_page(). + */ + unsigned long flags; + bool got; + struct page *page_head = compound_head(page); + + /* Refer to the put_compound_page() comment. */ + if (!__compound_tail_refcounted(page_head)) { + smp_rmb(); + if (likely(PageTail(page))) { + /* + * This is a hugetlbfs page or a slab + * page. __split_huge_page_refcount + * cannot race here. + */ + VM_BUG_ON_PAGE(!PageHead(page_head), page_head); + __get_page_tail_foll(page, true); + return true; + } else { + /* + * __split_huge_page_refcount ran + * before us, "page" was a THP + * tail. The split page_head has been + * freed and reallocated as slab or + * hugetlbfs page of smaller order + * (only possible if reallocated as + * slab on x86). + */ + return false; + } + } + + got = false; + if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + got = true; + } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); + } + return got; +} +EXPORT_SYMBOL(__get_page_tail); + +/** + * put_pages_list() - release a list of pages + * @pages: list of pages threaded on page->lru + * + * Release a list of pages which are strung together on page.lru. Currently + * used by read_cache_pages() and related error recovery code. 
+ */ +void put_pages_list(struct list_head *pages) +{ + while (!list_empty(pages)) { + struct page *victim; + + victim = list_entry(pages->prev, struct page, lru); + list_del(&victim->lru); + page_cache_release(victim); + } +} +EXPORT_SYMBOL(put_pages_list); + +/* + * get_kernel_pages() - pin kernel pages in memory + * @kiov: An array of struct kvec structures + * @nr_segs: number of segments to pin + * @write: pinning for read/write, currently ignored + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_segs long. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_segs is 0 or negative, returns 0. Pinning stops + * with a warning at the first segment whose iov_len is not PAGE_SIZE. + * Each page returned must be released with a put_page() call. + */ +int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, + struct page **pages) +{ + int seg; + + for (seg = 0; seg < nr_segs; seg++) { + if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) + return seg; + + pages[seg] = kmap_to_page(kiov[seg].iov_base); + page_cache_get(pages[seg]); + } + + return seg; +} +EXPORT_SYMBOL_GPL(get_kernel_pages); + +/* + * get_kernel_page() - pin a kernel page in memory + * @start: starting kernel address + * @write: pinning for read/write, currently ignored + * @pages: array that receives pointer to the page pinned. + * Must have room for one entry. + * + * Returns 1 if page is pinned (the result of get_kernel_pages() for a + * single segment). The page returned must be released with a put_page() + * call when it is finished with. 
+ */ +int get_kernel_page(unsigned long start, int write, struct page **pages) +{ + const struct kvec kiov = { + .iov_base = (void *)start, + .iov_len = PAGE_SIZE + }; + + return get_kernel_pages(&kiov, 1, write, pages); +} +EXPORT_SYMBOL_GPL(get_kernel_page); + +static void pagevec_lru_move_fn(struct pagevec *pvec, + void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), + void *arg) +{ + int i; + struct zone *zone = NULL; + struct lruvec *lruvec; + unsigned long flags = 0; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = pagezone; + spin_lock_irqsave(&zone->lru_lock, flags); + } + + lruvec = mem_cgroup_page_lruvec(page, zone); + (*move_fn)(page, lruvec, arg); + } + if (zone) + spin_unlock_irqrestore(&zone->lru_lock, flags); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + +static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + int *pgmoved = arg; + + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + enum lru_list lru = page_lru_base_type(page); + list_move_tail(&page->lru, &lruvec->lists[lru]); + (*pgmoved)++; + } +} + +/* + * pagevec_move_tail() must be called with IRQ disabled; callers here wrap it + * in local_lock_irqsave(rotate_lock, ...). Otherwise this may cause nasty races. + */ +static void pagevec_move_tail(struct pagevec *pvec) +{ + int pgmoved = 0; + + pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); + __count_vm_events(PGROTATED, pgmoved); +} + +/* + * Writeback is about to end against a page which has been marked for immediate + * reclaim. If it still appears to be reclaimable, move it to the tail of the + * inactive list. 
+ */ +void rotate_reclaimable_page(struct page *page) +{ + if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && + !PageUnevictable(page) && PageLRU(page)) { + struct pagevec *pvec; + unsigned long flags; + + page_cache_get(page); + local_lock_irqsave(rotate_lock, flags); + pvec = this_cpu_ptr(&lru_rotate_pvecs); + if (!pagevec_add(pvec, page)) + pagevec_move_tail(pvec); + local_unlock_irqrestore(rotate_lock, flags); + } +} + +static void update_page_reclaim_stat(struct lruvec *lruvec, + int file, int rotated) +{ + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + + reclaim_stat->recent_scanned[file]++; + if (rotated) + reclaim_stat->recent_rotated[file]++; +} + +static void __activate_page(struct page *page, struct lruvec *lruvec, + void *arg) +{ + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + + del_page_from_lru_list(page, lruvec, lru); + SetPageActive(page); + lru += LRU_ACTIVE; + add_page_to_lru_list(page, lruvec, lru); + trace_mm_lru_activate(page); + + __count_vm_event(PGACTIVATE); + update_page_reclaim_stat(lruvec, file, 1); + } +} + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); + +static void activate_page_drain(int cpu) +{ + struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); + + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); +} + +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + +void activate_page(struct page *page) +{ + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_locked_var(swapvec_lock, + activate_page_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); + put_locked_var(swapvec_lock, activate_page_pvecs); + } +} + +#else +static inline void activate_page_drain(int cpu) 
+{ +} + +static bool need_activate_page_drain(int cpu) +{ + return false; +} + +void activate_page(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); + spin_unlock_irq(&zone->lru_lock); +} +#endif + +static void __lru_cache_activate_page(struct page *page) +{ + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec); + int i; + + /* + * Search backwards on the optimistic assumption that the page being + * activated has just been added to this pagevec. Note that only + * the local pagevec is examined as a !PageLRU page could be in the + * process of being released, reclaimed, migrated or on a remote + * pagevec that is currently being drained. Furthermore, marking + * a remote pagevec's page PageActive potentially hits a race where + * a page is marked PageActive just after it is added to the inactive + * list causing accounting errors and BUG_ON checks to trigger. + */ + for (i = pagevec_count(pvec) - 1; i >= 0; i--) { + struct page *pagevec_page = pvec->pages[i]; + + if (pagevec_page == page) { + SetPageActive(page); + break; + } + } + + put_locked_var(swapvec_lock, lru_add_pvec); +} + +/* + * Mark a page as having seen activity. + * + * inactive,unreferenced -> inactive,referenced + * inactive,referenced -> active,unreferenced + * active,unreferenced -> active,referenced + * + * When a newly allocated page is not yet visible, so safe for non-atomic ops, + * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). + */ +void mark_page_accessed(struct page *page) +{ + if (!PageActive(page) && !PageUnevictable(page) && + PageReferenced(page)) { + + /* + * If the page is on the LRU, queue it for activation via + * activate_page_pvecs. Otherwise, assume the page is on a + * pagevec, mark it active and it'll be moved to the active + * LRU on the next drain. 
+ */ + if (PageLRU(page)) + activate_page(page); + else + __lru_cache_activate_page(page); + ClearPageReferenced(page); + if (page_is_file_cache(page)) + workingset_activation(page); + } else if (!PageReferenced(page)) { + SetPageReferenced(page); + } +} +EXPORT_SYMBOL(mark_page_accessed); + +static void __lru_cache_add(struct page *page) +{ + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec); + + page_cache_get(page); + if (!pagevec_space(pvec)) + __pagevec_lru_add(pvec); + pagevec_add(pvec, page); + put_locked_var(swapvec_lock, lru_add_pvec); +} + +/** + * lru_cache_add_anon: add a page to the page lists + * @page: the page to add + */ +void lru_cache_add_anon(struct page *page) +{ + if (PageActive(page)) + ClearPageActive(page); + __lru_cache_add(page); +} + +void lru_cache_add_file(struct page *page) +{ + if (PageActive(page)) + ClearPageActive(page); + __lru_cache_add(page); +} +EXPORT_SYMBOL(lru_cache_add_file); + +/** + * lru_cache_add - add a page to a page list + * @page: the page to be added to the LRU. + * + * Queue the page for addition to the LRU via pagevec. The decision on whether + * to add the page to the [in]active [file|anon] list is deferred until the + * pagevec is drained. This gives a chance for the caller of lru_cache_add() + * to have the page added to the active list using mark_page_accessed(). + */ +void lru_cache_add(struct page *page) +{ + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + __lru_cache_add(page); +} + +/** + * add_page_to_unevictable_list - add a page to the unevictable list + * @page: the page to be added to the unevictable list + * + * Add page directly to its zone's unevictable list. To avoid races with + * tasks that might be making the page evictable, through eg. munlock, + * munmap or exit, while it's not on the lru, we want to add the page + * while it's locked or otherwise "invisible" to other tasks. 
This is + * difficult to do when using the pagevec cache, so bypass that. + */ +void add_page_to_unevictable_list(struct page *page) +{ + struct zone *zone = page_zone(page); + struct lruvec *lruvec; + + spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, zone); + ClearPageActive(page); + SetPageUnevictable(page); + SetPageLRU(page); + add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); + spin_unlock_irq(&zone->lru_lock); +} + +/** + * lru_cache_add_active_or_unevictable + * @page: the page to be added to LRU + * @vma: vma in which page is mapped for determining reclaimability + * + * Place @page on the active or unevictable LRU list, depending on its + * evictability. Note that if the page is not evictable, it goes + * directly back onto its zone's unevictable list, it does NOT use a + * per cpu pagevec. + */ +void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { + SetPageActive(page); + lru_cache_add(page); + return; + } + + if (!TestSetPageMlocked(page)) { + /* + * We use the irq-unsafe __mod_zone_page_state because this + * counter is not modified from interrupt context, and the pte + * lock is held(spinlock), which implies preemption disabled. + */ + __mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + add_page_to_unevictable_list(page); +} + +/* + * If the page can not be invalidated, it is moved to the + * inactive list to speed up its reclaim. It is moved to the + * head of the list, rather than the tail, to give the flusher + * threads some time to write it out, as this is much more + * effective than the single-page writeout from reclaim. + * + * If the page isn't page_mapped and dirty/writeback, the page + * could reclaim asap using PG_reclaim. + * + * 1. active, mapped page -> none + * 2. 
active, dirty/writeback page -> inactive, head, PG_reclaim + * 3. inactive, mapped page -> none + * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim + * 5. inactive, clean -> inactive, tail + * 6. Others -> none + * + * In 4, the page goes to inactive's head because the VM expects it to + * be written out by flusher threads as this is much more effective + * than the single-page writeout from reclaim. + */ +static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + int lru, file; + bool active; + + if (!PageLRU(page)) + return; + + if (PageUnevictable(page)) + return; + + /* Some processes are using the page */ + if (page_mapped(page)) + return; + + active = PageActive(page); + file = page_is_file_cache(page); + lru = page_lru_base_type(page); + + del_page_from_lru_list(page, lruvec, lru + active); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(page, lruvec, lru); + + if (PageWriteback(page) || PageDirty(page)) { + /* + * PG_reclaim could be raced with end_page_writeback + * It can make readahead confusing. But race window + * is _really_ small and it's a non-critical problem. + */ + SetPageReclaim(page); + } else { + /* + * The page's writeback ended while it sat in the pagevec. + * We move the page to the tail of the inactive list. + */ + list_move_tail(&page->lru, &lruvec->lists[lru]); + __count_vm_event(PGROTATED); + } + + if (active) + __count_vm_event(PGDEACTIVATE); + update_page_reclaim_stat(lruvec, file, 0); +} + +/* + * Drain pages out of the cpu's pagevecs. + * Either "cpu" is the current CPU, and preemption has already been + * disabled; or "cpu" is being hot-unplugged, and is already dead. 
+ */ +void lru_add_drain_cpu(int cpu) +{ + struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); + + if (pagevec_count(pvec)) + __pagevec_lru_add(pvec); + + pvec = &per_cpu(lru_rotate_pvecs, cpu); + if (pagevec_count(pvec)) { + unsigned long flags; + + /* No harm done if a racing interrupt already did this */ + local_lock_irqsave(rotate_lock, flags); + pagevec_move_tail(pvec); + local_unlock_irqrestore(rotate_lock, flags); + } + + pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + + activate_page_drain(cpu); +} + +/** + * deactivate_file_page - forcefully deactivate a file page + * @page: page to deactivate + * + * This function hints the VM that @page is a good reclaim candidate, + * for example if its invalidation fails due to the page being dirty + * or under writeback. + */ +void deactivate_file_page(struct page *page) +{ + /* + * In a workload with many unevictable pages such as mprotect, + * unevictable page deactivation for accelerating reclaim is pointless. 
+ */ + if (PageUnevictable(page)) + return; + + if (likely(get_page_unless_zero(page))) { + struct pagevec *pvec = &get_locked_var(swapvec_lock, + lru_deactivate_file_pvecs); + + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + put_locked_var(swapvec_lock, lru_deactivate_file_pvecs); + } +} + +void lru_add_drain(void) +{ + lru_add_drain_cpu(local_lock_cpu(swapvec_lock)); + local_unlock_cpu(swapvec_lock); +} + +static void lru_add_drain_per_cpu(struct work_struct *dummy) +{ + lru_add_drain(); +} + +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); + +void lru_add_drain_all(void) +{ + static DEFINE_MUTEX(lock); + static struct cpumask has_work; + int cpu; + + mutex_lock(&lock); + get_online_cpus(); + cpumask_clear(&has_work); + + for_each_online_cpu(cpu) { + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); + + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || + need_activate_page_drain(cpu)) { + INIT_WORK(work, lru_add_drain_per_cpu); + schedule_work_on(cpu, work); + cpumask_set_cpu(cpu, &has_work); + } + } + + for_each_cpu(cpu, &has_work) + flush_work(&per_cpu(lru_add_drain_work, cpu)); + + put_online_cpus(); + mutex_unlock(&lock); +} + +/** + * release_pages - batched page_cache_release() + * @pages: array of pages to release + * @nr: number of pages + * @cold: whether the pages are cache cold + * + * Decrement the reference count on all the pages in @pages. If it + * fell to zero, remove the page from the LRU and free it. 
+ */ +void release_pages(struct page **pages, int nr, bool cold) +{ + int i; + LIST_HEAD(pages_to_free); + struct zone *zone = NULL; + struct lruvec *lruvec; + unsigned long uninitialized_var(flags); + unsigned int uninitialized_var(lock_batch); + + for (i = 0; i < nr; i++) { + struct page *page = pages[i]; + + if (unlikely(PageCompound(page))) { + if (zone) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + put_compound_page(page); + continue; + } + + /* + * Make sure the IRQ-safe lock-holding time does not get + * excessive with a continuous string of pages from the + * same zone. The lock is held only if zone != NULL. + */ + if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + + if (!put_page_testzero(page)) + continue; + + if (PageLRU(page)) { + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock_irqrestore(&zone->lru_lock, + flags); + lock_batch = 0; + zone = pagezone; + spin_lock_irqsave(&zone->lru_lock, flags); + } + + lruvec = mem_cgroup_page_lruvec(page, zone); + VM_BUG_ON_PAGE(!PageLRU(page), page); + __ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, page_off_lru(page)); + } + + /* Clear Active bit in case of a parallel mark_page_accessed() */ + __ClearPageActive(page); + + list_add(&page->lru, &pages_to_free); + } + if (zone) + spin_unlock_irqrestore(&zone->lru_lock, flags); + + mem_cgroup_uncharge_list(&pages_to_free); + free_hot_cold_page_list(&pages_to_free, cold); +} +EXPORT_SYMBOL(release_pages); + +/* + * The pages which we're about to release may be in the deferred lru-addition + * queues. That would prevent them from really being freed right now. That's + * OK from a correctness point of view but is inefficient - those pages may be + * cache-warm and we want to give them back to the page allocator ASAP. + * + * So __pagevec_release() will drain those queues here. 
__pagevec_lru_add() + * reaches release_pages() via pagevec_lru_move_fn() directly to avoid + * mutual recursion. + */ +void __pagevec_release(struct pagevec *pvec) +{ + lru_add_drain(); + release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); + pagevec_reinit(pvec); +} +EXPORT_SYMBOL(__pagevec_release); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* used by __split_huge_page_refcount() */ +void lru_add_page_tail(struct page *page, struct page *page_tail, + struct lruvec *lruvec, struct list_head *list) +{ + const int file = 0; + + VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_PAGE(PageCompound(page_tail), page); + VM_BUG_ON_PAGE(PageLRU(page_tail), page); + VM_BUG_ON(NR_CPUS != 1 && + !spin_is_locked(&lruvec_zone(lruvec)->lru_lock)); + + if (!list) + SetPageLRU(page_tail); + + if (likely(PageLRU(page))) + list_add_tail(&page_tail->lru, &page->lru); + else if (list) { + /* page reclaim is reclaiming a huge page */ + get_page(page_tail); + list_add_tail(&page_tail->lru, list); + } else { + struct list_head *list_head; + /* + * Head page has not yet been counted, as an hpage, + * so we must account for each subpage individually. + * + * Use the standard add function to put page_tail on the list, + * but then correct its position so they all end up in order. 
+ */ + add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); + list_head = page_tail->lru.prev; + list_move_tail(&page_tail->lru, list_head); + } + + if (!PageUnevictable(page)) + update_page_reclaim_stat(lruvec, file, PageActive(page_tail)); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + int file = page_is_file_cache(page); + int active = PageActive(page); + enum lru_list lru = page_lru(page); + + VM_BUG_ON_PAGE(PageLRU(page), page); + + SetPageLRU(page); + add_page_to_lru_list(page, lruvec, lru); + update_page_reclaim_stat(lruvec, file, active); + trace_mm_lru_insertion(page, lru); +} + +/* + * Add the passed pages to the LRU, then drop the caller's refcount + * on them. Reinitialises the caller's pagevec. + */ +void __pagevec_lru_add(struct pagevec *pvec) +{ + pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); +} +EXPORT_SYMBOL(__pagevec_lru_add); + +/** + * pagevec_lookup_entries - gang pagecache lookup + * @pvec: Where the resulting entries are placed + * @mapping: The address_space to search + * @start: The starting entry index + * @nr_pages: The maximum number of entries + * @indices: The cache indices corresponding to the entries in @pvec + * + * pagevec_lookup_entries() will search for and return a group of up + * to @nr_pages pages and shadow entries in the mapping. All + * entries are placed in @pvec. pagevec_lookup_entries() takes a + * reference against actual pages in @pvec. + * + * The search returns a group of mapping-contiguous entries with + * ascending indexes. There may be holes in the indices due to + * not-present entries. + * + * pagevec_lookup_entries() returns the number of entries which were + * found. 
+ */ +unsigned pagevec_lookup_entries(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t start, unsigned nr_pages, + pgoff_t *indices) +{ + pvec->nr = find_get_entries(mapping, start, nr_pages, + pvec->pages, indices); + return pagevec_count(pvec); +} + +/** + * pagevec_remove_exceptionals - prune exceptional entries from a pagevec + * @pvec: The pagevec to prune + * + * pagevec_lookup_entries() fills both pages and exceptional radix + * tree entries into the pagevec. This function prunes all + * exceptionals from @pvec without leaving holes, so that it can be + * passed on to page-only pagevec operations. + */ +void pagevec_remove_exceptionals(struct pagevec *pvec) +{ + int i, j; + + for (i = 0, j = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + if (!radix_tree_exceptional_entry(page)) + pvec->pages[j++] = page; + } + pvec->nr = j; +} + +/** + * pagevec_lookup - gang pagecache lookup + * @pvec: Where the resulting pages are placed + * @mapping: The address_space to search + * @start: The starting page index + * @nr_pages: The maximum number of pages + * + * pagevec_lookup() will search for and return a group of up to @nr_pages pages + * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a + * reference against the pages in @pvec. + * + * The search returns a group of mapping-contiguous pages with ascending + * indexes. There may be holes in the indices due to not-present pages. + * + * pagevec_lookup() returns the number of pages which were found. 
+ */ +unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, + pgoff_t start, unsigned nr_pages) +{ + pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); + return pagevec_count(pvec); +} +EXPORT_SYMBOL(pagevec_lookup); + +unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, + pgoff_t *index, int tag, unsigned nr_pages) +{ + pvec->nr = find_get_pages_tag(mapping, index, tag, + nr_pages, pvec->pages); + return pagevec_count(pvec); +} +EXPORT_SYMBOL(pagevec_lookup_tag); + +/* + * Perform any setup for the swap system + */ +void __init swap_setup(void) +{ + unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); +#ifdef CONFIG_SWAP + int i; + + for (i = 0; i < MAX_SWAPFILES; i++) + spin_lock_init(&swapper_spaces[i].tree_lock); +#endif + + /* Use a smaller cluster for small-memory machines */ + if (megs < 16) + page_cluster = 2; + else + page_cluster = 3; + /* + * Right now other parts of the system mean that we + * _really_ don't want to cluster much more + */ +} diff --git a/kernel/mm/swap_cgroup.c b/kernel/mm/swap_cgroup.c new file mode 100644 index 000000000..b5f7f24b8 --- /dev/null +++ b/kernel/mm/swap_cgroup.c @@ -0,0 +1,208 @@ +#include +#include +#include + +#include /* depends on mm.h include */ + +static DEFINE_MUTEX(swap_cgroup_mutex); +struct swap_cgroup_ctrl { + struct page **map; + unsigned long length; + spinlock_t lock; +}; + +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; + +struct swap_cgroup { + unsigned short id; +}; +#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) + +/* + * SwapCgroup implements "lookup" and "exchange" operations. + * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge + * against SwapCache. At swap_free(), this is accessed directly from swap. + * + * This means, + * - we have no race in "exchange" when we're accessed via SwapCache because + * SwapCache(and its swp_entry) is under lock. 
+ * - When called via swap_free(), there is no user of this entry and no race. + * Then, we don't need lock around "exchange". + * + * TODO: we can push these buffers out to HIGHMEM. + */ + +/* + * allocate buffer for swap_cgroup. + */ +static int swap_cgroup_prepare(int type) +{ + struct page *page; + struct swap_cgroup_ctrl *ctrl; + unsigned long idx, max; + + ctrl = &swap_cgroup_ctrl[type]; + + for (idx = 0; idx < ctrl->length; idx++) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto not_enough_page; + ctrl->map[idx] = page; + } + return 0; +not_enough_page: + max = idx; + for (idx = 0; idx < max; idx++) + __free_page(ctrl->map[idx]); + + return -ENOMEM; +} + +static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, + struct swap_cgroup_ctrl **ctrlp) +{ + pgoff_t offset = swp_offset(ent); + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + struct swap_cgroup *sc; + + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + if (ctrlp) + *ctrlp = ctrl; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; +} + +/** + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. + * @ent: swap entry to be cmpxchged + * @old: old id + * @new: new id + * + * Returns old id at success, 0 at failure. + * (There is no mem_cgroup using 0 as its id) + */ +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, + unsigned short old, unsigned short new) +{ + struct swap_cgroup_ctrl *ctrl; + struct swap_cgroup *sc; + unsigned long flags; + unsigned short retval; + + sc = lookup_swap_cgroup(ent, &ctrl); + + spin_lock_irqsave(&ctrl->lock, flags); + retval = sc->id; + if (retval == old) + sc->id = new; + else + retval = 0; + spin_unlock_irqrestore(&ctrl->lock, flags); + return retval; +} + +/** + * swap_cgroup_record - record mem_cgroup for this swp_entry. + * @ent: swap entry to be recorded into + * @id: mem_cgroup to be recorded + * + * Returns old value at success, 0 at failure. 
+ * (Of course, old value can be 0.) + */ +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +{ + struct swap_cgroup_ctrl *ctrl; + struct swap_cgroup *sc; + unsigned short old; + unsigned long flags; + + sc = lookup_swap_cgroup(ent, &ctrl); + + spin_lock_irqsave(&ctrl->lock, flags); + old = sc->id; + sc->id = id; + spin_unlock_irqrestore(&ctrl->lock, flags); + + return old; +} + +/** + * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry + * @ent: swap entry to be looked up. + * + * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) + */ +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) +{ + return lookup_swap_cgroup(ent, NULL)->id; +} + +int swap_cgroup_swapon(int type, unsigned long max_pages) +{ + void *array; + unsigned long array_size; + unsigned long length; + struct swap_cgroup_ctrl *ctrl; + + if (!do_swap_account) + return 0; + + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); + array_size = length * sizeof(void *); + + array = vzalloc(array_size); + if (!array) + goto nomem; + + ctrl = &swap_cgroup_ctrl[type]; + mutex_lock(&swap_cgroup_mutex); + ctrl->length = length; + ctrl->map = array; + spin_lock_init(&ctrl->lock); + if (swap_cgroup_prepare(type)) { + /* memory shortage */ + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + vfree(array); + goto nomem; + } + mutex_unlock(&swap_cgroup_mutex); + + return 0; +nomem: + printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); + printk(KERN_INFO + "swap_cgroup can be disabled by swapaccount=0 boot option\n"); + return -ENOMEM; +} + +void swap_cgroup_swapoff(int type) +{ + struct page **map; + unsigned long i, length; + struct swap_cgroup_ctrl *ctrl; + + if (!do_swap_account) + return; + + mutex_lock(&swap_cgroup_mutex); + ctrl = &swap_cgroup_ctrl[type]; + map = ctrl->map; + length = ctrl->length; + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + + if (map) { + for (i = 0; i < length; 
i++) { + struct page *page = map[i]; + if (page) + __free_page(page); + } + vfree(map); + } +} diff --git a/kernel/mm/swap_state.c b/kernel/mm/swap_state.c new file mode 100644 index 000000000..8bc8e6613 --- /dev/null +++ b/kernel/mm/swap_state.c @@ -0,0 +1,486 @@ +/* + * linux/mm/swap_state.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * + * Rewritten to use page cache, (C) 1998 Stephen Tweedie + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * swapper_space is a fiction, retained to simplify the path through + * vmscan's shrink_page_list. + */ +static const struct address_space_operations swap_aops = { + .writepage = swap_writepage, + .set_page_dirty = swap_set_page_dirty, +#ifdef CONFIG_MIGRATION + .migratepage = migrate_page, +#endif +}; + +struct address_space swapper_spaces[MAX_SWAPFILES] = { + [0 ... MAX_SWAPFILES - 1] = { + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .i_mmap_writable = ATOMIC_INIT(0), + .a_ops = &swap_aops, + } +}; + +#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) + +static struct { + unsigned long add_total; + unsigned long del_total; + unsigned long find_success; + unsigned long find_total; +} swap_cache_info; + +unsigned long total_swapcache_pages(void) +{ + int i; + unsigned long ret = 0; + + for (i = 0; i < MAX_SWAPFILES; i++) + ret += swapper_spaces[i].nrpages; + return ret; +} + +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); + +void show_swap_cache_info(void) +{ + printk("%lu pages in swap cache\n", total_swapcache_pages()); + printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", + swap_cache_info.add_total, swap_cache_info.del_total, + swap_cache_info.find_success, swap_cache_info.find_total); + printk("Free swap = %ldkB\n", + get_nr_swap_pages() << (PAGE_SHIFT - 10)); + printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 
10)); +} + +/* + * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * but sets SwapCache flag and private instead of mapping and index. + */ +int __add_to_swap_cache(struct page *page, swp_entry_t entry) +{ + int error; + struct address_space *address_space; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageSwapCache(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + + page_cache_get(page); + SetPageSwapCache(page); + set_page_private(page, entry.val); + + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); + error = radix_tree_insert(&address_space->page_tree, + entry.val, page); + if (likely(!error)) { + address_space->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + INC_CACHE_INFO(add_total); + } + spin_unlock_irq(&address_space->tree_lock); + + if (unlikely(error)) { + /* + * Only the context which have set SWAP_HAS_CACHE flag + * would call add_to_swap_cache(). + * So add_to_swap_cache() doesn't returns -EEXIST. + */ + VM_BUG_ON(error == -EEXIST); + set_page_private(page, 0UL); + ClearPageSwapCache(page); + page_cache_release(page); + } + + return error; +} + + +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) +{ + int error; + + error = radix_tree_maybe_preload(gfp_mask); + if (!error) { + error = __add_to_swap_cache(page, entry); + radix_tree_preload_end(); + } + return error; +} + +/* + * This must be called only on pages that have + * been verified to be in the swap cache. 
+ */ +void __delete_from_swap_cache(struct page *page) +{ + swp_entry_t entry; + struct address_space *address_space; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + VM_BUG_ON_PAGE(PageWriteback(page), page); + + entry.val = page_private(page); + address_space = swap_address_space(entry); + radix_tree_delete(&address_space->page_tree, page_private(page)); + set_page_private(page, 0); + ClearPageSwapCache(page); + address_space->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); + INC_CACHE_INFO(del_total); +} + +/** + * add_to_swap - allocate swap space for a page + * @page: page we want to move to swap + * + * Allocate swap space for the page and add the page to the + * swap cache. Caller needs to hold the page lock. + */ +int add_to_swap(struct page *page, struct list_head *list) +{ + swp_entry_t entry; + int err; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageUptodate(page), page); + + entry = get_swap_page(); + if (!entry.val) + return 0; + + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page_to_list(page, list))) { + swapcache_free(entry); + return 0; + } + + /* + * Radix-tree node allocations from PF_MEMALLOC contexts could + * completely exhaust the page allocator. __GFP_NOMEMALLOC + * stops emergency reserves from being allocated. + * + * TODO: this could cause a theoretical memory reclaim + * deadlock in the swap out path. + */ + /* + * Add it to the swap cache and mark it dirty + */ + err = add_to_swap_cache(page, entry, + __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); + + if (!err) { /* Success */ + SetPageDirty(page); + return 1; + } else { /* -ENOMEM radix-tree allocation failure */ + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ + swapcache_free(entry); + return 0; + } +} + +/* + * This must be called only on pages that have + * been verified to be in the swap cache and locked. 
+ * It will never put the page into the free list, + * the caller has a reference on the page. + */ +void delete_from_swap_cache(struct page *page) +{ + swp_entry_t entry; + struct address_space *address_space; + + entry.val = page_private(page); + + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); + __delete_from_swap_cache(page); + spin_unlock_irq(&address_space->tree_lock); + + swapcache_free(entry); + page_cache_release(page); +} + +/* + * If we are the only user, then try to free up the swap cache. + * + * Its ok to check for PageSwapCache without the page lock + * here because we are going to recheck again inside + * try_to_free_swap() _with_ the lock. + * - Marcelo + */ +static inline void free_swap_cache(struct page *page) +{ + if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { + try_to_free_swap(page); + unlock_page(page); + } +} + +/* + * Perform a free_page(), also freeing any swap cache associated with + * this page if it is the last user of the page. + */ +void free_page_and_swap_cache(struct page *page) +{ + free_swap_cache(page); + page_cache_release(page); +} + +/* + * Passed an array of pages, drop them all from swapcache and then release + * them. They are removed from the LRU and freed if this is their last use. + */ +void free_pages_and_swap_cache(struct page **pages, int nr) +{ + struct page **pagep = pages; + int i; + + lru_add_drain(); + for (i = 0; i < nr; i++) + free_swap_cache(pagep[i]); + release_pages(pagep, nr, false); +} + +/* + * Lookup a swap entry in the swap cache. A found page will be returned + * unlocked and with its refcount incremented - we rely on the kernel + * lock getting page table operations atomic even if we drop the page + * lock before returning. 
/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 *
 * @entry:    swap entry to look up or read in
 * @gfp_mask: allocation flags for the new page; masked with GFP_KERNEL
 *            for the radix tree preload
 * @vma:      passed to alloc_page_vma() together with @addr
 * @addr:     passed to alloc_page_vma() together with @vma
 *
 * Returns the page with a reference held: either the page already found in
 * the swap cache, or a freshly allocated, locked page on which
 * swap_readpage() has been started.  Returns NULL on failure, which means
 * that either an allocation failed or that the swap entry is no longer in
 * use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *found_page, *new_page = NULL;
	int err;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swap_address_space(entry),
					entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.  The page is kept
		 * across loop iterations, so it is allocated at most once.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * Call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {
			radix_tree_preload_end();
			/*
			 * We might race against get_swap_page() and stumble
			 * across a SWAP_HAS_CACHE swap_map entry whose page
			 * has not been brought into the swapcache yet, while
			 * the other end is scheduled away waiting on discard
			 * I/O completion at scan_swap_map().
			 *
			 * In order to avoid turning this transitory state
			 * into a permanent loop around this -EEXIST case
			 * if !CONFIG_PREEMPT and the I/O completion happens
			 * to be waiting on the CPU waitqueue where we are now
			 * busy looping, we just conditionally invoke the
			 * scheduler here, if there are some more important
			 * tasks to run.
			 *
			 * err is -EEXIST here, so the loop condition below
			 * holds and the lookup is retried.
			 */
			cond_resched();
			continue;
		}
		if (err) {		/* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__set_page_locked(new_page);
		SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_anon(new_page);
			swap_readpage(new_page);
			return new_page;
		}
		/* Insertion failed: undo the page state set up above. */
		radix_tree_preload_end();
		ClearPageSwapBacked(new_page);
		__clear_page_locked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		swapcache_free(entry);
	} while (err != -ENOMEM);

	/* Drop the page we allocated but did not end up using. */
	if (new_page)
		page_cache_release(new_page);
	return found_page;
}
+ */ + if (offset != prev_offset + 1 && offset != prev_offset - 1) + pages = 1; + prev_offset = offset; + } else { + unsigned int roundup = 4; + while (roundup < pages) + roundup <<= 1; + pages = roundup; + } + + if (pages > max_pages) + pages = max_pages; + + /* Don't shrink readahead too fast */ + last_ra = atomic_read(&last_readahead_pages) / 2; + if (pages < last_ra) + pages = last_ra; + atomic_set(&last_readahead_pages, pages); + + return pages; +} + +/** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags + * @vma: user vma this address belongs to + * @addr: target address for mempolicy + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + * + * This has been extended to use the NUMA policies from the mm triggering + * the readahead. + * + * Caller must hold down_read on the vma->vm_mm if vma is not NULL. + */ +struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr) +{ + struct page *page; + unsigned long entry_offset = swp_offset(entry); + unsigned long offset = entry_offset; + unsigned long start_offset, end_offset; + unsigned long mask; + struct blk_plug plug; + + mask = swapin_nr_pages(offset) - 1; + if (!mask) + goto skip; + + /* Read a page_cluster sized and aligned cluster around offset. */ + start_offset = offset & ~mask; + end_offset = offset | mask; + if (!start_offset) /* First page is swap header. 
*/ + start_offset++; + + blk_start_plug(&plug); + for (offset = start_offset; offset <= end_offset ; offset++) { + /* Ok, do the async read-ahead now */ + page = read_swap_cache_async(swp_entry(swp_type(entry), offset), + gfp_mask, vma, addr); + if (!page) + continue; + if (offset != entry_offset) + SetPageReadahead(page); + page_cache_release(page); + } + blk_finish_plug(&plug); + + lru_add_drain(); /* Push any new pages onto the LRU now */ +skip: + return read_swap_cache_async(entry, gfp_mask, vma, addr); +} diff --git a/kernel/mm/swapfile.c b/kernel/mm/swapfile.c new file mode 100644 index 000000000..a7e72103f --- /dev/null +++ b/kernel/mm/swapfile.c @@ -0,0 +1,2940 @@ +/* + * linux/mm/swapfile.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); +static void free_swap_count_continuations(struct swap_info_struct *); +static sector_t map_swap_entry(swp_entry_t, struct block_device**); + +DEFINE_SPINLOCK(swap_lock); +static unsigned int nr_swapfiles; +atomic_long_t nr_swap_pages; +/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ +long total_swap_pages; +static int least_priority; + +static const char Bad_file[] = "Bad swap file entry "; +static const char Unused_file[] = "Unused swap file entry "; +static const char Bad_offset[] = "Bad swap offset entry "; +static const char Unused_offset[] = "Unused swap offset entry "; + +/* + * all active swap_info_structs + * protected with swap_lock, and ordered by priority. 
/*
 * Return the swap_map byte @ent with the SWAP_HAS_CACHE bit masked off.
 * All other bits are passed through unchanged, so the result may still
 * include the SWAP_HAS_CONT flag.
 */
static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
}
+ */ + if (trylock_page(page)) { + ret = try_to_free_swap(page); + unlock_page(page); + } + page_cache_release(page); + return ret; +} + +/* + * swapon tell device that all the old swap contents can be discarded, + * to allow the swap device to optimize its wear-levelling. + */ +static int discard_swap(struct swap_info_struct *si) +{ + struct swap_extent *se; + sector_t start_block; + sector_t nr_blocks; + int err = 0; + + /* Do not discard the swap header page! */ + se = &si->first_swap_extent; + start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); + nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); + if (nr_blocks) { + err = blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_KERNEL, 0); + if (err) + return err; + cond_resched(); + } + + list_for_each_entry(se, &si->first_swap_extent.list, list) { + start_block = se->start_block << (PAGE_SHIFT - 9); + nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); + + err = blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_KERNEL, 0); + if (err) + break; + + cond_resched(); + } + return err; /* That will often be -EOPNOTSUPP */ +} + +/* + * swap allocation tell device that a cluster of swap can now be discarded, + * to allow the swap device to optimize its wear-levelling. 
/*
 * Accessors for struct swap_cluster_info.
 *
 * The ->data field is used in two ways: cluster_count()/cluster_set_count()
 * treat it as a usage counter, while cluster_next()/cluster_set_next()
 * treat it as the index of another cluster.  Both pairs read and write the
 * very same field, so the caller decides which meaning is current.
 */

/* Replace the whole ->flags value with @flag (assignment, not OR). */
static inline void cluster_set_flag(struct swap_cluster_info *info,
	unsigned int flag)
{
	info->flags = flag;
}

/* Read ->data as a usage count. */
static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
	return info->data;
}

/* Store usage count @c in ->data; ->flags is left untouched. */
static inline void cluster_set_count(struct swap_cluster_info *info,
				     unsigned int c)
{
	info->data = c;
}

/* Set both ->flags (to @f) and the usage count in ->data (to @c). */
static inline void cluster_set_count_flag(struct swap_cluster_info *info,
					 unsigned int c, unsigned int f)
{
	info->flags = f;
	info->data = c;
}

/* Read ->data as the index of the next cluster. */
static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
	return info->data;
}

/* Store next-cluster index @n in ->data; ->flags is left untouched. */
static inline void cluster_set_next(struct swap_cluster_info *info,
				    unsigned int n)
{
	info->data = n;
}

/* Set both ->flags (to @f) and the next-cluster index in ->data (to @n). */
static inline void cluster_set_next_flag(struct swap_cluster_info *info,
					 unsigned int n, unsigned int f)
{
	info->flags = f;
	info->data = n;
}

/* True when CLUSTER_FLAG_FREE is set in ->flags. */
static inline bool cluster_is_free(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_FREE;
}
CLUSTER_FLAG_NEXT_NULL; +} + +static inline void cluster_set_null(struct swap_cluster_info *info) +{ + info->flags = CLUSTER_FLAG_NEXT_NULL; + info->data = 0; +} + +/* Add a cluster to discard list and schedule it to do discard */ +static void swap_cluster_schedule_discard(struct swap_info_struct *si, + unsigned int idx) +{ + /* + * If scan_swap_map() can't find a free cluster, it will check + * si->swap_map directly. To make sure the discarding cluster isn't + * taken by scan_swap_map(), mark the swap entries bad (occupied). It + * will be cleared after discard + */ + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + + if (cluster_is_null(&si->discard_cluster_head)) { + cluster_set_next_flag(&si->discard_cluster_head, + idx, 0); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } else { + unsigned int tail = cluster_next(&si->discard_cluster_tail); + cluster_set_next(&si->cluster_info[tail], idx); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } + + schedule_work(&si->discard_work); +} + +/* + * Doing discard actually. After a cluster discard is finished, the cluster + * will be added to free cluster list. caller should hold si->lock. 
+*/ +static void swap_do_scheduled_discard(struct swap_info_struct *si) +{ + struct swap_cluster_info *info; + unsigned int idx; + + info = si->cluster_info; + + while (!cluster_is_null(&si->discard_cluster_head)) { + idx = cluster_next(&si->discard_cluster_head); + + cluster_set_next_flag(&si->discard_cluster_head, + cluster_next(&info[idx]), 0); + if (cluster_next(&si->discard_cluster_tail) == idx) { + cluster_set_null(&si->discard_cluster_head); + cluster_set_null(&si->discard_cluster_tail); + } + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); + cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&si->free_cluster_head)) { + cluster_set_next_flag(&si->free_cluster_head, + idx, 0); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&si->free_cluster_tail); + cluster_set_next(&info[tail], idx); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); + } +} + +static void swap_discard_work(struct work_struct *work) +{ + struct swap_info_struct *si; + + si = container_of(work, struct swap_info_struct, discard_work); + + spin_lock(&si->lock); + swap_do_scheduled_discard(si); + spin_unlock(&si->lock); +} + +/* + * The cluster corresponding to page_nr will be used. The cluster will be + * removed from free cluster list and its usage counter will be increased. 
+ */ +static void inc_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + if (cluster_is_free(&cluster_info[idx])) { + VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); + cluster_set_next_flag(&p->free_cluster_head, + cluster_next(&cluster_info[idx]), 0); + if (cluster_next(&p->free_cluster_tail) == idx) { + cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->free_cluster_head); + } + cluster_set_count_flag(&cluster_info[idx], 0, 0); + } + + VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) + 1); +} + +/* + * The cluster corresponding to page_nr decreases one usage. If the usage + * counter becomes 0, which means no page in the cluster is in using, we can + * optionally discard the cluster and add it to free cluster list. + */ +static void dec_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + + VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) - 1); + + if (cluster_count(&cluster_info[idx]) == 0) { + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed + * after discard. 
/*
 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
 * might involve allocating a new cluster for current CPU too.
 *
 * On success *offset and *scan_base are both set to the free slot found.
 * If this CPU has no cluster and both the free list and the discard list
 * are empty, the function returns without touching them.
 */
static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
	unsigned long *offset, unsigned long *scan_base)
{
	struct percpu_cluster *cluster;
	bool found_free;
	unsigned long tmp;

new_cluster:
	/* Re-read the per-cpu pointer on every retry. */
	cluster = this_cpu_ptr(si->percpu_cluster);
	if (cluster_is_null(&cluster->index)) {
		if (!cluster_is_null(&si->free_cluster_head)) {
			/* Adopt the head of the free list; start at its first slot. */
			cluster->index = si->free_cluster_head;
			cluster->next = cluster_next(&cluster->index) *
					SWAPFILE_CLUSTER;
		} else if (!cluster_is_null(&si->discard_cluster_head)) {
			/*
			 * we don't have free cluster but have some clusters in
			 * discarding, do discard now and reclaim them
			 */
			swap_do_scheduled_discard(si);
			/* Restart the caller's scan position from si->cluster_next. */
			*scan_base = *offset = si->cluster_next;
			goto new_cluster;
		} else
			return;
	}

	found_free = false;

	/*
	 * Other CPUs can use our cluster if they can't find a free cluster,
	 * check if there is still free entry in the cluster
	 */
	tmp = cluster->next;
	/* Scan up to the end of this cluster, but never past si->max. */
	while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
	       SWAPFILE_CLUSTER) {
		if (!si->swap_map[tmp]) {
			found_free = true;
			break;
		}
		tmp++;
	}
	if (!found_free) {
		/* Cluster exhausted: give it up and pick another one. */
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	cluster->next = tmp + 1;
	*offset = tmp;
	*scan_base = tmp;
}
Hugh + */ + + si->flags += SWP_SCANNING; + scan_base = offset = si->cluster_next; + + /* SSD algorithm */ + if (si->cluster_info) { + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + goto checks; + } + + if (unlikely(!si->cluster_nr--)) { + if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; + } + + spin_unlock(&si->lock); + + /* + * If seek is expensive, start searching for new cluster from + * start of partition, to minimize the span of allocated swap. + * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info + * case, just handled by scan_swap_map_try_ssd_cluster() above. + */ + scan_base = offset = si->lowest_bit; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; + + /* Locate the first empty (unaligned) cluster */ + for (; last_in_cluster <= si->highest_bit; offset++) { + if (si->swap_map[offset]) + last_in_cluster = offset + SWAPFILE_CLUSTER; + else if (offset == last_in_cluster) { + spin_lock(&si->lock); + offset -= SWAPFILE_CLUSTER - 1; + si->cluster_next = offset; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } + + offset = scan_base; + spin_lock(&si->lock); + si->cluster_nr = SWAPFILE_CLUSTER - 1; + } + +checks: + if (si->cluster_info) { + while (scan_swap_map_ssd_cluster_conflict(si, offset)) + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + } + if (!(si->flags & SWP_WRITEOK)) + goto no_page; + if (!si->highest_bit) + goto no_page; + if (offset > si->highest_bit) + scan_base = offset = si->lowest_bit; + + /* reuse swap entry of cache-only swap if not busy. 
*/ + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + int swap_was_freed; + spin_unlock(&si->lock); + swap_was_freed = __try_to_reclaim_swap(si, offset); + spin_lock(&si->lock); + /* entry was freed successfully, try to use this again */ + if (swap_was_freed) + goto checks; + goto scan; /* check next one */ + } + + if (si->swap_map[offset]) + goto scan; + + if (offset == si->lowest_bit) + si->lowest_bit++; + if (offset == si->highest_bit) + si->highest_bit--; + si->inuse_pages++; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); + si->cluster_next = offset + 1; + si->flags -= SWP_SCANNING; + + return offset; + +scan: + spin_unlock(&si->lock); + while (++offset <= si->highest_bit) { + if (!si->swap_map[offset]) { + spin_lock(&si->lock); + goto checks; + } + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&si->lock); + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + } + offset = si->lowest_bit; + while (offset < scan_base) { + if (!si->swap_map[offset]) { + spin_lock(&si->lock); + goto checks; + } + if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + spin_lock(&si->lock); + goto checks; + } + if (unlikely(--latency_ration < 0)) { + cond_resched(); + latency_ration = LATENCY_LIMIT; + } + offset++; + } + spin_lock(&si->lock); + +no_page: + si->flags -= SWP_SCANNING; + return 0; +} + +swp_entry_t get_swap_page(void) +{ + struct swap_info_struct *si, *next; + pgoff_t offset; + + if (atomic_long_read(&nr_swap_pages) <= 0) + goto noswap; + atomic_long_dec(&nr_swap_pages); + + spin_lock(&swap_avail_lock); + +start_over: + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + /* requeue si to after 
same-priority siblings */ + plist_requeue(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + spin_lock(&si->lock); + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + if (plist_node_empty(&si->avail_list)) { + spin_unlock(&si->lock); + goto nextsi; + } + WARN(!si->highest_bit, + "swap_info %d in list but !highest_bit\n", + si->type); + WARN(!(si->flags & SWP_WRITEOK), + "swap_info %d in list but !SWP_WRITEOK\n", + si->type); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&si->lock); + goto nextsi; + } + + /* This is called for allocating swap entry for cache */ + offset = scan_swap_map(si, SWAP_HAS_CACHE); + spin_unlock(&si->lock); + if (offset) + return swp_entry(si->type, offset); + pr_debug("scan_swap_map of si %d failed to find offset\n", + si->type); + spin_lock(&swap_avail_lock); +nextsi: + /* + * if we got here, it's likely that si was almost full before, + * and since scan_swap_map() can drop the si->lock, multiple + * callers probably all tried to get a page from the same si + * and it filled up before we could get one; or, the si filled + * up between us dropping swap_avail_lock and taking si->lock. + * Since we dropped the swap_avail_lock, the swap_avail_head + * list may have been modified; so if next is still in the + * swap_avail_head list then try it, otherwise start over. 
+ */ + if (plist_node_empty(&next->avail_list)) + goto start_over; + } + + spin_unlock(&swap_avail_lock); + + atomic_long_inc(&nr_swap_pages); +noswap: + return (swp_entry_t) {0}; +} + +/* The only caller of this function is now suspend routine */ +swp_entry_t get_swap_page_of_type(int type) +{ + struct swap_info_struct *si; + pgoff_t offset; + + si = swap_info[type]; + spin_lock(&si->lock); + if (si && (si->flags & SWP_WRITEOK)) { + atomic_long_dec(&nr_swap_pages); + /* This is called for allocating swap entry, not cache */ + offset = scan_swap_map(si, 1); + if (offset) { + spin_unlock(&si->lock); + return swp_entry(type, offset); + } + atomic_long_inc(&nr_swap_pages); + } + spin_unlock(&si->lock); + return (swp_entry_t) {0}; +} + +static struct swap_info_struct *swap_info_get(swp_entry_t entry) +{ + struct swap_info_struct *p; + unsigned long offset, type; + + if (!entry.val) + goto out; + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + p = swap_info[type]; + if (!(p->flags & SWP_USED)) + goto bad_device; + offset = swp_offset(entry); + if (offset >= p->max) + goto bad_offset; + if (!p->swap_map[offset]) + goto bad_free; + spin_lock(&p->lock); + return p; + +bad_free: + pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); + goto out; +bad_offset: + pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); + goto out; +bad_device: + pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); + goto out; +bad_nofile: + pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); +out: + return NULL; +} + +static unsigned char swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) +{ + unsigned long offset = swp_offset(entry); + unsigned char count; + unsigned char has_cache; + + count = p->swap_map[offset]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; + + if (usage == SWAP_HAS_CACHE) { + VM_BUG_ON(!has_cache); + has_cache = 0; + } else if (count == SWAP_MAP_SHMEM) { + /* + * Or we could insist on 
shmem.c using a special + * swap_shmem_free() and free_shmem_swap_and_cache()... + */ + count = 0; + } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { + if (count == COUNT_CONTINUED) { + if (swap_count_continued(p, offset, count)) + count = SWAP_MAP_MAX | COUNT_CONTINUED; + else + count = SWAP_MAP_MAX; + } else + count--; + } + + if (!count) + mem_cgroup_uncharge_swap(entry); + + usage = count | has_cache; + p->swap_map[offset] = usage; + + /* free if no reference */ + if (!usage) { + dec_cluster_info_page(p, p->cluster_info, offset); + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) { + bool was_full = !p->highest_bit; + p->highest_bit = offset; + if (was_full && (p->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&p->avail_list)); + if (plist_node_empty(&p->avail_list)) + plist_add(&p->avail_list, + &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + } + atomic_long_inc(&nr_swap_pages); + p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); + if (p->flags & SWP_BLKDEV) { + struct gendisk *disk = p->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) + disk->fops->swap_slot_free_notify(p->bdev, + offset); + } + } + + return usage; +} + +/* + * Caller has made sure that the swap device corresponding to entry + * is still around or has not been recycled. + */ +void swap_free(swp_entry_t entry) +{ + struct swap_info_struct *p; + + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, entry, 1); + spin_unlock(&p->lock); + } +} + +/* + * Called after dropping swapcache to decrease refcnt to swap entries. + */ +void swapcache_free(swp_entry_t entry) +{ + struct swap_info_struct *p; + + p = swap_info_get(entry); + if (p) { + swap_entry_free(p, entry, SWAP_HAS_CACHE); + spin_unlock(&p->lock); + } +} + +/* + * How many references to page are currently swapped out? 
+ * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. + */ +int page_swapcount(struct page *page) +{ + int count = 0; + struct swap_info_struct *p; + swp_entry_t entry; + + entry.val = page_private(page); + p = swap_info_get(entry); + if (p) { + count = swap_count(p->swap_map[swp_offset(entry)]); + spin_unlock(&p->lock); + } + return count; +} + +/* + * We can write to an anon page without COW if there are no other references + * to it. And as a side-effect, free up its swap: because the old content + * on disk will never be read, and seeking back there to write new content + * later would only waste time away from clustering. + */ +int reuse_swap_page(struct page *page) +{ + int count; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (unlikely(PageKsm(page))) + return 0; + count = page_mapcount(page); + if (count <= 1 && PageSwapCache(page)) { + count += page_swapcount(page); + if (count == 1 && !PageWriteback(page)) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + } + return count <= 1; +} + +/* + * If swap is getting full, or if there are no more mappings of this page, + * then try_to_free_swap is called to free its swap space. + */ +int try_to_free_swap(struct page *page) +{ + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (!PageSwapCache(page)) + return 0; + if (PageWriteback(page)) + return 0; + if (page_swapcount(page)) + return 0; + + /* + * Once hibernation has begun to create its image of memory, + * there's a danger that one of the calls to try_to_free_swap() + * - most probably a call from __try_to_reclaim_swap() while + * hibernation is allocating its own swap pages for the image, + * but conceivably even a call from memory reclaim - will free + * the swap from a page which has already been recorded in the + * image as a clean swapcache page, and then reuse its swap for + * another page of the image. 
On waking from hibernation, the + * original page might be freed under memory pressure, then + * later read back in from swap, now with the wrong data. + * + * Hibernation suspends storage while it is writing the image + * to disk so check that here. + */ + if (pm_suspended_storage()) + return 0; + + delete_from_swap_cache(page); + SetPageDirty(page); + return 1; +} + +/* + * Free the swap entry like above, but also try to + * free the page cache entry if it is the last user. + */ +int free_swap_and_cache(swp_entry_t entry) +{ + struct swap_info_struct *p; + struct page *page = NULL; + + if (non_swap_entry(entry)) + return 1; + + p = swap_info_get(entry); + if (p) { + if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { + page = find_get_page(swap_address_space(entry), + entry.val); + if (page && !trylock_page(page)) { + page_cache_release(page); + page = NULL; + } + } + spin_unlock(&p->lock); + } + if (page) { + /* + * Not mapped elsewhere, or swap space full? Free it! + * Also recheck PageSwapCache now page is locked (above). + */ + if (PageSwapCache(page) && !PageWriteback(page) && + (!page_mapped(page) || vm_swap_full())) { + delete_from_swap_cache(page); + SetPageDirty(page); + } + unlock_page(page); + page_cache_release(page); + } + return p != NULL; +} + +#ifdef CONFIG_HIBERNATION +/* + * Find the swap type that corresponds to given device (if any). + * + * @offset - number of the PAGE_SIZE-sized block of the device, starting + * from 0, in which the swap header is expected to be located. + * + * This is needed for the suspend to disk (aka swsusp). 
+ */
+int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
+{
+	struct block_device *bdev = NULL;
+	int type;
+
+	if (device)
+		bdev = bdget(device);
+
+	spin_lock(&swap_lock);
+	for (type = 0; type < nr_swapfiles; type++) {
+		struct swap_info_struct *sis = swap_info[type];
+
+		if (!(sis->flags & SWP_WRITEOK))
+			continue;
+
+		/* No device requested: the first writable area wins. */
+		if (!bdev) {
+			if (bdev_p)
+				*bdev_p = bdgrab(sis->bdev);
+
+			spin_unlock(&swap_lock);
+			return type;
+		}
+		if (bdev == sis->bdev) {
+			struct swap_extent *se = &sis->first_swap_extent;
+
+			if (se->start_block == offset) {
+				if (bdev_p)
+					*bdev_p = bdgrab(sis->bdev);
+
+				spin_unlock(&swap_lock);
+				/* Drop the reference taken by bdget() above. */
+				bdput(bdev);
+				return type;
+			}
+		}
+	}
+	spin_unlock(&swap_lock);
+	if (bdev)
+		bdput(bdev);
+
+	return -ENODEV;
+}
+
+/*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ *
+ * Returns 0 if @type is out of range or that area is not SWP_WRITEOK.
+ */
+sector_t swapdev_block(int type, pgoff_t offset)
+{
+	struct block_device *bdev;
+
+	if ((unsigned int)type >= nr_swapfiles)
+		return 0;
+	if (!(swap_info[type]->flags & SWP_WRITEOK))
+		return 0;
+	return map_swap_entry(swp_entry(type, offset), &bdev);
+}
+
+/*
+ * Return either the total number of swap pages of given type, or the number
+ * of free pages of that type (depending on @free)
+ *
+ * Returns 0 if @type is out of range or that area is not SWP_WRITEOK.
+ *
+ * This is needed for software suspend
+ */
+unsigned int count_swap_pages(int type, int free)
+{
+	unsigned int n = 0;
+
+	spin_lock(&swap_lock);
+	if ((unsigned int)type < nr_swapfiles) {
+		struct swap_info_struct *sis = swap_info[type];
+
+		spin_lock(&sis->lock);
+		if (sis->flags & SWP_WRITEOK) {
+			n = sis->pages;
+			if (free)
+				n -= sis->inuse_pages;
+		}
+		spin_unlock(&sis->lock);
+	}
+	spin_unlock(&swap_lock);
+	return n;
+}
+#endif /* CONFIG_HIBERNATION */
+
+/*
+ * Does @pte hold the swap entry encoded in @swp_pte?  With
+ * CONFIG_MEM_SOFT_DIRTY the soft-dirty variant of @swp_pte also matches.
+ */
+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte)
+{
+#ifdef CONFIG_MEM_SOFT_DIRTY
+	/*
+	 * When pte keeps soft dirty bit the pte generated
+	 * from swap entry does not have it, still it's same
+	 * pte from logical point of view.
+	 */
+	pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
+	return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
+#else
+	return pte_same(pte, swp_pte);
+#endif
+}
+
+/*
+ * Replace the swap pte for @entry at @addr with a present mapping of @page.
+ *
+ * Returns 1 if the pte was replaced, 0 if the pte no longer matched @entry
+ * once the pte lock was taken, and -ENOMEM if the KSM copy or the memcg
+ * charge failed.
+ *
+ * No need to decide whether this PTE shares the swap entry with others,
+ * just let do_wp_page work it out if a write is requested later - to
+ * force COW, vm_page_prot omits write permission from any private vma.
+ */
+static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long addr, swp_entry_t entry, struct page *page)
+{
+	struct page *swapcache;
+	struct mem_cgroup *memcg;
+	spinlock_t *ptl;
+	pte_t *pte;
+	int ret = 1;
+
+	/* Remember the caller's page; ksm may hand back a different one. */
+	swapcache = page;
+	page = ksm_might_need_to_copy(page, vma, addr);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
+		ret = -ENOMEM;
+		goto out_nolock;
+	}
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	/* Recheck under the pte lock: the caller scanned without it. */
+	if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
+		mem_cgroup_cancel_charge(page, memcg);
+		ret = 0;
+		goto out;
+	}
+
+	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+	get_page(page);
+	set_pte_at(vma->vm_mm, addr, pte,
+		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
+	if (page == swapcache) {
+		page_add_anon_rmap(page, vma, addr);
+		mem_cgroup_commit_charge(page, memcg, true);
+	} else { /* ksm created a completely new copy */
+		page_add_new_anon_rmap(page, vma, addr);
+		mem_cgroup_commit_charge(page, memcg, false);
+		lru_cache_add_active_or_unevictable(page, vma);
+	}
+	swap_free(entry);
+	/*
+	 * Move the page to the active list so it is not
+	 * immediately swapped out again after swapon.
+	 */
+	activate_page(page);
+out:
+	pte_unmap_unlock(pte, ptl);
+out_nolock:
+	/* A ksm copy is unlocked and released here; the caller's page is not. */
+	if (page != swapcache) {
+		unlock_page(page);
+		put_page(page);
+	}
+	return ret;
+}
+
+/*
+ * Scan the ptes under @pmd in [@addr, @end) for @entry and hand each match
+ * to unuse_pte().  Stops at, and returns, the first non-zero unuse_pte()
+ * result; returns 0 if the whole range was scanned.
+ */
+static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+				unsigned long addr, unsigned long end,
+				swp_entry_t entry, struct page *page)
+{
+	pte_t swp_pte = swp_entry_to_pte(entry);
+	pte_t *pte;
+	int ret = 0;
+
+	/*
+	 * We don't actually need pte lock while scanning for swp_pte: since
+	 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
+	 * page table while we're scanning; though it could get zapped, and on
+	 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
+	 * of unmatched parts which look like swp_pte, so unuse_pte must
+	 * recheck under pte lock.  Scanning without pte lock lets it be
+	 * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
+	 */
+	pte = pte_offset_map(pmd, addr);
+	do {
+		/*
+		 * swapoff spends a _lot_ of time in this loop!
+		 * Test inline before going to call unuse_pte.
+		 */
+		if (unlikely(maybe_same_pte(*pte, swp_pte))) {
+			/* unuse_pte() maps the pte itself, so unmap ours first. */
+			pte_unmap(pte);
+			ret = unuse_pte(vma, pmd, addr, entry, page);
+			if (ret)
+				goto out;
+			pte = pte_offset_map(pmd, addr);
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	/* pte was advanced one past the last entry examined. */
+	pte_unmap(pte - 1);
+out:
+	return ret;
+}
+
+/*
+ * Walk the pmds under @pud covering [@addr, @end), calling
+ * unuse_pte_range() on each pmd that passes
+ * pmd_none_or_trans_huge_or_clear_bad().  Returns the first non-zero
+ * result, else 0.
+ */
+static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+				unsigned long addr, unsigned long end,
+				swp_entry_t entry, struct page *page)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	int ret;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+			continue;
+		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+		if (ret)
+			return ret;
+	} while (pmd++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Walk the puds under @pgd covering [@addr, @end), calling
+ * unuse_pmd_range() on each pud that passes pud_none_or_clear_bad().
+ * Returns the first non-zero result, else 0.
+ */
+static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				swp_entry_t entry, struct page *page)
+{
+	pud_t *pud;
+	unsigned long next;
+	int ret;
+
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+		if (ret)
+			return ret;
+	} while (pud++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Search @vma for ptes holding @entry.  If @page already has an anon_vma,
+ * only the single address page_address_in_vma() reports is examined;
+ * otherwise the whole vma is walked.  Returns the first non-zero result
+ * from the walk, else 0.
+ */
+static int unuse_vma(struct vm_area_struct *vma,
+				swp_entry_t entry, struct page *page)
+{
+	pgd_t *pgd;
+	unsigned long addr, end, next;
+	int ret;
+
+	if (page_anon_vma(page)) {
+		addr = page_address_in_vma(page, vma);
+		if (addr == -EFAULT)
+			return 0;
+		else
+			end = addr + PAGE_SIZE;
+	} else {
+		addr = vma->vm_start;
+		end = vma->vm_end;
+	}
+
+	pgd = pgd_offset(vma->vm_mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+		if (ret)
+			return ret;
+	} while (pgd++, addr = next, addr != end);
+	return 0;
+}
+
+/*
+ * Search every vma of @mm that has an anon_vma for @entry, under
+ * mm->mmap_sem held for read.  @page is locked on entry and on return, but
+ * the page lock is dropped temporarily if mmap_sem cannot be taken at once.
+ * Returns a negative error from unuse_vma(), else 0 (positive "found"
+ * results are folded to 0).
+ */
+static int unuse_mm(struct mm_struct *mm,
+				swp_entry_t entry, struct page *page)
+{
+	struct vm_area_struct *vma;
+	int ret = 0;
+
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		/*
+		 * Activate page so shrink_inactive_list is unlikely to unmap
+		 * its ptes while lock is dropped, so swapoff can make progress.
+		 */
+		activate_page(page);
+		unlock_page(page);
+		down_read(&mm->mmap_sem);
+		lock_page(page);
+	}
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
+			break;
+	}
+	up_read(&mm->mmap_sem);
+	return (ret < 0)? ret: 0;
+}
+
+/*
+ * Scan swap_map (or frontswap_map if frontswap parameter is true)
+ * from current position to next entry still in use.
+ * Recycle to start on reaching the end, returning 0 when empty.
+ *
+ * @prev is the offset returned by the previous call (0 to start); the
+ * search begins at @prev + 1.  Offset 0 itself is never examined.
+ */
+static unsigned int find_next_to_unuse(struct swap_info_struct *si,
+					unsigned int prev, bool frontswap)
+{
+	unsigned int max = si->max;
+	unsigned int i = prev;
+	unsigned char count;
+
+	/*
+	 * No need for swap_lock here: we're just looking
+	 * for whether an entry is in use, not modifying it; false
+	 * hits are okay, and sys_swapoff() has already prevented new
+	 * allocations from this area (while holding swap_lock).
+	 */
+	for (;;) {
+		if (++i >= max) {
+			if (!prev) {
+				/* Already wrapped (or started at 0): nothing left. */
+				i = 0;
+				break;
+			}
+			/*
+			 * No entries in use at top of swap_map,
+			 * loop back to start and recheck there.
+			 */
+			max = prev + 1;
+			prev = 0;
+			i = 1;
+		}
+		if (frontswap) {
+			if (frontswap_test(si, i))
+				break;
+			else
+				continue;
+		}
+		count = READ_ONCE(si->swap_map[i]);
+		if (count && swap_count(count) != SWAP_MAP_BAD)
+			break;
+	}
+	return i;
+}
+
+/*
+ * We completely avoid races by reading each swap page in advance,
+ * and then search for the process using it.  All the necessary
+ * page table adjustments can then be made atomically.
+ *
+ * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages; ignored if frontswap is false
+ *
+ * Returns 0 on success, -EINTR if a signal is pending, -ENOMEM if a swap
+ * page could not be read in, or the error reported by shmem_unuse() or
+ * unuse_mm().
+ */
+int try_to_unuse(unsigned int type, bool frontswap,
+		 unsigned long pages_to_unuse)
+{
+	struct swap_info_struct *si = swap_info[type];
+	struct mm_struct *start_mm;
+	volatile unsigned char *swap_map; /* swap_map is accessed without
+					   * locking. Mark it as volatile
+					   * to prevent compiler doing
+					   * something odd.
+					   */
+	unsigned char swcount;
+	struct page *page;
+	swp_entry_t entry;
+	unsigned int i = 0;
+	int retval = 0;
+
+	/*
+	 * When searching mms for an entry, a good strategy is to
+	 * start at the first mm we freed the previous entry from
+	 * (though actually we don't notice whether we or coincidence
+	 * freed the entry).  Initialize this start_mm with a hold.
+	 *
+	 * A simpler strategy would be to start at the last mm we
+	 * freed the previous entry from; but that would take less
+	 * advantage of mmlist ordering, which clusters forked mms
+	 * together, child after parent.  If we race with dup_mmap(), we
+	 * prefer to resolve parent before child, lest we miss entries
+	 * duplicated after we scanned child: using last mm would invert
+	 * that.
+	 */
+	start_mm = &init_mm;
+	atomic_inc(&init_mm.mm_users);
+
+	/*
+	 * Keep on scanning until all entries have gone.  Usually,
+	 * one pass through swap_map is enough, but not necessarily:
+	 * there are races when an instance of an entry might be missed.
+	 */
+	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
+		if (signal_pending(current)) {
+			retval = -EINTR;
+			break;
+		}
+
+		/*
+		 * Get a page for the entry, using the existing swap
+		 * cache page if there is one.  Otherwise, get a clean
+		 * page and read the swap into it.
+		 */
+		swap_map = &si->swap_map[i];
+		entry = swp_entry(type, i);
+		page = read_swap_cache_async(entry,
+					GFP_HIGHUSER_MOVABLE, NULL, 0);
+		if (!page) {
+			/*
+			 * Either swap_duplicate() failed because entry
+			 * has been freed independently, and will not be
+			 * reused since sys_swapoff() already disabled
+			 * allocation from here, or alloc_page() failed.
+			 */
+			swcount = *swap_map;
+			/*
+			 * We don't hold lock here, so the swap entry could be
+			 * SWAP_MAP_BAD (when the cluster is discarding).
+			 * Instead of failing out, we can just skip the swap
+			 * entry because swapoff will wait for discarding
+			 * to finish anyway.
+			 */
+			if (!swcount || swcount == SWAP_MAP_BAD)
+				continue;
+			retval = -ENOMEM;
+			break;
+		}
+
+		/*
+		 * Don't hold on to start_mm if it looks like exiting.
+		 */
+		if (atomic_read(&start_mm->mm_users) == 1) {
+			mmput(start_mm);
+			start_mm = &init_mm;
+			atomic_inc(&init_mm.mm_users);
+		}
+
+		/*
+		 * Wait for and lock page.  When do_swap_page races with
+		 * try_to_unuse, do_swap_page can handle the fault much
+		 * faster than try_to_unuse can locate the entry.  This
+		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
+		 * defer to do_swap_page in such a case - in some tests,
+		 * do_swap_page and try_to_unuse repeatedly compete.
+		 */
+		wait_on_page_locked(page);
+		wait_on_page_writeback(page);
+		lock_page(page);
+		wait_on_page_writeback(page);
+
+		/*
+		 * Remove all references to entry.
+		 */
+		swcount = *swap_map;
+		if (swap_count(swcount) == SWAP_MAP_SHMEM) {
+			retval = shmem_unuse(entry, page);
+			/* page has already been unlocked and released */
+			if (retval < 0)
+				break;
+			continue;
+		}
+		if (swap_count(swcount) && start_mm != &init_mm)
+			retval = unuse_mm(start_mm, entry, page);
+
+		if (swap_count(*swap_map)) {
+			/*
+			 * References remain after start_mm: walk the rest of
+			 * the mmlist.  prev_mm and new_start_mm each carry
+			 * their own mm_users hold, dropped via mmput() below.
+			 */
+			int set_start_mm = (*swap_map >= swcount);
+			struct list_head *p = &start_mm->mmlist;
+			struct mm_struct *new_start_mm = start_mm;
+			struct mm_struct *prev_mm = start_mm;
+			struct mm_struct *mm;
+
+			atomic_inc(&new_start_mm->mm_users);
+			atomic_inc(&prev_mm->mm_users);
+			spin_lock(&mmlist_lock);
+			while (swap_count(*swap_map) && !retval &&
+					(p = p->next) != &start_mm->mmlist) {
+				mm = list_entry(p, struct mm_struct, mmlist);
+				/* Skip an mm whose mm_users already hit zero. */
+				if (!atomic_inc_not_zero(&mm->mm_users))
+					continue;
+				spin_unlock(&mmlist_lock);
+				mmput(prev_mm);
+				prev_mm = mm;
+
+				cond_resched();
+
+				swcount = *swap_map;
+				if (!swap_count(swcount)) /* any usage ? */
+					;
+				else if (mm == &init_mm)
+					set_start_mm = 1;
+				else
+					retval = unuse_mm(mm, entry, page);
+
+				/* Count dropped while on this mm: start here next time. */
+				if (set_start_mm && *swap_map < swcount) {
+					mmput(new_start_mm);
+					atomic_inc(&mm->mm_users);
+					new_start_mm = mm;
+					set_start_mm = 0;
+				}
+				spin_lock(&mmlist_lock);
+			}
+			spin_unlock(&mmlist_lock);
+			mmput(prev_mm);
+			mmput(start_mm);
+			start_mm = new_start_mm;
+		}
+		if (retval) {
+			unlock_page(page);
+			page_cache_release(page);
+			break;
+		}
+
+		/*
+		 * If a reference remains (rare), we would like to leave
+		 * the page in the swap cache; but try_to_unmap could
+		 * then re-duplicate the entry once we drop page lock,
+		 * so we might loop indefinitely; also, that page could
+		 * not be swapped out to other storage meanwhile.  So:
+		 * delete from cache even if there's another reference,
+		 * after ensuring that the data has been saved to disk -
+		 * since if the reference remains (rarer), it will be
+		 * read from disk into another page.  Splitting into two
+		 * pages would be incorrect if swap supported "shared
+		 * private" pages, but they are handled by tmpfs files.
+		 *
+		 * Given how unuse_vma() targets one particular offset
+		 * in an anon_vma, once the anon_vma has been determined,
+		 * this splitting happens to be just what is needed to
+		 * handle where KSM pages have been swapped out: re-reading
+		 * is unnecessarily slow, but we can fix that later on.
+		 */
+		if (swap_count(*swap_map) &&
+		     PageDirty(page) && PageSwapCache(page)) {
+			struct writeback_control wbc = {
+				.sync_mode = WB_SYNC_NONE,
+			};
+
+			swap_writepage(page, &wbc);
+			/* The page is re-locked here after swap_writepage(). */
+			lock_page(page);
+			wait_on_page_writeback(page);
+		}
+
+		/*
+		 * It is conceivable that a racing task removed this page from
+		 * swap cache just before we acquired the page lock at the top,
+		 * or while we dropped it in unuse_mm().  The page might even
+		 * be back in swap cache on another swap area: that we must not
+		 * delete, since it may not have been written out to swap yet.
+		 */
+		if (PageSwapCache(page) &&
+		    likely(page_private(page) == entry.val))
+			delete_from_swap_cache(page);
+
+		/*
+		 * So we could skip searching mms once swap count went
+		 * to 1, we did not mark any present ptes as dirty: must
+		 * mark page dirty so shrink_page_list will preserve it.
+		 */
+		SetPageDirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+
+		/*
+		 * Make sure that we aren't completely killing
+		 * interactive performance.
+		 */
+		cond_resched();
+		/* Frontswap callers may ask for only a bounded number of pages. */
+		if (frontswap && pages_to_unuse > 0) {
+			if (!--pages_to_unuse)
+				break;
+		}
+	}
+
+	mmput(start_mm);
+	return retval;
+}
+
+/*
+ * After a successful try_to_unuse, if no swap is now in use, we know
+ * we can empty the mmlist.  swap_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_lock, and an mm must be
+ * added to the mmlist just after page_duplicate - before would be racy.
+ */
+static void drain_mmlist(void)
+{
+	struct list_head *pos, *tmp;
+	unsigned int idx = 0;
+
+	/* Leave the mmlist alone while any swap area still has pages in use. */
+	while (idx < nr_swapfiles) {
+		if (swap_info[idx]->inuse_pages)
+			return;
+		idx++;
+	}
+
+	spin_lock(&mmlist_lock);
+	list_for_each_safe(pos, tmp, &init_mm.mmlist) {
+		list_del_init(pos);
+	}
+	spin_unlock(&mmlist_lock);
+}
+
+/*
+ * Translate a swap entry into a position on its backing device by walking
+ * the swapdev's extent list, starting from the cached curr_swap_extent.
+ * The cache is updated as the walk advances.  *@bdev is set to the swap
+ * area's block device.
+ *
+ * Despite the sector_t return type, the value is a PAGE_SIZE block offset
+ * into the bdev, not a sector offset.
+ */
+static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
+{
+	struct swap_info_struct *si = swap_info[swp_type(entry)];
+	struct swap_extent *first = si->curr_swap_extent;
+	struct swap_extent *cur = first;
+	pgoff_t pgoff = swp_offset(entry);
+
+	*bdev = si->bdev;
+
+	/* Advance until the extent covering pgoff is reached. */
+	while (pgoff < cur->start_page ||
+	       pgoff >= cur->start_page + cur->nr_pages) {
+		cur = list_entry(cur->list.next, struct swap_extent, list);
+		si->curr_swap_extent = cur;
+		BUG_ON(cur == first);	/* It *must* be present */
+	}
+
+	return cur->start_block + (pgoff - cur->start_page);
+}
+
+/*
+ * Returns the page offset into bdev for the specified page's swap entry.
+ */ +sector_t map_swap_page(struct page *page, struct block_device **bdev) +{ + swp_entry_t entry; + entry.val = page_private(page); + return map_swap_entry(entry, bdev); +} + +/* + * Free all of a swapdev's extent information + */ +static void destroy_swap_extents(struct swap_info_struct *sis) +{ + while (!list_empty(&sis->first_swap_extent.list)) { + struct swap_extent *se; + + se = list_entry(sis->first_swap_extent.list.next, + struct swap_extent, list); + list_del(&se->list); + kfree(se); + } + + if (sis->flags & SWP_FILE) { + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + + sis->flags &= ~SWP_FILE; + mapping->a_ops->swap_deactivate(swap_file); + } +} + +/* + * Add a block range (and the corresponding page range) into this swapdev's + * extent list. The extent list is kept sorted in page order. + * + * This function rather assumes that it is called in ascending page order. + */ +int +add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, + unsigned long nr_pages, sector_t start_block) +{ + struct swap_extent *se; + struct swap_extent *new_se; + struct list_head *lh; + + if (start_page == 0) { + se = &sis->first_swap_extent; + sis->curr_swap_extent = se; + se->start_page = 0; + se->nr_pages = nr_pages; + se->start_block = start_block; + return 1; + } else { + lh = sis->first_swap_extent.list.prev; /* Highest extent */ + se = list_entry(lh, struct swap_extent, list); + BUG_ON(se->start_page + se->nr_pages != start_page); + if (se->start_block + se->nr_pages == start_block) { + /* Merge it */ + se->nr_pages += nr_pages; + return 0; + } + } + + /* + * No merge. Insert a new extent, preserving ordering. 
+ */ + new_se = kmalloc(sizeof(*se), GFP_KERNEL); + if (new_se == NULL) + return -ENOMEM; + new_se->start_page = start_page; + new_se->nr_pages = nr_pages; + new_se->start_block = start_block; + + list_add_tail(&new_se->list, &sis->first_swap_extent.list); + return 1; +} + +/* + * A `swap extent' is a simple thing which maps a contiguous range of pages + * onto a contiguous range of disk blocks. An ordered list of swap extents + * is built at swapon time and is then used at swap_writepage/swap_readpage + * time for locating where on disk a page belongs. + * + * If the swapfile is an S_ISBLK block device, a single extent is installed. + * This is done so that the main operating code can treat S_ISBLK and S_ISREG + * swap files identically. + * + * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap + * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK + * swapfiles are handled *identically* after swapon time. + * + * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks + * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If + * some stray blocks are found which do not fall within the PAGE_SIZE alignment + * requirements, they are simply tossed out - we will never use those blocks + * for swapping. + * + * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This + * prevents root from shooting her foot off by ftruncating an in-use swapfile, + * which will scribble on the fs. + * + * The amount of disk space which a single swap extent represents varies. + * Typically it is in the 1-4 megabyte range. So we can have hundreds of + * extents in the list. To avoid much list walking, we cache the previous + * search location in `curr_swap_extent', and start new searches from there. + * This is extremely effective. The average number of iterations in + * map_swap_page() has been measured at about 0.3 per page. - akpm. 
+ */ +static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) +{ + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + int ret; + + if (S_ISBLK(inode->i_mode)) { + ret = add_swap_extent(sis, 0, sis->max, 0); + *span = sis->pages; + return ret; + } + + if (mapping->a_ops->swap_activate) { + ret = mapping->a_ops->swap_activate(sis, swap_file, span); + if (!ret) { + sis->flags |= SWP_FILE; + ret = add_swap_extent(sis, 0, sis->max, 0); + *span = sis->pages; + } + return ret; + } + + return generic_swapfile_activate(sis, swap_file, span); +} + +static void _enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) +{ + if (prio >= 0) + p->prio = prio; + else + p->prio = --least_priority; + /* + * the plist prio is negated because plist ordering is + * low-to-high, while swap ordering is high-to-low + */ + p->list.prio = -p->prio; + p->avail_list.prio = -p->prio; + p->swap_map = swap_map; + p->cluster_info = cluster_info; + p->flags |= SWP_WRITEOK; + atomic_long_add(p->pages, &nr_swap_pages); + total_swap_pages += p->pages; + + assert_spin_locked(&swap_lock); + /* + * both lists are plists, and thus priority ordered. + * swap_active_head needs to be priority ordered for swapoff(), + * which on removal of any swap_info_struct with an auto-assigned + * (i.e. negative) priority increments the auto-assigned priority + * of any lower-priority swap_info_structs. + * swap_avail_head needs to be priority ordered for get_swap_page(), + * which allocates swap pages from the highest available priority + * swap_info_struct. 
+ */ + plist_add(&p->list, &swap_active_head); + spin_lock(&swap_avail_lock); + plist_add(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); +} + +static void enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info, + unsigned long *frontswap_map) +{ + frontswap_init(p->type, frontswap_map); + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p, prio, swap_map, cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); +} + +static void reinsert_swap_info(struct swap_info_struct *p) +{ + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); +} + +SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) +{ + struct swap_info_struct *p = NULL; + unsigned char *swap_map; + struct swap_cluster_info *cluster_info; + unsigned long *frontswap_map; + struct file *swap_file, *victim; + struct address_space *mapping; + struct inode *inode; + struct filename *pathname; + int err, found = 0; + unsigned int old_block_size; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + BUG_ON(!current->mm); + + pathname = getname(specialfile); + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + + victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); + err = PTR_ERR(victim); + if (IS_ERR(victim)) + goto out; + + mapping = victim->f_mapping; + spin_lock(&swap_lock); + plist_for_each_entry(p, &swap_active_head, list) { + if (p->flags & SWP_WRITEOK) { + if (p->swap_file->f_mapping == mapping) { + found = 1; + break; + } + } + } + if (!found) { + err = -EINVAL; + spin_unlock(&swap_lock); + goto out_dput; + } + if (!security_vm_enough_memory_mm(current->mm, p->pages)) + vm_unacct_memory(p->pages); + else { + err = -ENOMEM; + spin_unlock(&swap_lock); + goto out_dput; + } + spin_lock(&swap_avail_lock); + plist_del(&p->avail_list, &swap_avail_head); + 
spin_unlock(&swap_avail_lock); + spin_lock(&p->lock); + if (p->prio < 0) { + struct swap_info_struct *si = p; + + plist_for_each_entry_continue(si, &swap_active_head, list) { + si->prio++; + si->list.prio--; + si->avail_list.prio--; + } + least_priority++; + } + plist_del(&p->list, &swap_active_head); + atomic_long_sub(p->pages, &nr_swap_pages); + total_swap_pages -= p->pages; + p->flags &= ~SWP_WRITEOK; + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + + set_current_oom_origin(); + err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ + clear_current_oom_origin(); + + if (err) { + /* re-insert swap space back into swap_list */ + reinsert_swap_info(p); + goto out_dput; + } + + flush_work(&p->discard_work); + + destroy_swap_extents(p); + if (p->flags & SWP_CONTINUED) + free_swap_count_continuations(p); + + mutex_lock(&swapon_mutex); + spin_lock(&swap_lock); + spin_lock(&p->lock); + drain_mmlist(); + + /* wait for anyone still in scan_swap_map */ + p->highest_bit = 0; /* cuts scans short */ + while (p->flags >= SWP_SCANNING) { + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + schedule_timeout_uninterruptible(1); + spin_lock(&swap_lock); + spin_lock(&p->lock); + } + + swap_file = p->swap_file; + old_block_size = p->old_block_size; + p->swap_file = NULL; + p->max = 0; + swap_map = p->swap_map; + p->swap_map = NULL; + cluster_info = p->cluster_info; + p->cluster_info = NULL; + frontswap_map = frontswap_map_get(p); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + frontswap_invalidate_area(p->type); + frontswap_map_set(p, NULL); + mutex_unlock(&swapon_mutex); + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; + vfree(swap_map); + vfree(cluster_info); + vfree(frontswap_map); + /* Destroy swap account information */ + swap_cgroup_swapoff(p->type); + + inode = mapping->host; + if (S_ISBLK(inode->i_mode)) { + struct block_device *bdev = I_BDEV(inode); + set_blocksize(bdev, old_block_size); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | 
FMODE_EXCL); + } else { + mutex_lock(&inode->i_mutex); + inode->i_flags &= ~S_SWAPFILE; + mutex_unlock(&inode->i_mutex); + } + filp_close(swap_file, NULL); + + /* + * Clear the SWP_USED flag after all resources are freed so that swapon + * can reuse this swap_info in alloc_swap_info() safely. It is ok to + * not hold p->lock after we cleared its SWP_WRITEOK. + */ + spin_lock(&swap_lock); + p->flags = 0; + spin_unlock(&swap_lock); + + err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); + +out_dput: + filp_close(victim, NULL); +out: + putname(pathname); + return err; +} + +#ifdef CONFIG_PROC_FS +static unsigned swaps_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + + poll_wait(file, &proc_poll_wait, wait); + + if (seq->poll_event != atomic_read(&proc_poll_event)) { + seq->poll_event = atomic_read(&proc_poll_event); + return POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + + return POLLIN | POLLRDNORM; +} + +/* iterator */ +static void *swap_start(struct seq_file *swap, loff_t *pos) +{ + struct swap_info_struct *si; + int type; + loff_t l = *pos; + + mutex_lock(&swapon_mutex); + + if (!l) + return SEQ_START_TOKEN; + + for (type = 0; type < nr_swapfiles; type++) { + smp_rmb(); /* read nr_swapfiles before swap_info[type] */ + si = swap_info[type]; + if (!(si->flags & SWP_USED) || !si->swap_map) + continue; + if (!--l) + return si; + } + + return NULL; +} + +static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) +{ + struct swap_info_struct *si = v; + int type; + + if (v == SEQ_START_TOKEN) + type = 0; + else + type = si->type + 1; + + for (; type < nr_swapfiles; type++) { + smp_rmb(); /* read nr_swapfiles before swap_info[type] */ + si = swap_info[type]; + if (!(si->flags & SWP_USED) || !si->swap_map) + continue; + ++*pos; + return si; + } + + return NULL; +} + +static void swap_stop(struct seq_file *swap, void *v) +{ + mutex_unlock(&swapon_mutex); +} + +static int swap_show(struct 
seq_file *swap, void *v) +{ + struct swap_info_struct *si = v; + struct file *file; + int len; + + if (si == SEQ_START_TOKEN) { + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + return 0; + } + + file = si->swap_file; + len = seq_path(swap, &file->f_path, " \t\n\\"); + seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", + len < 40 ? 40 - len : 1, " ", + S_ISBLK(file_inode(file)->i_mode) ? + "partition" : "file\t", + si->pages << (PAGE_SHIFT - 10), + si->inuse_pages << (PAGE_SHIFT - 10), + si->prio); + return 0; +} + +static const struct seq_operations swaps_op = { + .start = swap_start, + .next = swap_next, + .stop = swap_stop, + .show = swap_show +}; + +static int swaps_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &swaps_op); + if (ret) + return ret; + + seq = file->private_data; + seq->poll_event = atomic_read(&proc_poll_event); + return 0; +} + +static const struct file_operations proc_swaps_operations = { + .open = swaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .poll = swaps_poll, +}; + +static int __init procswaps_init(void) +{ + proc_create("swaps", 0, NULL, &proc_swaps_operations); + return 0; +} +__initcall(procswaps_init); +#endif /* CONFIG_PROC_FS */ + +#ifdef MAX_SWAPFILES_CHECK +static int __init max_swapfiles_check(void) +{ + MAX_SWAPFILES_CHECK(); + return 0; +} +late_initcall(max_swapfiles_check); +#endif + +static struct swap_info_struct *alloc_swap_info(void) +{ + struct swap_info_struct *p; + unsigned int type; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + if (!(swap_info[type]->flags & SWP_USED)) + break; + } + if (type >= MAX_SWAPFILES) { + spin_unlock(&swap_lock); + kfree(p); + return ERR_PTR(-EPERM); + } + if (type >= nr_swapfiles) { + p->type = type; + swap_info[type] = p; + /* + * Write swap_info[type] before nr_swapfiles, in case 
a + * racing procfs swap_start() or swap_next() is reading them. + * (We never shrink nr_swapfiles, we never free this entry.) + */ + smp_wmb(); + nr_swapfiles++; + } else { + kfree(p); + p = swap_info[type]; + /* + * Do not memset this entry: a racing procfs swap_next() + * would be relying on p->type to remain valid. + */ + } + INIT_LIST_HEAD(&p->first_swap_extent.list); + plist_node_init(&p->list, 0); + plist_node_init(&p->avail_list, 0); + p->flags = SWP_USED; + spin_unlock(&swap_lock); + spin_lock_init(&p->lock); + + return p; +} + +static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) +{ + int error; + + if (S_ISBLK(inode->i_mode)) { + p->bdev = bdgrab(I_BDEV(inode)); + error = blkdev_get(p->bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + sys_swapon); + if (error < 0) { + p->bdev = NULL; + return -EINVAL; + } + p->old_block_size = block_size(p->bdev); + error = set_blocksize(p->bdev, PAGE_SIZE); + if (error < 0) + return error; + p->flags |= SWP_BLKDEV; + } else if (S_ISREG(inode->i_mode)) { + p->bdev = inode->i_sb->s_bdev; + mutex_lock(&inode->i_mutex); + if (IS_SWAPFILE(inode)) + return -EBUSY; + } else + return -EINVAL; + + return 0; +} + +static unsigned long read_swap_header(struct swap_info_struct *p, + union swap_header *swap_header, + struct inode *inode) +{ + int i; + unsigned long maxpages; + unsigned long swapfilepages; + unsigned long last_page; + + if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { + pr_err("Unable to find swap-space signature\n"); + return 0; + } + + /* swap partition endianess hack... 
*/ + if (swab32(swap_header->info.version) == 1) { + swab32s(&swap_header->info.version); + swab32s(&swap_header->info.last_page); + swab32s(&swap_header->info.nr_badpages); + for (i = 0; i < swap_header->info.nr_badpages; i++) + swab32s(&swap_header->info.badpages[i]); + } + /* Check the swap header's sub-version */ + if (swap_header->info.version != 1) { + pr_warn("Unable to handle swap header version %d\n", + swap_header->info.version); + return 0; + } + + p->lowest_bit = 1; + p->cluster_next = 1; + p->cluster_nr = 0; + + /* + * Find out how many pages are allowed for a single swap + * device. There are two limiting factors: 1) the number + * of bits for the swap offset in the swp_entry_t type, and + * 2) the number of bits in the swap pte as defined by the + * different architectures. In order to find the + * largest possible bit mask, a swap entry with swap type 0 + * and swap offset ~0UL is created, encoded to a swap pte, + * decoded to a swp_entry_t again, and finally the swap + * offset is extracted. This will mask all the bits from + * the initial ~0UL mask that can't be encoded in either + * the swp_entry_t or the architecture definition of a + * swap pte. 
+ */ + maxpages = swp_offset(pte_to_swp_entry( + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + last_page = swap_header->info.last_page; + if (last_page > maxpages) { + pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", + maxpages << (PAGE_SHIFT - 10), + last_page << (PAGE_SHIFT - 10)); + } + if (maxpages > last_page) { + maxpages = last_page + 1; + /* p->max is an unsigned int: don't overflow it */ + if ((unsigned int)maxpages == 0) + maxpages = UINT_MAX; + } + p->highest_bit = maxpages - 1; + + if (!maxpages) + return 0; + swapfilepages = i_size_read(inode) >> PAGE_SHIFT; + if (swapfilepages && maxpages > swapfilepages) { + pr_warn("Swap area shorter than signature indicates\n"); + return 0; + } + if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) + return 0; + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + return 0; + + return maxpages; +} + +static int setup_swap_map_and_extents(struct swap_info_struct *p, + union swap_header *swap_header, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info, + unsigned long maxpages, + sector_t *span) +{ + int i; + unsigned int nr_good_pages; + int nr_extents; + unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; + + nr_good_pages = maxpages - 1; /* omit header page */ + + cluster_set_null(&p->free_cluster_head); + cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->discard_cluster_head); + cluster_set_null(&p->discard_cluster_tail); + + for (i = 0; i < swap_header->info.nr_badpages; i++) { + unsigned int page_nr = swap_header->info.badpages[i]; + if (page_nr == 0 || page_nr > swap_header->info.last_page) + return -EINVAL; + if (page_nr < maxpages) { + swap_map[page_nr] = SWAP_MAP_BAD; + nr_good_pages--; + /* + * Haven't marked the cluster free yet, no list + * operation involved + */ + inc_cluster_info_page(p, cluster_info, page_nr); + } + } + + /* Haven't marked the cluster free yet, 
no list operation involved */ + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) + inc_cluster_info_page(p, cluster_info, i); + + if (nr_good_pages) { + swap_map[0] = SWAP_MAP_BAD; + /* + * Not mark the cluster free yet, no list + * operation involved + */ + inc_cluster_info_page(p, cluster_info, 0); + p->max = maxpages; + p->pages = nr_good_pages; + nr_extents = setup_swap_extents(p, span); + if (nr_extents < 0) + return nr_extents; + nr_good_pages = p->pages; + } + if (!nr_good_pages) { + pr_warn("Empty swap-file\n"); + return -EINVAL; + } + + if (!cluster_info) + return nr_extents; + + for (i = 0; i < nr_clusters; i++) { + if (!cluster_count(&cluster_info[idx])) { + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&p->free_cluster_head)) { + cluster_set_next_flag(&p->free_cluster_head, + idx, 0); + cluster_set_next_flag(&p->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&p->free_cluster_tail); + cluster_set_next(&cluster_info[tail], idx); + cluster_set_next_flag(&p->free_cluster_tail, + idx, 0); + } + } + idx++; + if (idx == nr_clusters) + idx = 0; + } + return nr_extents; +} + +/* + * Helper to sys_swapon determining if a given swap + * backing device queue supports DISCARD operations. 
+ */ +static bool swap_discardable(struct swap_info_struct *si) +{ + struct request_queue *q = bdev_get_queue(si->bdev); + + if (!q || !blk_queue_discard(q)) + return false; + + return true; +} + +SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +{ + struct swap_info_struct *p; + struct filename *name; + struct file *swap_file = NULL; + struct address_space *mapping; + int i; + int prio; + int error; + union swap_header *swap_header; + int nr_extents; + sector_t span; + unsigned long maxpages; + unsigned char *swap_map = NULL; + struct swap_cluster_info *cluster_info = NULL; + unsigned long *frontswap_map = NULL; + struct page *page = NULL; + struct inode *inode = NULL; + + if (swap_flags & ~SWAP_FLAGS_VALID) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + p = alloc_swap_info(); + if (IS_ERR(p)) + return PTR_ERR(p); + + INIT_WORK(&p->discard_work, swap_discard_work); + + name = getname(specialfile); + if (IS_ERR(name)) { + error = PTR_ERR(name); + name = NULL; + goto bad_swap; + } + swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); + if (IS_ERR(swap_file)) { + error = PTR_ERR(swap_file); + swap_file = NULL; + goto bad_swap; + } + + p->swap_file = swap_file; + mapping = swap_file->f_mapping; + + for (i = 0; i < nr_swapfiles; i++) { + struct swap_info_struct *q = swap_info[i]; + + if (q == p || !q->swap_file) + continue; + if (mapping == q->swap_file->f_mapping) { + error = -EBUSY; + goto bad_swap; + } + } + + inode = mapping->host; + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ + error = claim_swapfile(p, inode); + if (unlikely(error)) + goto bad_swap; + + /* + * Read the swap header. 
+ */ + if (!mapping->a_ops->readpage) { + error = -EINVAL; + goto bad_swap; + } + page = read_mapping_page(mapping, 0, swap_file); + if (IS_ERR(page)) { + error = PTR_ERR(page); + goto bad_swap; + } + swap_header = kmap(page); + + maxpages = read_swap_header(p, swap_header, inode); + if (unlikely(!maxpages)) { + error = -EINVAL; + goto bad_swap; + } + + /* OK, set up the swap map and apply the bad block list */ + swap_map = vzalloc(maxpages); + if (!swap_map) { + error = -ENOMEM; + goto bad_swap; + } + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + /* + * select a random position to start with to help wear leveling + * SSD + */ + p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + + cluster_info = vzalloc(DIV_ROUND_UP(maxpages, + SWAPFILE_CLUSTER) * sizeof(*cluster_info)); + if (!cluster_info) { + error = -ENOMEM; + goto bad_swap; + } + p->percpu_cluster = alloc_percpu(struct percpu_cluster); + if (!p->percpu_cluster) { + error = -ENOMEM; + goto bad_swap; + } + for_each_possible_cpu(i) { + struct percpu_cluster *cluster; + cluster = per_cpu_ptr(p->percpu_cluster, i); + cluster_set_null(&cluster->index); + } + } + + error = swap_cgroup_swapon(p->type, maxpages); + if (error) + goto bad_swap; + + nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, + cluster_info, maxpages, &span); + if (unlikely(nr_extents < 0)) { + error = nr_extents; + goto bad_swap; + } + /* frontswap enabled? set up bit-per-page map for frontswap */ + if (frontswap_enabled) + frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); + + if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + /* + * When discard is enabled for swap with no particular + * policy flagged, we set all swap discard flags here in + * order to sustain backward compatibility with older + * swapon(8) releases. 
+ */ + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + SWP_PAGE_DISCARD); + + /* + * By flagging sys_swapon, a sysadmin can tell us to + * either do single-time area discards only, or to just + * perform discards for released swap page-clusters. + * Now it's time to adjust the p->flags accordingly. + */ + if (swap_flags & SWAP_FLAG_DISCARD_ONCE) + p->flags &= ~SWP_PAGE_DISCARD; + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) + p->flags &= ~SWP_AREA_DISCARD; + + /* issue a swapon-time discard if it's still required */ + if (p->flags & SWP_AREA_DISCARD) { + int err = discard_swap(p); + if (unlikely(err)) + pr_err("swapon: discard_swap(%p): %d\n", + p, err); + } + } + + mutex_lock(&swapon_mutex); + prio = -1; + if (swap_flags & SWAP_FLAG_PREFER) + prio = + (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; + enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); + + pr_info("Adding %uk swap on %s. " + "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", + p->pages<<(PAGE_SHIFT-10), name->name, p->prio, + nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), + (p->flags & SWP_SOLIDSTATE) ? "SS" : "", + (p->flags & SWP_DISCARDABLE) ? "D" : "", + (p->flags & SWP_AREA_DISCARD) ? "s" : "", + (p->flags & SWP_PAGE_DISCARD) ? "c" : "", + (frontswap_map) ? 
"FS" : ""); + + mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); + + if (S_ISREG(inode->i_mode)) + inode->i_flags |= S_SWAPFILE; + error = 0; + goto out; +bad_swap: + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; + if (inode && S_ISBLK(inode->i_mode) && p->bdev) { + set_blocksize(p->bdev, p->old_block_size); + blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + } + destroy_swap_extents(p); + swap_cgroup_swapoff(p->type); + spin_lock(&swap_lock); + p->swap_file = NULL; + p->flags = 0; + spin_unlock(&swap_lock); + vfree(swap_map); + vfree(cluster_info); + if (swap_file) { + if (inode && S_ISREG(inode->i_mode)) { + mutex_unlock(&inode->i_mutex); + inode = NULL; + } + filp_close(swap_file, NULL); + } +out: + if (page && !IS_ERR(page)) { + kunmap(page); + page_cache_release(page); + } + if (name) + putname(name); + if (inode && S_ISREG(inode->i_mode)) + mutex_unlock(&inode->i_mutex); + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int type; + unsigned long nr_to_be_unused = 0; + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *si = swap_info[type]; + + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) + nr_to_be_unused += si->inuse_pages; + } + val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; + val->totalswap = total_swap_pages + nr_to_be_unused; + spin_unlock(&swap_lock); +} + +/* + * Verify that a swap entry is valid and increment its swap map count. + * + * Returns error code in following case. + * - success -> 0 + * - swp_entry is invalid -> EINVAL + * - swp_entry is migration entry -> EINVAL + * - swap-cache reference is requested but there is already one. -> EEXIST + * - swap-cache reference is requested but the entry is not used. -> ENOENT + * - swap-mapped reference requested but needs continued swap count. 
-> ENOMEM
+ * - swp_entry is marked SWAP_MAP_BAD -> ENOENT
+ * - swap-mapped reference requested but the entry is unused -> ENOENT
+ */
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+{
+	struct swap_info_struct *p;
+	unsigned long offset, type;
+	unsigned char count;
+	unsigned char has_cache;
+	int err = -EINVAL;
+
+	if (non_swap_entry(entry))
+		goto out;
+
+	type = swp_type(entry);
+	if (type >= nr_swapfiles)
+		goto bad_file;
+	p = swap_info[type];
+	offset = swp_offset(entry);
+
+	spin_lock(&p->lock);
+	if (unlikely(offset >= p->max))		/* err is still -EINVAL here */
+		goto unlock_out;
+
+	count = p->swap_map[offset];
+
+	/*
+	 * swapin_readahead() doesn't check if a swap entry is valid, so the
+	 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
+	 */
+	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
+		err = -ENOENT;
+		goto unlock_out;
+	}
+
+	/* split the map byte into its cache flag and its count part */
+	has_cache = count & SWAP_HAS_CACHE;
+	count &= ~SWAP_HAS_CACHE;
+	err = 0;
+
+	if (usage == SWAP_HAS_CACHE) {
+
+		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
+		if (!has_cache && count)
+			has_cache = SWAP_HAS_CACHE;
+		else if (has_cache)		/* someone else added cache */
+			err = -EEXIST;
+		else				/* no users remaining */
+			err = -ENOENT;
+
+	} else if (count || has_cache) {
+
+		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+			count += usage;
+		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
+			err = -EINVAL;
+		else if (swap_count_continued(p, offset, count))	/* count is exactly at SWAP_MAP_MAX */
+			count = COUNT_CONTINUED;
+		else
+			err = -ENOMEM;	/* caller must add a continuation and retry */
+	} else
+		err = -ENOENT;			/* unused swap entry */
+
+	/* on the error paths above this writes back the unchanged value */
+	p->swap_map[offset] = count | has_cache;
+
+unlock_out:
+	spin_unlock(&p->lock);
+out:
+	return err;
+
+bad_file:
+	pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
+	goto out;
+}
+
+/*
+ * Help swapoff by noting that swap entry belongs to shmem/tmpfs
+ * (in which case its reference count is never incremented).
+ * The result of __swap_duplicate() is deliberately not returned.
+ */
+void swap_shmem_alloc(swp_entry_t entry)
+{
+	__swap_duplicate(entry, SWAP_MAP_SHMEM);
+}
+
+/*
+ * Increase reference count of swap entry by 1.
+ * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
+ * but could not be atomically allocated.  Returns 0, just as if it succeeded,
+ * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
+ * might occur if a page table entry has got corrupted.
+ *
+ * The increment is retried for as long as __swap_duplicate() reports -ENOMEM
+ * and add_swap_count_continuation(, GFP_ATOMIC) keeps returning 0.
+ */
+int swap_duplicate(swp_entry_t entry)
+{
+	int err = 0;
+
+	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+		err = add_swap_count_continuation(entry, GFP_ATOMIC);
+	return err;
+}
+
+/*
+ * @entry: swap entry for which we allocate swap cache.
+ *
+ * Called when allocating swap cache for an existing swap entry.
+ * This can return error codes. Returns 0 on success.
+ * -EEXIST means there is already a swap cache for the entry;
+ * -ENOENT means the entry is no longer in use.
+ * Note: unlike swap_duplicate(), the error from __swap_duplicate() is
+ * passed back to the caller unchanged.
+ */
+int swapcache_prepare(swp_entry_t entry)
+{
+	return __swap_duplicate(entry, SWAP_HAS_CACHE);
+}
+
+struct swap_info_struct *page_swap_info(struct page *page)
+{
+	swp_entry_t swap = { .val = page_private(page) };	/* page_private() is read as a swap entry */
+	BUG_ON(!PageSwapCache(page));	/* only valid for swap-cache pages */
+	return swap_info[swp_type(swap)];
+}
+
+/*
+ * out-of-line __page_file_ methods to avoid include hell.
+ * Both expect a swap-cache page (checked with VM_BUG_ON_PAGE).
+ */
+struct address_space *__page_file_mapping(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+	return page_swap_info(page)->swap_file->f_mapping;
+}
+EXPORT_SYMBOL_GPL(__page_file_mapping);
+
+pgoff_t __page_file_index(struct page *page)
+{
+	swp_entry_t swap = { .val = page_private(page) };
+	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+	return swp_offset(swap);	/* offset of the entry within its swap area */
+}
+EXPORT_SYMBOL_GPL(__page_file_index);
+
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ *
+ * Returns -ENOMEM only when a continuation page is needed and the page
+ * allocation failed; returns 0 in every other case, including when the
+ * entry has gone away or no new page turned out to be necessary (the
+ * speculatively allocated page is then freed again).
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+	struct swap_info_struct *si;
+	struct page *head;
+	struct page *page;
+	struct page *list_page;
+	pgoff_t offset;
+	unsigned char count;
+
+	/*
+	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
+	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
+	 * The allocation result is only checked further down, once we know
+	 * that the page is really needed.
+	 */
+	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+	si = swap_info_get(entry);	/* on success si->lock is held; it is dropped at "out" */
+	if (!si) {
+		/*
+		 * An acceptable race has occurred since the failing
+		 * __swap_duplicate(): the swap entry has been freed,
+		 * perhaps even the whole swap_map cleared for swapoff.
+		 */
+		goto outer;
+	}
+
+	offset = swp_offset(entry);
+	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+
+	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+		/*
+		 * The higher the swap count, the more likely it is that tasks
+		 * will race to add swap count continuation: we need to avoid
+		 * over-provisioning.
+		 */
+		goto out;
+	}
+
+	if (!page) {
+		spin_unlock(&si->lock);
+		return -ENOMEM;
+	}
+
+	/*
+	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+	 * no architecture is using highmem pages for kernel page tables: so it
+	 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
+	 */
+	head = vmalloc_to_page(si->swap_map + offset);
+	offset &= ~PAGE_MASK;	/* from here on: byte offset within a page */
+
+	/*
+	 * Page allocation does not initialize the page's lru field,
+	 * but it does always reset its private field.
+	 */
+	if (!page_private(head)) {
+		BUG_ON(count & COUNT_CONTINUED);
+		INIT_LIST_HEAD(&head->lru);
+		set_page_private(head, SWP_CONTINUED);
+		si->flags |= SWP_CONTINUED;
+	}
+
+	list_for_each_entry(list_page, &head->lru, lru) {
+		unsigned char *map;
+
+		/*
+		 * If the previous map said no continuation, but we've found
+		 * a continuation page, free our allocation and use this one.
+		 */
+		if (!(count & COUNT_CONTINUED))
+			goto out;
+
+		map = kmap_atomic(list_page) + offset;
+		count = *map;
+		kunmap_atomic(map);
+
+		/*
+		 * If this continuation count now has some space in it,
+		 * free our allocation and use this one.
+		 */
+		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+			goto out;
+	}
+
+	list_add_tail(&page->lru, &head->lru);
+	page = NULL;			/* now it's attached, don't free it */
+out:
+	spin_unlock(&si->lock);
+outer:
+	if (page)
+		__free_page(page);
+	return 0;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called with si->lock held: __swap_duplicate() takes that lock before
+ * calling this (swap_entry_free() presumably does the same - confirm).
+ */ +static bool swap_count_continued(struct swap_info_struct *si, + pgoff_t offset, unsigned char count) +{ + struct page *head; + struct page *page; + unsigned char *map; + + head = vmalloc_to_page(si->swap_map + offset); + if (page_private(head) != SWP_CONTINUED) { + BUG_ON(count & COUNT_CONTINUED); + return false; /* need to add count continuation */ + } + + offset &= ~PAGE_MASK; + page = list_entry(head->lru.next, struct page, lru); + map = kmap_atomic(page) + offset; + + if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ + goto init_map; /* jump over SWAP_CONT_MAX checks */ + + if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ + /* + * Think of how you add 1 to 999 + */ + while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { + kunmap_atomic(map); + page = list_entry(page->lru.next, struct page, lru); + BUG_ON(page == head); + map = kmap_atomic(page) + offset; + } + if (*map == SWAP_CONT_MAX) { + kunmap_atomic(map); + page = list_entry(page->lru.next, struct page, lru); + if (page == head) + return false; /* add count continuation */ + map = kmap_atomic(page) + offset; +init_map: *map = 0; /* we didn't zero the page */ + } + *map += 1; + kunmap_atomic(map); + page = list_entry(page->lru.prev, struct page, lru); + while (page != head) { + map = kmap_atomic(page) + offset; + *map = COUNT_CONTINUED; + kunmap_atomic(map); + page = list_entry(page->lru.prev, struct page, lru); + } + return true; /* incremented */ + + } else { /* decrementing */ + /* + * Think of how you subtract 1 from 1000 + */ + BUG_ON(count != COUNT_CONTINUED); + while (*map == COUNT_CONTINUED) { + kunmap_atomic(map); + page = list_entry(page->lru.next, struct page, lru); + BUG_ON(page == head); + map = kmap_atomic(page) + offset; + } + BUG_ON(*map == 0); + *map -= 1; + if (*map == 0) + count = 0; + kunmap_atomic(map); + page = list_entry(page->lru.prev, struct page, lru); + while (page != head) { + map = kmap_atomic(page) + offset; + *map = SWAP_CONT_MAX | count; 
+ count = COUNT_CONTINUED; + kunmap_atomic(map); + page = list_entry(page->lru.prev, struct page, lru); + } + return count == COUNT_CONTINUED; + } +} + +/* + * free_swap_count_continuations - swapoff free all the continuation pages + * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. + */ +static void free_swap_count_continuations(struct swap_info_struct *si) +{ + pgoff_t offset; + + for (offset = 0; offset < si->max; offset += PAGE_SIZE) { + struct page *head; + head = vmalloc_to_page(si->swap_map + offset); + if (page_private(head)) { + struct list_head *this, *next; + list_for_each_safe(this, next, &head->lru) { + struct page *page; + page = list_entry(this, struct page, lru); + list_del(this); + __free_page(page); + } + } + } +} diff --git a/kernel/mm/truncate.c b/kernel/mm/truncate.c new file mode 100644 index 000000000..09598db42 --- /dev/null +++ b/kernel/mm/truncate.c @@ -0,0 +1,800 @@ +/* + * mm/truncate.c - code for taking down pages from address_spaces + * + * Copyright (C) 2002, Linus Torvalds + * + * 10Sep2002 Andrew Morton + * Initial version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* grr. try_to_release_page, + do_invalidatepage */ +#include +#include +#include "internal.h" + +static void clear_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + struct radix_tree_node *node; + void **slot; + + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. 
+ */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrshadows--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) { + local_lock(workingset_shadow_lock); + list_lru_del(&__workingset_shadow_nodes, &node->private_list); + local_unlock(workingset_shadow_lock); + } + __radix_tree_delete_node(&mapping->page_tree, node); +unlock: + spin_unlock_irq(&mapping->tree_lock); +} + +/** + * do_invalidatepage - invalidate part or all of a page + * @page: the page which is affected + * @offset: start of the range to invalidate + * @length: length of the range to invalidate + * + * do_invalidatepage() is called when all or part of the page has become + * invalidated by a truncate operation. + * + * do_invalidatepage() does not have to release all buffers, but it must + * ensure that no dirty buffer is left outside @offset and that no I/O + * is underway against any of the blocks which are outside the truncation + * point. Because the caller is about to free (and possibly reuse) those + * blocks on-disk. + */ +void do_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + void (*invalidatepage)(struct page *, unsigned int, unsigned int); + + invalidatepage = page->mapping->a_ops->invalidatepage; +#ifdef CONFIG_BLOCK + if (!invalidatepage) + invalidatepage = block_invalidatepage; +#endif + if (invalidatepage) + (*invalidatepage)(page, offset, length); +} + +/* + * If truncate cannot remove the fs-private metadata from the page, the page + * becomes orphaned. 
It will be left on the LRU and may even be mapped into + * user pagetables if we're racing with filemap_fault(). + * + * We need to bale out if page->mapping is no longer equal to the original + * mapping. This happens a) when the VM reclaimed the page while we waited on + * its lock, b) when a concurrent invalidate_mapping_pages got there first and + * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. + */ +static int +truncate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return -EIO; + + if (page_has_private(page)) + do_invalidatepage(page, 0, PAGE_CACHE_SIZE); + + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * Hence dirty accounting check is placed after invalidation. + */ + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); + + ClearPageMappedToDisk(page); + delete_from_page_cache(page); + return 0; +} + +/* + * This is for invalidate_mapping_pages(). That function can be called at + * any time, and is not supposed to throw away dirty pages. But pages can + * be marked dirty at any time too, so use remove_mapping which safely + * discards clean, unused pages. + * + * Returns non-zero if the page was successfully invalidated. + */ +static int +invalidate_complete_page(struct address_space *mapping, struct page *page) +{ + int ret; + + if (page->mapping != mapping) + return 0; + + if (page_has_private(page) && !try_to_release_page(page, 0)) + return 0; + + ret = remove_mapping(mapping, page); + + return ret; +} + +int truncate_inode_page(struct address_space *mapping, struct page *page) +{ + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } + return truncate_complete_page(mapping, page); +} + +/* + * Used to get rid of pages on hardware memory corruption. 
+ */ +int generic_error_remove_page(struct address_space *mapping, struct page *page) +{ + if (!mapping) + return -EINVAL; + /* + * Only punch for normal data pages for now. + * Handling other types like directories would need more auditing. + */ + if (!S_ISREG(mapping->host->i_mode)) + return -EIO; + return truncate_inode_page(mapping, page); +} +EXPORT_SYMBOL(generic_error_remove_page); + +/* + * Safely invalidate one page from its pagecache mapping. + * It only drops clean, unused pages. The page must be locked. + * + * Returns 1 if the page is successfully invalidated, otherwise 0. + */ +int invalidate_inode_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + if (!mapping) + return 0; + if (PageDirty(page) || PageWriteback(page)) + return 0; + if (page_mapped(page)) + return 0; + return invalidate_complete_page(mapping, page); +} + +/** + * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * @lend: offset to which to truncate (inclusive) + * + * Truncate the page cache, removing the pages that are between + * specified offsets (and zeroing out partial pages + * if lstart or lend + 1 is not page aligned). + * + * Truncate takes two passes - the first pass is nonblocking. It will not + * block on page locks and it will not block on writeback. The second pass + * will wait. This is to prevent as much IO as possible in the affected region. + * The first pass will remove most pages, so the search cost of the second pass + * is low. + * + * We pass down the cache-hot hint to the page freeing code. Even if the + * mapping is large, it is probably the case that the final pages are the most + * recently touched, and freeing happens in ascending file offset order. 
+ * + * Note that since ->invalidatepage() accepts range to invalidate + * truncate_inode_pages_range is able to handle cases where lend + 1 is not + * page aligned properly. + */ +void truncate_inode_pages_range(struct address_space *mapping, + loff_t lstart, loff_t lend) +{ + pgoff_t start; /* inclusive */ + pgoff_t end; /* exclusive */ + unsigned int partial_start; /* inclusive */ + unsigned int partial_end; /* exclusive */ + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t index; + int i; + + cleancache_invalidate_inode(mapping); + if (mapping->nrpages == 0 && mapping->nrshadows == 0) + return; + + /* Offsets within partial pages */ + partial_start = lstart & (PAGE_CACHE_SIZE - 1); + partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); + + /* + * 'start' and 'end' always cover the range of pages to be fully + * truncated. Partial pages are covered with 'partial_start' at the + * start of the range and 'partial_end' at the end of the range. + * Note that 'end' is exclusive while 'lend' is inclusive. + */ + start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (lend == -1) + /* + * lend == -1 indicates end-of-file so we have to set 'end' + * to the highest possible pgoff_t and since the type is + * unsigned we're using -1. 
+ */ + end = -1; + else + end = (lend + 1) >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + index = start; + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + /* We rely upon deletion not changing page->index */ + index = indices[i]; + if (index >= end) + break; + + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); + if (PageWriteback(page)) { + unlock_page(page); + continue; + } + truncate_inode_page(mapping, page); + unlock_page(page); + } + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + cond_resched(); + index++; + } + + if (partial_start) { + struct page *page = find_lock_page(mapping, start - 1); + if (page) { + unsigned int top = PAGE_CACHE_SIZE; + if (start > end) { + /* Truncation within a single page */ + top = partial_end; + partial_end = 0; + } + wait_on_page_writeback(page); + zero_user_segment(page, partial_start, top); + cleancache_invalidate_page(mapping, page); + if (page_has_private(page)) + do_invalidatepage(page, partial_start, + top - partial_start); + unlock_page(page); + page_cache_release(page); + } + } + if (partial_end) { + struct page *page = find_lock_page(mapping, end); + if (page) { + wait_on_page_writeback(page); + zero_user_segment(page, 0, partial_end); + cleancache_invalidate_page(mapping, page); + if (page_has_private(page)) + do_invalidatepage(page, 0, + partial_end); + unlock_page(page); + page_cache_release(page); + } + } + /* + * If the truncation happened within a single page no pages + * will be released, just zeroed, so we can bail out now. 
+ */ + if (start >= end) + return; + + index = start; + for ( ; ; ) { + cond_resched(); + if (!pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + /* If all gone from start onwards, we're done */ + if (index == start) + break; + /* Otherwise restart to make sure all gone */ + index = start; + continue; + } + if (index == start && indices[0] >= end) { + /* All gone out of hole to be punched, we're done */ + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + break; + } + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + /* We rely upon deletion not changing page->index */ + index = indices[i]; + if (index >= end) { + /* Restart punch to make sure all gone */ + index = start - 1; + break; + } + + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + + lock_page(page); + WARN_ON(page->index != index); + wait_on_page_writeback(page); + truncate_inode_page(mapping, page); + unlock_page(page); + } + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + index++; + } + cleancache_invalidate_inode(mapping); +} +EXPORT_SYMBOL(truncate_inode_pages_range); + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * + * Called under (and serialised by) inode->i_mutex. + * + * Note: When this function returns, there can be a page in the process of + * deletion (inside __delete_from_page_cache()) in the specified range. Thus + * mapping->nrpages can be non-zero when this function returns even after + * truncation of the whole mapping. 
+ */ +void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +{ + truncate_inode_pages_range(mapping, lstart, (loff_t)-1); +} +EXPORT_SYMBOL(truncate_inode_pages); + +/** + * truncate_inode_pages_final - truncate *all* pages before inode dies + * @mapping: mapping to truncate + * + * Called under (and serialized by) inode->i_mutex. + * + * Filesystems have to use this in the .evict_inode path to inform the + * VM that this is the final truncate and the inode is going away. + */ +void truncate_inode_pages_final(struct address_space *mapping) +{ + unsigned long nrshadows; + unsigned long nrpages; + + /* + * Page reclaim can not participate in regular inode lifetime + * management (can't call iput()) and thus can race with the + * inode teardown. Tell it when the address space is exiting, + * so that it does not install eviction information after the + * final truncate has begun. + */ + mapping_set_exiting(mapping); + + /* + * When reclaim installs eviction entries, it increases + * nrshadows first, then decreases nrpages. Make sure we see + * this in the right order or we might miss an entry. + */ + nrpages = mapping->nrpages; + smp_rmb(); + nrshadows = mapping->nrshadows; + + if (nrpages || nrshadows) { + /* + * As truncation uses a lockless tree lookup, cycle + * the tree lock to make sure any ongoing tree + * modification that does not see AS_EXITING is + * completed before starting the final truncate. 
+ */ + spin_lock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + + truncate_inode_pages(mapping, 0); + } +} +EXPORT_SYMBOL(truncate_inode_pages_final); + +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + pgoff_t indices[PAGEVEC_SIZE]; + struct pagevec pvec; + pgoff_t index = start; + unsigned long ret; + unsigned long count = 0; + int i; + + pagevec_init(&pvec, 0); + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + /* We rely upon deletion not changing page->index */ + index = indices[i]; + if (index > end) + break; + + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); + ret = invalidate_inode_page(page); + unlock_page(page); + /* + * Invalidation is a hint that the page is no longer + * of interest and try to speed up its reclaim. 
+ */ + if (!ret) + deactivate_file_page(page); + count += ret; + } + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + cond_resched(); + index++; + } + return count; +} +EXPORT_SYMBOL(invalidate_mapping_pages); + +/* + * This is like invalidate_complete_page(), except it ignores the page's + * refcount. We do this because invalidate_inode_pages2() needs stronger + * invalidation guarantees, and cannot afford to leave pages behind because + * shrink_page_list() has a temp ref on them, or because they're transiently + * sitting in the lru_cache_add() pagevecs. + */ +static int +invalidate_complete_page2(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return 0; + + if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) + return 0; + + spin_lock_irq(&mapping->tree_lock); + if (PageDirty(page)) + goto failed; + + BUG_ON(page_has_private(page)); + __delete_from_page_cache(page, NULL); + spin_unlock_irq(&mapping->tree_lock); + + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(page); + + page_cache_release(page); /* pagecache ref */ + return 1; +failed: + spin_unlock_irq(&mapping->tree_lock); + return 0; +} + +static int do_launder_page(struct address_space *mapping, struct page *page) +{ + if (!PageDirty(page)) + return 0; + if (page->mapping != mapping || mapping->a_ops->launder_page == NULL) + return 0; + return mapping->a_ops->launder_page(page); +} + +/** + * invalidate_inode_pages2_range - remove range of pages from an address_space + * @mapping: the address_space + * @start: the page offset 'from' which to invalidate + * @end: the page offset 'to' which to invalidate (inclusive) + * + * Any pages which are found to be mapped into pagetables are unmapped prior to + * invalidation. + * + * Returns -EBUSY if any pages could not be invalidated. 
+ */ +int invalidate_inode_pages2_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + pgoff_t indices[PAGEVEC_SIZE]; + struct pagevec pvec; + pgoff_t index; + int i; + int ret = 0; + int ret2 = 0; + int did_range_unmap = 0; + + cleancache_invalidate_inode(mapping); + pagevec_init(&pvec, 0); + index = start; + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + /* We rely upon deletion not changing page->index */ + index = indices[i]; + if (index > end) + break; + + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + + lock_page(page); + WARN_ON(page->index != index); + if (page->mapping != mapping) { + unlock_page(page); + continue; + } + wait_on_page_writeback(page); + if (page_mapped(page)) { + if (!did_range_unmap) { + /* + * Zap the rest of the file in one hit. + */ + unmap_mapping_range(mapping, + (loff_t)index << PAGE_CACHE_SHIFT, + (loff_t)(1 + end - index) + << PAGE_CACHE_SHIFT, + 0); + did_range_unmap = 1; + } else { + /* + * Just zap this page + */ + unmap_mapping_range(mapping, + (loff_t)index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } + } + BUG_ON(page_mapped(page)); + ret2 = do_launder_page(mapping, page); + if (ret2 == 0) { + if (!invalidate_complete_page2(mapping, page)) + ret2 = -EBUSY; + } + if (ret2 < 0) + ret = ret2; + unlock_page(page); + } + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + cond_resched(); + index++; + } + cleancache_invalidate_inode(mapping); + return ret; +} +EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); + +/** + * invalidate_inode_pages2 - remove all pages from an address_space + * @mapping: the address_space + * + * Any pages which are found to be mapped into pagetables are unmapped prior to + * invalidation. 
+ * + * Returns -EBUSY if any pages could not be invalidated. + */ +int invalidate_inode_pages2(struct address_space *mapping) +{ + return invalidate_inode_pages2_range(mapping, 0, -1); +} +EXPORT_SYMBOL_GPL(invalidate_inode_pages2); + +/** + * truncate_pagecache - unmap and remove pagecache that has been truncated + * @inode: inode + * @newsize: new file size + * + * inode's new i_size must already be written before truncate_pagecache + * is called. + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache(struct inode *inode, loff_t newsize) +{ + struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(newsize, PAGE_SIZE); + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, holebegin, 0, 1); + truncate_inode_pages(mapping, newsize); + unmap_mapping_range(mapping, holebegin, 0, 1); +} +EXPORT_SYMBOL(truncate_pagecache); + +/** + * truncate_setsize - update inode and pagecache for a new file size + * @inode: inode + * @newsize: new file size + * + * truncate_setsize updates i_size and performs pagecache truncation (if + * necessary) to @newsize. It will typically be called from the filesystem's + * setattr function when ATTR_SIZE is passed in. 
+ * + * Must be called with a lock serializing truncates and writes (generally + * i_mutex but e.g. xfs uses a different lock) and before all filesystem + * specific block truncation has been performed. + */ +void truncate_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize = inode->i_size; + + i_size_write(inode, newsize); + if (newsize > oldsize) + pagecache_isize_extended(inode, oldsize, newsize); + truncate_pagecache(inode, newsize); +} +EXPORT_SYMBOL(truncate_setsize); + +/** + * pagecache_isize_extended - update pagecache after extension of i_size + * @inode: inode for which i_size was extended + * @from: original inode size + * @to: new inode size + * + * Handle extension of inode size either caused by extending truncate or by + * write starting after current i_size. We mark the page straddling current + * i_size RO so that page_mkwrite() is called on the nearest write access to + * the page. This way filesystem can be sure that page_mkwrite() is called on + * the page before user writes to the page via mmap after the i_size has been + * changed. + * + * The function must be called after i_size is updated so that page fault + * coming after we unlock the page will already see the new i_size. + * The function must be called while we still hold i_mutex - this not only + * makes sure i_size is stable but also that userspace cannot observe new + * i_size value before we are prepared to store mmap writes at new inode size. + */ +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) +{ + int bsize = 1 << inode->i_blkbits; + loff_t rounded_from; + struct page *page; + pgoff_t index; + + WARN_ON(to > inode->i_size); + + if (from >= to || bsize == PAGE_CACHE_SIZE) + return; + /* Page straddling @from will not have any hole block created? 
*/ + rounded_from = round_up(from, bsize); + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) + return; + + index = from >> PAGE_CACHE_SHIFT; + page = find_lock_page(inode->i_mapping, index); + /* Page not cached? Nothing to do */ + if (!page) + return; + /* + * See clear_page_dirty_for_io() for details why set_page_dirty() + * is needed. + */ + if (page_mkclean(page)) + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); +} +EXPORT_SYMBOL(pagecache_isize_extended); + +/** + * truncate_pagecache_range - unmap and remove pagecache that is hole-punched + * @inode: inode + * @lstart: offset of beginning of hole + * @lend: offset of last byte of hole + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + struct address_space *mapping = inode->i_mapping; + loff_t unmap_start = round_up(lstart, PAGE_SIZE); + loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; + /* + * This rounding is currently just for example: unmap_mapping_range + * expands its hole outwards, whereas we want it to contract the hole + * inwards. However, existing callers of truncate_pagecache_range are + * doing their own page rounding first. Note that unmap_mapping_range + * allows holelen 0 for all, and we allow lend -1 for end of file. + */ + + /* + * Unlike in truncate_pagecache, unmap_mapping_range is called only + * once (before truncating pagecache), and without "even_cows" flag: + * hole-punching should not remove private COWed pages from the hole. 
+ */ + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + truncate_inode_pages_range(mapping, lstart, lend); +} +EXPORT_SYMBOL(truncate_pagecache_range); diff --git a/kernel/mm/util.c b/kernel/mm/util.c new file mode 100644 index 000000000..68ff8a536 --- /dev/null +++ b/kernel/mm/util.c @@ -0,0 +1,465 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "internal.h" + +static inline int is_kernel_rodata(unsigned long addr) +{ + return addr >= (unsigned long)__start_rodata && + addr < (unsigned long)__end_rodata; +} + +/** + * kfree_const - conditionally free memory + * @x: pointer to the memory + * + * Function calls kfree only if @x is not in .rodata section. + */ +void kfree_const(const void *x) +{ + if (!is_kernel_rodata((unsigned long)x)) + kfree(x); +} +EXPORT_SYMBOL(kfree_const); + +/** + * kstrdup - allocate space for and copy an existing string + * @s: the string to duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + */ +char *kstrdup(const char *s, gfp_t gfp) +{ + size_t len; + char *buf; + + if (!s) + return NULL; + + len = strlen(s) + 1; + buf = kmalloc_track_caller(len, gfp); + if (buf) + memcpy(buf, s, len); + return buf; +} +EXPORT_SYMBOL(kstrdup); + +/** + * kstrdup_const - conditionally duplicate an existing const string + * @s: the string to duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Function returns source string if it is in .rodata section otherwise it + * falls back to kstrdup. + * Strings allocated by kstrdup_const should be freed by kfree_const. 
+ */ +const char *kstrdup_const(const char *s, gfp_t gfp) +{ + if (is_kernel_rodata((unsigned long)s)) + return s; + + return kstrdup(s, gfp); +} +EXPORT_SYMBOL(kstrdup_const); + +/** + * kstrndup - allocate space for and copy an existing string + * @s: the string to duplicate + * @max: read at most @max chars from @s + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + */ +char *kstrndup(const char *s, size_t max, gfp_t gfp) +{ + size_t len; + char *buf; + + if (!s) + return NULL; + + len = strnlen(s, max); + buf = kmalloc_track_caller(len+1, gfp); + if (buf) { + memcpy(buf, s, len); + buf[len] = '\0'; + } + return buf; +} +EXPORT_SYMBOL(kstrndup); + +/** + * kmemdup - duplicate region of memory + * + * @src: memory region to duplicate + * @len: memory region length + * @gfp: GFP mask to use + */ +void *kmemdup(const void *src, size_t len, gfp_t gfp) +{ + void *p; + + p = kmalloc_track_caller(len, gfp); + if (p) + memcpy(p, src, len); + return p; +} +EXPORT_SYMBOL(kmemdup); + +/** + * memdup_user - duplicate memory region from user space + * + * @src: source address in user space + * @len: number of bytes to copy + * + * Returns an ERR_PTR() on failure. + */ +void *memdup_user(const void __user *src, size_t len) +{ + void *p; + + /* + * Always use GFP_KERNEL, since copy_from_user() can sleep and + * cause pagefault, which makes it pointless to use GFP_NOFS + * or GFP_ATOMIC. + */ + p = kmalloc_track_caller(len, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, src, len)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + + return p; +} +EXPORT_SYMBOL(memdup_user); + +/* + * strndup_user - duplicate an existing string from user space + * @s: The string to duplicate + * @n: Maximum number of bytes to copy, including the trailing NUL. 
+ */ +char *strndup_user(const char __user *s, long n) +{ + char *p; + long length; + + length = strnlen_user(s, n); + + if (!length) + return ERR_PTR(-EFAULT); + + if (length > n) + return ERR_PTR(-EINVAL); + + p = memdup_user(s, length); + + if (IS_ERR(p)) + return p; + + p[length - 1] = '\0'; + + return p; +} +EXPORT_SYMBOL(strndup_user); + +void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, struct rb_node *rb_parent) +{ + struct vm_area_struct *next; + + vma->vm_prev = prev; + if (prev) { + next = prev->vm_next; + prev->vm_next = vma; + } else { + mm->mmap = vma; + if (rb_parent) + next = rb_entry(rb_parent, + struct vm_area_struct, vm_rb); + else + next = NULL; + } + vma->vm_next = next; + if (next) + next->vm_prev = vma; +} + +/* Check if the vma is being used as a stack by this task */ +static int vm_is_stack_for_task(struct task_struct *t, + struct vm_area_struct *vma) +{ + return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); +} + +/* + * Check if the vma is being used as a stack. + * If in_group is true, check in the entire thread group or else + * just check in the current task. Returns the task_struct of the task + * that the vma is stack for. Must be called under rcu_read_lock(). + */ +struct task_struct *task_of_stack(struct task_struct *task, + struct vm_area_struct *vma, bool in_group) +{ + if (vm_is_stack_for_task(task, vma)) + return task; + + if (in_group) { + struct task_struct *t; + + for_each_thread(task, t) { + if (vm_is_stack_for_task(t, vma)) + return t; + } + } + + return NULL; +} + +#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; +} +#endif + +/* + * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall + * back to the regular GUP. 
+ * If the architecture does not support this function, simply return with no + * page pinned + */ +int __weak __get_user_pages_fast(unsigned long start, + int nr_pages, int write, struct page **pages) +{ + return 0; +} +EXPORT_SYMBOL_GPL(__get_user_pages_fast); + +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. + * + * get_user_pages_fast provides equivalent functionality to get_user_pages, + * operating on current and current->mm, with force=0 and vma=NULL. However + * unlike get_user_pages, it must be called without mmap_sem held. + * + * get_user_pages_fast may take mmap_sem and page table locks, so no + * assumptions can be made about lack of locking. get_user_pages_fast is to be + * implemented in a way that is advantageous (vs get_user_pages()) when the + * user memory area is already faulted in and present in ptes. However if the + * pages have to be faulted in, it may turn out to be slightly slower so + * callers need to carefully consider what to use. On many architectures, + * get_user_pages_fast simply falls back to get_user_pages. 
+ */ +int __weak get_user_pages_fast(unsigned long start, + int nr_pages, int write, struct page **pages) +{ + struct mm_struct *mm = current->mm; + return get_user_pages_unlocked(current, mm, start, nr_pages, + write, 0, pages); +} +EXPORT_SYMBOL_GPL(get_user_pages_fast); + +unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff) +{ + unsigned long ret; + struct mm_struct *mm = current->mm; + unsigned long populate; + + ret = security_mmap_file(file, prot, flag); + if (!ret) { + down_write(&mm->mmap_sem); + ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, + &populate); + up_write(&mm->mmap_sem); + if (populate) + mm_populate(ret, populate); + } + return ret; +} + +unsigned long vm_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + if (unlikely(offset + PAGE_ALIGN(len) < offset)) + return -EINVAL; + if (unlikely(offset & ~PAGE_MASK)) + return -EINVAL; + + return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +} +EXPORT_SYMBOL(vm_mmap); + +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + +static inline void *__page_rmapping(struct page *page) +{ + unsigned long mapping; + + mapping = (unsigned long)page->mapping; + mapping &= ~PAGE_MAPPING_FLAGS; + + return (void *)mapping; +} + +/* Neutral page->mapping pointer to address_space or anon_vma or other */ +void *page_rmapping(struct page *page) +{ + page = compound_head(page); + return __page_rmapping(page); +} + +struct anon_vma *page_anon_vma(struct page *page) +{ + unsigned long mapping; + + page = compound_head(page); + mapping = (unsigned long)page->mapping; + if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + return NULL; + return __page_rmapping(page); +} + +struct address_space *page_mapping(struct page *page) +{ + unsigned 
long mapping; + + /* This happens if someone calls flush_dcache_page on slab page */ + if (unlikely(PageSlab(page))) + return NULL; + + if (unlikely(PageSwapCache(page))) { + swp_entry_t entry; + + entry.val = page_private(page); + return swap_address_space(entry); + } + + mapping = (unsigned long)page->mapping; + if (mapping & PAGE_MAPPING_FLAGS) + return NULL; + return page->mapping; +} + +int overcommit_ratio_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_kbytes = 0; + return ret; +} + +int overcommit_kbytes_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_ratio = 0; + return ret; +} + +/* + * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used + */ +unsigned long vm_commit_limit(void) +{ + unsigned long allowed; + + if (sysctl_overcommit_kbytes) + allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); + else + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100); + allowed += total_swap_pages; + + return allowed; +} + +/** + * get_cmdline() - copy the cmdline value to a buffer. + * @task: the task whose cmdline value to copy. + * @buffer: the buffer to copy to. + * @buflen: the length of the buffer. Larger cmdline values are truncated + * to this length. + * Returns the size of the cmdline field copied. Note that the copy does + * not guarantee an ending NUL byte. + */ +int get_cmdline(struct task_struct *task, char *buffer, int buflen) +{ + int res = 0; + unsigned int len; + struct mm_struct *mm = get_task_mm(task); + if (!mm) + goto out; + if (!mm->arg_end) + goto out_mm; /* Shh! 
No looking before we're done */ + + len = mm->arg_end - mm->arg_start; + + if (len > buflen) + len = buflen; + + res = access_process_vm(task, mm->arg_start, buffer, len, 0); + + /* + * If the nul at the end of args has been overwritten, then + * assume application is using setproctitle(3). + */ + if (res > 0 && buffer[res-1] != '\0' && len < buflen) { + len = strnlen(buffer, res); + if (len < res) { + res = len; + } else { + len = mm->env_end - mm->env_start; + if (len > buflen - res) + len = buflen - res; + res += access_process_vm(task, mm->env_start, + buffer+res, len, 0); + res = strnlen(buffer, res); + } + } +out_mm: + mmput(mm); +out: + return res; +} diff --git a/kernel/mm/vmacache.c b/kernel/mm/vmacache.c new file mode 100644 index 000000000..b6e3662fe --- /dev/null +++ b/kernel/mm/vmacache.c @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2014 Davidlohr Bueso. + */ +#include +#include +#include + +/* + * Flush vma caches for threads that share a given mm. + * + * The operation is safe because the caller holds the mmap_sem + * exclusively and other threads accessing the vma cache will + * have mmap_sem held at least for read, so no extra locking + * is required to maintain the vma cache. + */ +void vmacache_flush_all(struct mm_struct *mm) +{ + struct task_struct *g, *p; + + count_vm_vmacache_event(VMACACHE_FULL_FLUSHES); + + /* + * Single threaded tasks need not iterate the entire + * list of processes. We can avoid the flushing as well + * since the mm's seqnum was increased and don't have + * to worry about other threads' seqnum. Current's + * flush will occur upon the next lookup. + */ + if (atomic_read(&mm->mm_users) == 1) + return; + + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * Only flush the vmacache pointers as the + * mm seqnum is already set and curr's will + * be set upon invalidation when the next + * lookup is done. 
+ */ + if (mm == p->mm) + vmacache_flush(p); + } + rcu_read_unlock(); +} + +/* + * This task may be accessing a foreign mm via (for example) + * get_user_pages()->find_vma(). The vmacache is task-local and this + * task's vmacache pertains to a different mm (ie, its own). There is + * nothing we can do here. + * + * Also handle the case where a kernel thread has adopted this mm via use_mm(). + * That kernel thread's vmacache is not applicable to this mm. + */ +static bool vmacache_valid_mm(struct mm_struct *mm) +{ + return current->mm == mm && !(current->flags & PF_KTHREAD); +} + +void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) +{ + if (vmacache_valid_mm(newvma->vm_mm)) + current->vmacache[VMACACHE_HASH(addr)] = newvma; +} + +static bool vmacache_valid(struct mm_struct *mm) +{ + struct task_struct *curr; + + if (!vmacache_valid_mm(mm)) + return false; + + curr = current; + if (mm->vmacache_seqnum != curr->vmacache_seqnum) { + /* + * First attempt will always be invalid, initialize + * the new cache for this task here. 
+ */ + curr->vmacache_seqnum = mm->vmacache_seqnum; + vmacache_flush(curr); + return false; + } + return true; +} + +struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (!vma) + continue; + if (WARN_ON_ONCE(vma->vm_mm != mm)) + break; + if (vma->vm_start <= addr && vma->vm_end > addr) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); + return vma; + } + } + + return NULL; +} + +#ifndef CONFIG_MMU +struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (vma && vma->vm_start == start && vma->vm_end == end) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); + return vma; + } + } + + return NULL; +} +#endif diff --git a/kernel/mm/vmalloc.c b/kernel/mm/vmalloc.c new file mode 100644 index 000000000..f87a29f1e --- /dev/null +++ b/kernel/mm/vmalloc.c @@ -0,0 +1,2742 @@ +/* + * linux/mm/vmalloc.c + * + * Copyright (C) 1993 Linus Torvalds + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian , May 2000 + * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 + * Numa awareness, Christoph Lameter, SGI, June 2005 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct vfree_deferred { + struct llist_head list; + struct work_struct wq; +}; +static DEFINE_PER_CPU(struct vfree_deferred, 
vfree_deferred); + +static void __vunmap(const void *, int); + +static void free_work(struct work_struct *w) +{ + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *llnode = llist_del_all(&p->list); + while (llnode) { + void *p = llnode; + llnode = llist_next(llnode); + __vunmap(p, 1); + } +} + +/*** Page table manipulation functions ***/ + +static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, addr); + do { + pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); + WARN_ON(!pte_none(ptent) && !pte_present(ptent)); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_clear_huge(pmd)) + continue; + if (pmd_none_or_clear_bad(pmd)) + continue; + vunmap_pte_range(pmd, addr, next); + } while (pmd++, addr = next, addr != end); +} + +static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_clear_huge(pud)) + continue; + if (pud_none_or_clear_bad(pud)) + continue; + vunmap_pmd_range(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +static void vunmap_page_range(unsigned long addr, unsigned long end) +{ + pgd_t *pgd; + unsigned long next; + + BUG_ON(addr >= end); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + vunmap_pud_range(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +static int vmap_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) +{ + pte_t *pte; + + /* + * nr is a running index into the array which helps higher level + * callers keep track 
of where we're up to. + */ + + pte = pte_alloc_kernel(pmd, addr); + if (!pte) + return -ENOMEM; + do { + struct page *page = pages[*nr]; + + if (WARN_ON(!pte_none(*pte))) + return -EBUSY; + if (WARN_ON(!page)) + return -ENOMEM; + set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); + (*nr)++; + } while (pte++, addr += PAGE_SIZE, addr != end); + return 0; +} + +static int vmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc(&init_mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static int vmap_pud_range(pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc(&init_mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +/* + * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and + * will have pfns corresponding to the "pages" array. + * + * Ie. 
pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + */ +static int vmap_page_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) +{ + pgd_t *pgd; + unsigned long next; + unsigned long addr = start; + int err = 0; + int nr = 0; + + BUG_ON(addr >= end); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); + if (err) + return err; + } while (pgd++, addr = next, addr != end); + + return nr; +} + +static int vmap_page_range(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) +{ + int ret; + + ret = vmap_page_range_noflush(start, end, prot, pages); + flush_cache_vmap(start, end); + return ret; +} + +int is_vmalloc_or_module_addr(const void *x) +{ + /* + * ARM, x86-64 and sparc64 put modules in a special place, + * and fall back on vmalloc() if that fails. Others + * just put it in the vmalloc space. + */ +#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) + unsigned long addr = (unsigned long)x; + if (addr >= MODULES_VADDR && addr < MODULES_END) + return 1; +#endif + return is_vmalloc_addr(x); +} + +/* + * Walk a vmap address to the struct page it maps. 
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	/*
	 * Descend pgd -> pud -> pmd -> pte; any empty level leaves page
	 * as NULL, which is what the caller gets back.
	 */
	if (!pgd_none(*pgd)) {
		pud_t *pud = pud_offset(pgd, addr);
		if (!pud_none(*pud)) {
			pmd_t *pmd = pmd_offset(pud, addr);
			if (!pmd_none(*pmd)) {
				pte_t *ptep, pte;

				ptep = pte_offset_map(pmd, addr);
				pte = *ptep;
				if (pte_present(pte))
					page = pte_page(pte);
				pte_unmap(ptep);
			}
		}
	}
	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 *
 * NOTE(review): vmalloc_to_page() can return NULL for an unmapped address
 * and that result is passed to page_to_pfn() unchecked here - callers
 * presumably only pass mapped addresses; confirm.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

/* Bits stored in vmap_area->flags */
#define VM_LAZY_FREE	0x01
#define VM_LAZY_FREEING	0x02
#define VM_VM_AREA	0x04

static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;

/* The vmap cache globals are protected by vmap_area_lock */
static struct rb_node *free_vmap_cache;
static unsigned long cached_hole_size;
static unsigned long cached_vstart;
static unsigned long cached_align;

static unsigned long vmap_area_pcpu_hole;

/*
 * Look up the vmap_area whose [va_start, va_end) range contains @addr by
 * descending the rbtree rooted at vmap_area_root. Returns NULL when no
 * area contains the address. Takes no lock itself.
 */
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

/*
 * Link @va into both the address-ordered rbtree and the address-sorted
 * vmap_area_list. An area overlapping an existing one hits BUG().
 */
static void __insert_vmap_area(struct vmap_area *va)
{
	struct rb_node **p = &vmap_area_root.rb_node;
	struct rb_node *parent = NULL;
	struct rb_node *tmp;

	while (*p) {
		struct vmap_area *tmp_va;

		parent = *p;
		tmp_va = rb_entry(parent, struct vmap_area, rb_node);
		if (va->va_start < tmp_va->va_end)
			p = &(*p)->rb_left;
		else if (va->va_end > tmp_va->va_start)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&va->rb_node, parent, p);
	rb_insert_color(&va->rb_node, &vmap_area_root);

	/* address-sort this list */
	tmp = rb_prev(&va->rb_node);
	if (tmp) {
		struct vmap_area *prev;
		prev = rb_entry(tmp, struct vmap_area, rb_node);
		list_add_rcu(&va->list, &prev->list);
	} else
		list_add_rcu(&va->list, &vmap_area_list);
}

static void purge_vmap_area_lazy(void);

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 *
 * @size must be non-zero and page aligned, @align must be a power of two
 * (all three are enforced with BUG_ON). Returns the new vmap_area,
 * ERR_PTR(-ENOMEM) if the descriptor cannot be allocated, or
 * ERR_PTR(-EBUSY) if no hole is found even after one lazy purge.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	struct rb_node *n;
	unsigned long addr;
	int purged = 0;		/* set once the lazy purge has been tried */
	struct vmap_area *first;

	BUG_ON(!size);
	BUG_ON(size & ~PAGE_MASK);
	BUG_ON(!is_power_of_2(align));

	va = kmalloc_node(sizeof(struct vmap_area),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);

retry:
	spin_lock(&vmap_area_lock);
	/*
	 * Invalidate cache if we have more permissive parameters.
	 * cached_hole_size notes the largest hole noticed _below_
	 * the vmap_area cached in free_vmap_cache: if size fits
	 * into that hole, we want to scan from vstart to reuse
	 * the hole instead of allocating above free_vmap_cache.
	 * Note that __free_vmap_area may update free_vmap_cache
	 * without updating cached_hole_size or cached_align.
	 */
	if (!free_vmap_cache ||
			size < cached_hole_size ||
			vstart < cached_vstart ||
			align < cached_align) {
nocache:	/* also entered by goto when the cached start is below vstart */
		cached_hole_size = 0;
		free_vmap_cache = NULL;
	}
	/* record if we encounter less permissive parameters */
	cached_vstart = vstart;
	cached_align = align;

	/* find starting point for our search */
	if (free_vmap_cache) {
		first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
		addr = ALIGN(first->va_end, align);
		if (addr < vstart)
			goto nocache;
		if (addr + size < addr)	/* address arithmetic wrapped */
			goto overflow;

	} else {
		addr = ALIGN(vstart, align);
		if (addr + size < addr)	/* address arithmetic wrapped */
			goto overflow;

		n = vmap_area_root.rb_node;
		first = NULL;

		/* find the lowest area whose end is at or above addr */
		while (n) {
			struct vmap_area *tmp;
			tmp = rb_entry(n, struct vmap_area, rb_node);
			if (tmp->va_end >= addr) {
				first = tmp;
				if (tmp->va_start <= addr)
					break;
				n = n->rb_left;
			} else
				n = n->rb_right;
		}

		if (!first)
			goto found;
	}

	/* from the starting point, walk areas until a suitable hole is found */
	while (addr + size > first->va_start && addr + size <= vend) {
		if (addr + cached_hole_size < first->va_start)
			cached_hole_size = first->va_start - addr;
		addr = ALIGN(first->va_end, align);
		if (addr + size < addr)
			goto overflow;

		if (list_is_last(&first->list, &vmap_area_list))
			goto found;

		first = list_entry(first->list.next,
				struct vmap_area, list);
	}

found:
	if (addr + size > vend)
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->flags = 0;
	__insert_vmap_area(va);
	free_vmap_cache = &va->rb_node;
	spin_unlock(&vmap_area_lock);

	BUG_ON(va->va_start & (align-1));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	return va;

overflow:
	spin_unlock(&vmap_area_lock);
	/* Reclaim lazily-freed areas once, then retry the whole search. */
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}
	if (printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: "
			"use vmalloc= to increase size.\n", size);
	kfree(va);
	return ERR_PTR(-EBUSY);
}

+static void __free_vmap_area(struct vmap_area *va) +{ + BUG_ON(RB_EMPTY_NODE(&va->rb_node)); + + if (free_vmap_cache) { + if (va->va_end < cached_vstart) { + free_vmap_cache = NULL; + } else { + struct vmap_area *cache; + cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node); + if (va->va_start <= cache->va_start) { + free_vmap_cache = rb_prev(&va->rb_node); + /* + * We don't try to update cached_hole_size or + * cached_align, but it won't go very wrong. + */ + } + } + } + rb_erase(&va->rb_node, &vmap_area_root); + RB_CLEAR_NODE(&va->rb_node); + list_del_rcu(&va->list); + + /* + * Track the highest possible candidate for pcpu area + * allocation. Areas outside of vmalloc area can be returned + * here too, consider only end addresses which fall inside + * vmalloc area proper. + */ + if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) + vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); + + kfree_rcu(va, rcu_head); +} + +/* + * Free a region of KVA allocated by alloc_vmap_area + */ +static void free_vmap_area(struct vmap_area *va) +{ + spin_lock(&vmap_area_lock); + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); +} + +/* + * Clear the pagetable entries of a given vmap_area + */ +static void unmap_vmap_area(struct vmap_area *va) +{ + vunmap_page_range(va->va_start, va->va_end); +} + +static void vmap_debug_free_range(unsigned long start, unsigned long end) +{ + /* + * Unmap page tables and force a TLB flush immediately if + * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free + * bugs similarly to those in linear kernel virtual address + * space after a page has been freed. + * + * All the lazy freeing logic is still retained, in order to + * minimise intrusiveness of this debugging feature. + * + * This is going to be *slow* (linear kernel virtual address + * debugging doesn't do a broadcast TLB flush so it is a lot + * faster). 
+ */ +#ifdef CONFIG_DEBUG_PAGEALLOC + vunmap_page_range(start, end); + flush_tlb_kernel_range(start, end); +#endif +} + +/* + * lazy_max_pages is the maximum amount of virtual address space we gather up + * before attempting to purge with a TLB flush. + * + * There is a tradeoff here: a larger number will cover more kernel page tables + * and take slightly longer to purge, but it will linearly reduce the number of + * global TLB flushes that must be performed. It would seem natural to scale + * this number up linearly with the number of CPUs (because vmapping activity + * could also scale linearly with the number of CPUs), however it is likely + * that in practice, workloads might be constrained in other ways that mean + * vmap activity will not scale linearly with CPUs. Also, I want to be + * conservative and not introduce a big latency on huge systems, so go with + * a less aggressive log scale. It will still be an improvement over the old + * code, and it will be simple to change the scale factor if we find that it + * becomes a problem on bigger systems. + */ +static unsigned long lazy_max_pages(void) +{ + unsigned int log; + + log = fls(num_online_cpus()); + + return log * (32UL * 1024 * 1024 / PAGE_SIZE); +} + +static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); + +/* for per-CPU blocks */ +static void purge_fragmented_blocks_allcpus(void); + +/* + * called before a call to iounmap() if the caller wants vm_area_struct's + * immediately freed. + */ +void set_iounmap_nonlazy(void) +{ + atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); +} + +/* + * Purges all lazily-freed vmap areas. + * + * If sync is 0 then don't purge if there is already a purge in progress. + * If force_flush is 1, then flush kernel TLBs between *start and *end even + * if we found no lazy vmap areas to unmap (callers can use this to optimise + * their own TLB flushing). 
 * Returns with *start = min(*start, lowest purged address)
 *              *end = max(*end, highest purged address)
 */
static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
					int sync, int force_flush)
{
	/* Serialises purgers; function-local so nothing else can take it. */
	static DEFINE_SPINLOCK(purge_lock);
	LIST_HEAD(valist);	/* areas collected for freeing in this pass */
	struct vmap_area *va;
	struct vmap_area *n_va;
	int nr = 0;		/* pages collected in this pass */

	/*
	 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
	 * should not expect such behaviour. This just simplifies locking for
	 * the case that isn't actually used at the moment anyway.
	 */
	if (!sync && !force_flush) {
		if (!spin_trylock(&purge_lock))
			return;
	} else
		spin_lock(&purge_lock);

	if (sync)
		purge_fragmented_blocks_allcpus();

	/*
	 * Pass 1: under RCU, collect every area marked VM_LAZY_FREE, widen
	 * the caller's range to cover it and flip it to VM_LAZY_FREEING.
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(va, &vmap_area_list, list) {
		if (va->flags & VM_LAZY_FREE) {
			if (va->va_start < *start)
				*start = va->va_start;
			if (va->va_end > *end)
				*end = va->va_end;
			nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
			list_add_tail(&va->purge_list, &valist);
			va->flags |= VM_LAZY_FREEING;
			va->flags &= ~VM_LAZY_FREE;
		}
	}
	rcu_read_unlock();

	if (nr)
		atomic_sub(nr, &vmap_lazy_nr);

	/* The TLB flush happens before any address range is released. */
	if (nr || force_flush)
		flush_tlb_kernel_range(*start, *end);

	/* Pass 2: actually release the collected areas. */
	if (nr) {
		spin_lock(&vmap_area_lock);
		list_for_each_entry_safe(va, n_va, &valist, purge_list)
			__free_vmap_area(va);
		spin_unlock(&vmap_area_lock);
	}
	spin_unlock(&purge_lock);
}

/*
 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 * is already purging.
 */
static void try_purge_vmap_area_lazy(void)
{
	/* Empty range: start above end, so the first purged area sets both. */
	unsigned long start = ULONG_MAX, end = 0;

	__purge_vmap_area_lazy(&start, &end, 0, 0);
}

/*
 * Kick off a purge of the outstanding lazy areas.
+ */ +static void purge_vmap_area_lazy(void) +{ + unsigned long start = ULONG_MAX, end = 0; + + __purge_vmap_area_lazy(&start, &end, 1, 0); +} + +/* + * Free a vmap area, caller ensuring that the area has been unmapped + * and flush_cache_vunmap had been called for the correct range + * previously. + */ +static void free_vmap_area_noflush(struct vmap_area *va) +{ + va->flags |= VM_LAZY_FREE; + atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); + if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) + try_purge_vmap_area_lazy(); +} + +/* + * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been + * called for the correct range previously. + */ +static void free_unmap_vmap_area_noflush(struct vmap_area *va) +{ + unmap_vmap_area(va); + free_vmap_area_noflush(va); +} + +/* + * Free and unmap a vmap area + */ +static void free_unmap_vmap_area(struct vmap_area *va) +{ + flush_cache_vunmap(va->va_start, va->va_end); + free_unmap_vmap_area_noflush(va); +} + +static struct vmap_area *find_vmap_area(unsigned long addr) +{ + struct vmap_area *va; + + spin_lock(&vmap_area_lock); + va = __find_vmap_area(addr); + spin_unlock(&vmap_area_lock); + + return va; +} + +static void free_unmap_vmap_area_addr(unsigned long addr) +{ + struct vmap_area *va; + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); +} + + +/*** Per cpu kva allocator ***/ + +/* + * vmap space is limited especially on 32 bit architectures. Ensure there is + * room for at least 16 percpu vmap blocks per CPU. + */ +/* + * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able + * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). 
Guess + * instead (we just need a rough idea) + */ +#if BITS_PER_LONG == 32 +#define VMALLOC_SPACE (128UL*1024*1024) +#else +#define VMALLOC_SPACE (128UL*1024*1024*1024) +#endif + +#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) +#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ +#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ +#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) +#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ +#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ +#define VMAP_BBMAP_BITS \ + VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ + VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ + VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) + +#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) + +static bool vmap_initialized __read_mostly = false; + +struct vmap_block_queue { + spinlock_t lock; + struct list_head free; +}; + +struct vmap_block { + spinlock_t lock; + struct vmap_area *va; + unsigned long free, dirty; + unsigned long dirty_min, dirty_max; /*< dirty range */ + struct list_head free_list; + struct rcu_head rcu_head; + struct list_head purge; +}; + +/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ +static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); + +/* + * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block + * in the free path. Could get rid of this if we change the API to return a + * "cookie" from alloc, to be passed to free. But no big deal yet. + */ +static DEFINE_SPINLOCK(vmap_block_tree_lock); +static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); + +/* + * We should probably have a fallback mechanism to allocate virtual memory + * out of partially filled vmap blocks. However vmap block sizing should be + * fairly reasonable according to the vmalloc size, so it shouldn't be a + * big problem. 
 */

/*
 * Convert a vmalloc-space address into the index of its vmap block, i.e.
 * the key used for vmap_block_tree. The base is VMALLOC_START rounded
 * down to a VMAP_BLOCK_SIZE boundary.
 */
static unsigned long addr_to_vb_idx(unsigned long addr)
{
	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
	addr /= VMAP_BLOCK_SIZE;
	return addr;
}

/*
 * Address of page @pages_off inside the block that starts at @va_start.
 * It is a BUG if the result falls into a different block.
 */
static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
	unsigned long addr;

	addr = va_start + (pages_off << PAGE_SHIFT);
	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
	return (void *)addr;
}

/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	struct vmap_area *va;
	unsigned long vb_idx;
	int node, err, cpu;
	void *vaddr;

	node = numa_node_id();

	vb = kmalloc_node(sizeof(struct vmap_block),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!vb))
		return ERR_PTR(-ENOMEM);

	/* Block-sized and block-aligned, so addr_to_vb_idx() is unique. */
	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
					VMALLOC_START, VMALLOC_END,
					node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(vb);
		return ERR_CAST(va);
	}

	err = radix_tree_preload(gfp_mask);
	if (unlikely(err)) {
		kfree(vb);
		free_vmap_area(va);
		return ERR_PTR(err);
	}

	/* The caller's allocation is the first 2^order pages of the block. */
	vaddr = vmap_block_vaddr(va->va_start, 0);
	spin_lock_init(&vb->lock);
	vb->va = va;
	/* At least something should be left free */
	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
	vb->free = VMAP_BBMAP_BITS - (1UL << order);
	vb->dirty = 0;
	/* Empty dirty range: min above max until vb_free() records one. */
	vb->dirty_min = VMAP_BBMAP_BITS;
	vb->dirty_max = 0;
	INIT_LIST_HEAD(&vb->free_list);

	vb_idx = addr_to_vb_idx(va->va_start);
	spin_lock(&vmap_block_tree_lock);
	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
	spin_unlock(&vmap_block_tree_lock);
	BUG_ON(err);
	radix_tree_preload_end();

	/*
	 * NOTE(review): get_cpu_light()/put_cpu_light() are not defined in
	 * this file - presumably the -rt counterparts of get_cpu()/put_cpu();
	 * confirm. The returned cpu number is not used afterwards.
	 */
	cpu = get_cpu_light();
	vbq = this_cpu_ptr(&vmap_block_queue);
	spin_lock(&vbq->lock);
	list_add_tail_rcu(&vb->free_list, &vbq->free);
	spin_unlock(&vbq->lock);
	put_cpu_light();

	return vaddr;
}

/*
 * Remove @vb from the radix tree, queue its KVA range for lazy freeing and
 * free the block descriptor after an RCU grace period.
 */
static void free_vmap_block(struct vmap_block *vb)
{
	struct vmap_block *tmp;
	unsigned long vb_idx;

	vb_idx = addr_to_vb_idx(vb->va->va_start);
	spin_lock(&vmap_block_tree_lock);
	tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
	spin_unlock(&vmap_block_tree_lock);
	BUG_ON(tmp != vb);

	free_vmap_area_noflush(vb->va);
	kfree_rcu(vb, rcu_head);
}

/*
 * Release the blocks on @cpu's free queue in which every page is either
 * free or dirty (nothing in use), but which are not entirely dirty.
 */
static void purge_fragmented_blocks(int cpu)
{
	LIST_HEAD(purge);
	struct vmap_block *vb;
	struct vmap_block *n_vb;
	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

	rcu_read_lock();
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {

		/* Unlocked pre-check; repeated under vb->lock below. */
		if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
			continue;

		spin_lock(&vb->lock);
		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
			vb->free = 0; /* prevent further allocs after releasing lock */
			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
			vb->dirty_min = 0;
			vb->dirty_max = VMAP_BBMAP_BITS;
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
			spin_unlock(&vb->lock);
			list_add_tail(&vb->purge, &purge);
		} else
			spin_unlock(&vb->lock);
	}
	rcu_read_unlock();

	/* Free the collected blocks outside the RCU read-side section. */
	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
		list_del(&vb->purge);
		free_vmap_block(vb);
	}
}

/* Run purge_fragmented_blocks() for every possible CPU. */
static void purge_fragmented_blocks_allcpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		purge_fragmented_blocks(cpu);
}

/*
 * Allocate @size bytes of KVA from a per-cpu vmap block. @size must be
 * page aligned and at most PAGE_SIZE*VMAP_MAX_ALLOC; the request is
 * rounded up to 2^get_order(size) pages.
 *
 * Returns the address, ERR_PTR(-errno) if a new block is needed and cannot
 * be set up, or NULL (after a warning) when @size is 0.
 */
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
	struct vmap_block_queue *vbq;
	struct vmap_block *vb;
	void *vaddr = NULL;
	unsigned int order;
	int cpu;

	BUG_ON(size & ~PAGE_MASK);
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
	if (WARN_ON(size == 0)) {
		/*
		 * Allocating 0 bytes isn't what caller wants since
		 * get_order(0) returns funny result. Just warn and terminate
		 * early.
		 */
		return NULL;
	}
	order = get_order(size);

	rcu_read_lock();
	cpu = get_cpu_light();
	vbq = this_cpu_ptr(&vmap_block_queue);
	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
		unsigned long pages_off;

		spin_lock(&vb->lock);
		if (vb->free < (1UL << order)) {
			spin_unlock(&vb->lock);
			continue;
		}

		/* Pages are handed out sequentially from the block start. */
		pages_off = VMAP_BBMAP_BITS - vb->free;
		vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
		vb->free -= 1UL << order;
		if (vb->free == 0) {
			/* Block exhausted: take it off the free queue. */
			spin_lock(&vbq->lock);
			list_del_rcu(&vb->free_list);
			spin_unlock(&vbq->lock);
		}

		spin_unlock(&vb->lock);
		break;
	}

	put_cpu_light();
	rcu_read_unlock();

	/* Allocate new block if nothing was found */
	if (!vaddr)
		vaddr = new_vmap_block(order, gfp_mask);

	return vaddr;
}

/*
 * Return @size bytes at @addr to their vmap block: the range is unmapped
 * and accounted as dirty, and the block itself is freed once all of its
 * pages are dirty.
 */
static void vb_free(const void *addr, unsigned long size)
{
	unsigned long offset;
	unsigned long vb_idx;
	unsigned int order;
	struct vmap_block *vb;

	BUG_ON(size & ~PAGE_MASK);
	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);

	order = get_order(size);

	/* Page offset of addr within its block */
	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
	offset >>= PAGE_SHIFT;

	vb_idx = addr_to_vb_idx((unsigned long)addr);
	rcu_read_lock();
	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
	rcu_read_unlock();
	BUG_ON(!vb);

	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);

	spin_lock(&vb->lock);

	/* Expand dirty range */
	vb->dirty_min = min(vb->dirty_min, offset);
	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

	vb->dirty += 1UL << order;
	if (vb->dirty == VMAP_BBMAP_BITS) {
		BUG_ON(vb->free);
		spin_unlock(&vb->lock);
		free_vmap_block(vb);
	} else
		spin_unlock(&vb->lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
+ * to amortize TLB flushing overheads. What this means is that any page you + * have now, may, in a former life, have been mapped into kernel virtual + * address by the vmap layer and so there might be some CPUs with TLB entries + * still referencing that page (additional to the regular 1:1 kernel mapping). + * + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can + * be sure that none of the pages we have control over will have any aliases + * from the vmap layer. + */ +void vm_unmap_aliases(void) +{ + unsigned long start = ULONG_MAX, end = 0; + int cpu; + int flush = 0; + + if (unlikely(!vmap_initialized)) + return; + + for_each_possible_cpu(cpu) { + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + struct vmap_block *vb; + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + spin_lock(&vb->lock); + if (vb->dirty) { + unsigned long va_start = vb->va->va_start; + unsigned long s, e; + + s = va_start + (vb->dirty_min << PAGE_SHIFT); + e = va_start + (vb->dirty_max << PAGE_SHIFT); + + start = min(s, start); + end = max(e, end); + + flush = 1; + } + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + } + + __purge_vmap_area_lazy(&start, &end, 1, flush); +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + +/** + * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram + * @mem: the pointer returned by vm_map_ram + * @count: the count passed to that vm_map_ram call (cannot unmap partial) + */ +void vm_unmap_ram(const void *mem, unsigned int count) +{ + unsigned long size = count << PAGE_SHIFT; + unsigned long addr = (unsigned long)mem; + + BUG_ON(!addr); + BUG_ON(addr < VMALLOC_START); + BUG_ON(addr > VMALLOC_END); + BUG_ON(addr & (PAGE_SIZE-1)); + + debug_check_no_locks_freed(mem, size); + vmap_debug_free_range(addr, addr+size); + + if (likely(count <= VMAP_MAX_ALLOC)) + vb_free(mem, size); + else + free_unmap_vmap_area_addr(addr); +} +EXPORT_SYMBOL(vm_unmap_ram); + +/** + * vm_map_ram - map pages 
linearly into kernel virtual address (vmalloc space) + * @pages: an array of pointers to the pages to be mapped + * @count: number of pages + * @node: prefer to allocate data structures on this node + * @prot: memory protection to use. PAGE_KERNEL for regular RAM + * + * If you use this function for less than VMAP_MAX_ALLOC pages, it could be + * faster than vmap so it's good. But if you mix long-life and short-life + * objects with vm_map_ram(), it could consume lots of address space through + * fragmentation (especially on a 32bit machine). You could see failures in + * the end. Please use this function for short-lived objects. + * + * Returns: a pointer to the address that has been mapped, or %NULL on failure + */ +void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +{ + unsigned long size = count << PAGE_SHIFT; + unsigned long addr; + void *mem; + + if (likely(count <= VMAP_MAX_ALLOC)) { + mem = vb_alloc(size, GFP_KERNEL); + if (IS_ERR(mem)) + return NULL; + addr = (unsigned long)mem; + } else { + struct vmap_area *va; + va = alloc_vmap_area(size, PAGE_SIZE, + VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + if (IS_ERR(va)) + return NULL; + + addr = va->va_start; + mem = (void *)addr; + } + if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + vm_unmap_ram(mem, count); + return NULL; + } + return mem; +} +EXPORT_SYMBOL(vm_map_ram); + +static struct vm_struct *vmlist __initdata; +/** + * vm_area_add_early - add vmap area early during boot + * @vm: vm_struct to add + * + * This function is used to add fixed kernel vm area to vmlist before + * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags + * should contain proper values and the other fields should be zero. + * + * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 
+ */ +void __init vm_area_add_early(struct vm_struct *vm) +{ + struct vm_struct *tmp, **p; + + BUG_ON(vmap_initialized); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= vm->addr) { + BUG_ON(tmp->addr < vm->addr + vm->size); + break; + } else + BUG_ON(tmp->addr + tmp->size > vm->addr); + } + vm->next = *p; + *p = vm; +} + +/** + * vm_area_register_early - register vmap area early during boot + * @vm: vm_struct to register + * @align: requested alignment + * + * This function is used to register kernel vm area before + * vmalloc_init() is called. @vm->size and @vm->flags should contain + * proper values on entry and other fields should be zero. On return, + * vm->addr contains the allocated address. + * + * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. + */ +void __init vm_area_register_early(struct vm_struct *vm, size_t align) +{ + static size_t vm_init_off __initdata; + unsigned long addr; + + addr = ALIGN(VMALLOC_START + vm_init_off, align); + vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; + + vm->addr = (void *)addr; + + vm_area_add_early(vm); +} + +void __init vmalloc_init(void) +{ + struct vmap_area *va; + struct vm_struct *tmp; + int i; + + for_each_possible_cpu(i) { + struct vmap_block_queue *vbq; + struct vfree_deferred *p; + + vbq = &per_cpu(vmap_block_queue, i); + spin_lock_init(&vbq->lock); + INIT_LIST_HEAD(&vbq->free); + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, free_work); + } + + /* Import existing vmlist entries. 
*/ + for (tmp = vmlist; tmp; tmp = tmp->next) { + va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); + va->flags = VM_VM_AREA; + va->va_start = (unsigned long)tmp->addr; + va->va_end = va->va_start + tmp->size; + va->vm = tmp; + __insert_vmap_area(va); + } + + vmap_area_pcpu_hole = VMALLOC_END; + + vmap_initialized = true; +} + +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vmap() on to-be-mapped areas + * before calling this function. + * + * RETURNS: + * The number of pages mapped on success, -errno on failure. + */ +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) +{ + return vmap_page_range_noflush(addr, addr + size, prot, pages); +} + +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vunmap() on to-be-mapped areas + * before calling this function and flush_tlb_kernel_range() after. 
+ */ +void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) +{ + vunmap_page_range(addr, addr + size); +} +EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); + +/** + * unmap_kernel_range - unmap kernel VM area and flush cache and TLB + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Similar to unmap_kernel_range_noflush() but flushes vcache before + * the unmapping and tlb after. + */ +void unmap_kernel_range(unsigned long addr, unsigned long size) +{ + unsigned long end = addr + size; + + flush_cache_vunmap(addr, end); + vunmap_page_range(addr, end); + flush_tlb_kernel_range(addr, end); +} +EXPORT_SYMBOL_GPL(unmap_kernel_range); + +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) +{ + unsigned long addr = (unsigned long)area->addr; + unsigned long end = addr + get_vm_area_size(area); + int err; + + err = vmap_page_range(addr, end, prot, pages); + + return err > 0 ? 0 : err; +} +EXPORT_SYMBOL_GPL(map_vm_area); + +static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, const void *caller) +{ + spin_lock(&vmap_area_lock); + vm->flags = flags; + vm->addr = (void *)va->va_start; + vm->size = va->va_end - va->va_start; + vm->caller = caller; + va->vm = vm; + va->flags |= VM_VM_AREA; + spin_unlock(&vmap_area_lock); +} + +static void clear_vm_uninitialized_flag(struct vm_struct *vm) +{ + /* + * Before removing VM_UNINITIALIZED, + * we should make sure that vm has proper values. + * Pair with smp_rmb() in show_numa_info(). 
+ */ + smp_wmb(); + vm->flags &= ~VM_UNINITIALIZED; +} + +static struct vm_struct *__get_vm_area_node(unsigned long size, + unsigned long align, unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, const void *caller) +{ + struct vmap_area *va; + struct vm_struct *area; + + BUG_ON(in_interrupt()); + if (flags & VM_IOREMAP) + align = 1ul << clamp_t(int, fls_long(size), + PAGE_SHIFT, IOREMAP_MAX_ORDER); + + size = PAGE_ALIGN(size); + if (unlikely(!size)) + return NULL; + + area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!area)) + return NULL; + + if (!(flags & VM_NO_GUARD)) + size += PAGE_SIZE; + + va = alloc_vmap_area(size, align, start, end, node, gfp_mask); + if (IS_ERR(va)) { + kfree(area); + return NULL; + } + + setup_vmalloc_vm(area, va, flags, caller); + + return area; +} + +struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end) +{ + return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, + GFP_KERNEL, __builtin_return_address(0)); +} +EXPORT_SYMBOL_GPL(__get_vm_area); + +struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end, + const void *caller) +{ + return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, + GFP_KERNEL, caller); +} + +/** + * get_vm_area - reserve a contiguous kernel virtual area + * @size: size of the area + * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC + * + * Search an area of @size in the kernel virtual mapping area, + * and reserved it for out purposes. Returns the area descriptor + * on success or %NULL on failure. 
+ */ +struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) +{ + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); +} + +struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, + const void *caller) +{ + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + NUMA_NO_NODE, GFP_KERNEL, caller); +} + +/** + * find_vm_area - find a continuous kernel virtual area + * @addr: base address + * + * Search for the kernel VM area starting at @addr, and return it. + * It is up to the caller to do all required locking to keep the returned + * pointer valid. + */ +struct vm_struct *find_vm_area(const void *addr) +{ + struct vmap_area *va; + + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) + return va->vm; + + return NULL; +} + +/** + * remove_vm_area - find and remove a continuous kernel virtual area + * @addr: base address + * + * Search for the kernel VM area starting at @addr, and remove it. + * This function returns the found VM area, but using it is NOT safe + * on SMP machines, except for its size or flags. 
+ */ +struct vm_struct *remove_vm_area(const void *addr) +{ + struct vmap_area *va; + + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) { + struct vm_struct *vm = va->vm; + + spin_lock(&vmap_area_lock); + va->vm = NULL; + va->flags &= ~VM_VM_AREA; + spin_unlock(&vmap_area_lock); + + vmap_debug_free_range(va->va_start, va->va_end); + kasan_free_shadow(vm); + free_unmap_vmap_area(va); + vm->size -= PAGE_SIZE; + + return vm; + } + return NULL; +} + +static void __vunmap(const void *addr, int deallocate_pages) +{ + struct vm_struct *area; + + if (!addr) + return; + + if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", + addr)) + return; + + area = remove_vm_area(addr); + if (unlikely(!area)) { + WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", + addr); + return; + } + + debug_check_no_locks_freed(addr, area->size); + debug_check_no_obj_freed(addr, area->size); + + if (deallocate_pages) { + int i; + + for (i = 0; i < area->nr_pages; i++) { + struct page *page = area->pages[i]; + + BUG_ON(!page); + __free_page(page); + } + + if (area->flags & VM_VPAGES) + vfree(area->pages); + else + kfree(area->pages); + } + + kfree(area); + return; +} + +/** + * vfree - release memory allocated by vmalloc() + * @addr: memory base address + * + * Free the virtually continuous memory area starting at @addr, as + * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is + * NULL, no operation is performed. 
+ * + * Must not be called in NMI context (strictly speaking, only if we don't + * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling + * conventions for vfree() arch-depenedent would be a really bad idea) + * + * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) + */ +void vfree(const void *addr) +{ + BUG_ON(in_nmi()); + + kmemleak_free(addr); + + if (!addr) + return; + if (unlikely(in_interrupt())) { + struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); + } else + __vunmap(addr, 1); +} +EXPORT_SYMBOL(vfree); + +/** + * vunmap - release virtual mapping obtained by vmap() + * @addr: memory base address + * + * Free the virtually contiguous memory area starting at @addr, + * which was created from the page array passed to vmap(). + * + * Must not be called in interrupt context. + */ +void vunmap(const void *addr) +{ + BUG_ON(in_interrupt()); + might_sleep(); + if (addr) + __vunmap(addr, 0); +} +EXPORT_SYMBOL(vunmap); + +/** + * vmap - map an array of pages into virtually contiguous space + * @pages: array of page pointers + * @count: number of pages to map + * @flags: vm_area->flags + * @prot: page protection for the mapping + * + * Maps @count pages from @pages into contiguous kernel virtual + * space. 
+ */ +void *vmap(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot) +{ + struct vm_struct *area; + + might_sleep(); + + if (count > totalram_pages) + return NULL; + + area = get_vm_area_caller((count << PAGE_SHIFT), flags, + __builtin_return_address(0)); + if (!area) + return NULL; + + if (map_vm_area(area, prot, pages)) { + vunmap(area->addr); + return NULL; + } + + return area->addr; +} +EXPORT_SYMBOL(vmap); + +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, const void *caller); +static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, + pgprot_t prot, int node) +{ + const int order = 0; + struct page **pages; + unsigned int nr_pages, array_size, i; + const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; + + nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; + array_size = (nr_pages * sizeof(struct page *)); + + area->nr_pages = nr_pages; + /* Please note that the recursion is strictly bounded. 
*/ + if (array_size > PAGE_SIZE) { + pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, + PAGE_KERNEL, node, area->caller); + area->flags |= VM_VPAGES; + } else { + pages = kmalloc_node(array_size, nested_gfp, node); + } + area->pages = pages; + if (!area->pages) { + remove_vm_area(area->addr); + kfree(area); + return NULL; + } + + for (i = 0; i < area->nr_pages; i++) { + struct page *page; + + if (node == NUMA_NO_NODE) + page = alloc_page(alloc_mask); + else + page = alloc_pages_node(node, alloc_mask, order); + + if (unlikely(!page)) { + /* Successfully allocated i pages, free them in __vunmap() */ + area->nr_pages = i; + goto fail; + } + area->pages[i] = page; + if (gfp_mask & __GFP_WAIT) + cond_resched(); + } + + if (map_vm_area(area, prot, pages)) + goto fail; + return area->addr; + +fail: + warn_alloc_failed(gfp_mask, order, + "vmalloc: allocation failure, allocated %ld of %ld bytes\n", + (area->nr_pages*PAGE_SIZE), area->size); + vfree(area->addr); + return NULL; +} + +/** + * __vmalloc_node_range - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @start: vm area range start + * @end: vm area range end + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) + * @node: node to use for allocation or NUMA_NO_NODE + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. 
+ */ +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) +{ + struct vm_struct *area; + void *addr; + unsigned long real_size = size; + + size = PAGE_ALIGN(size); + if (!size || (size >> PAGE_SHIFT) > totalram_pages) + goto fail; + + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | + vm_flags, start, end, node, gfp_mask, caller); + if (!area) + goto fail; + + addr = __vmalloc_area_node(area, gfp_mask, prot, node); + if (!addr) + return NULL; + + /* + * In this function, newly allocated vm_struct has VM_UNINITIALIZED + * flag. It means that vm_struct is not fully initialized. + * Now, it is fully initialized, so remove this flag here. + */ + clear_vm_uninitialized_flag(area); + + /* + * A ref_count = 2 is needed because vm_struct allocated in + * __get_vm_area_node() contains a reference to the virtual address of + * the vmalloc'ed block. + */ + kmemleak_alloc(addr, real_size, 2, gfp_mask); + + return addr; + +fail: + warn_alloc_failed(gfp_mask, 0, + "vmalloc: allocation failure: %lu bytes\n", + real_size); + return NULL; +} + +/** + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or NUMA_NO_NODE + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. 
+ */ +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, const void *caller) +{ + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp_mask, prot, 0, node, caller); +} + +void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +{ + return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(__vmalloc); + +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} + +/** + * vmalloc - allocate virtually contiguous memory + * @size: allocation size + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vmalloc(unsigned long size) +{ + return __vmalloc_node_flags(size, NUMA_NO_NODE, + GFP_KERNEL | __GFP_HIGHMEM); +} +EXPORT_SYMBOL(vmalloc); + +/** + * vzalloc - allocate virtually contiguous memory with zero fill + * @size: allocation size + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc_node_flags(size, NUMA_NO_NODE, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc); + +/** + * vmalloc_user - allocate zeroed virtually contiguous memory for userspace + * @size: allocation size + * + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. 
+ */ +void *vmalloc_user(unsigned long size) +{ + struct vm_struct *area; + void *ret; + + ret = __vmalloc_node(size, SHMLBA, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); + if (ret) { + area = find_vm_area(ret); + area->flags |= VM_USERMAP; + } + return ret; +} +EXPORT_SYMBOL(vmalloc_user); + +/** + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vmalloc_node(unsigned long size, int node) +{ + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_node); + +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc_node() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return __vmalloc_node_flags(size, node, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc_node); + +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + +/** + * vmalloc_exec - allocate virtually contiguous, executable memory + * @size: allocation size + * + * Kernel-internal function to allocate enough pages to cover @size + * the page level allocator and map them into contiguous and + * executable kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. 
+ */ + +void *vmalloc_exec(unsigned long size) +{ + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + NUMA_NO_NODE, __builtin_return_address(0)); +} + +#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) +#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL +#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) +#define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL +#else +#define GFP_VMALLOC32 GFP_KERNEL +#endif + +/** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size + * + * Allocate enough 32bit PA addressable pages to cover @size from the + * page level allocator and map them into contiguous kernel virtual space. + */ +void *vmalloc_32(unsigned long size) +{ + return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, + NUMA_NO_NODE, __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_32); + +/** + * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory + * @size: allocation size + * + * The resulting memory area is 32bit addressable and zeroed so it can be + * mapped to userspace without leaking data. + */ +void *vmalloc_32_user(unsigned long size) +{ + struct vm_struct *area; + void *ret; + + ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + NUMA_NO_NODE, __builtin_return_address(0)); + if (ret) { + area = find_vm_area(ret); + area->flags |= VM_USERMAP; + } + return ret; +} +EXPORT_SYMBOL(vmalloc_32_user); + +/* + * small helper routine , copy contents to buf from addr. + * If the page is not present, fill zero. + */ + +static int aligned_vread(char *buf, char *addr, unsigned long count) +{ + struct page *p; + int copied = 0; + + while (count) { + unsigned long offset, length; + + offset = (unsigned long)addr & ~PAGE_MASK; + length = PAGE_SIZE - offset; + if (length > count) + length = count; + p = vmalloc_to_page(addr); + /* + * To do safe access to this _mapped_ area, we need + * lock. 
But adding lock here means that we need to add + * overhead of vmalloc()/vfree() calles for this _debug_ + * interface, rarely used. Instead of that, we'll use + * kmap() and get small overhead in this access function. + */ + if (p) { + /* + * we can expect USER0 is not used (see vread/vwrite's + * function description) + */ + void *map = kmap_atomic(p); + memcpy(buf, map + offset, length); + kunmap_atomic(map); + } else + memset(buf, 0, length); + + addr += length; + buf += length; + copied += length; + count -= length; + } + return copied; +} + +static int aligned_vwrite(char *buf, char *addr, unsigned long count) +{ + struct page *p; + int copied = 0; + + while (count) { + unsigned long offset, length; + + offset = (unsigned long)addr & ~PAGE_MASK; + length = PAGE_SIZE - offset; + if (length > count) + length = count; + p = vmalloc_to_page(addr); + /* + * To do safe access to this _mapped_ area, we need + * lock. But adding lock here means that we need to add + * overhead of vmalloc()/vfree() calles for this _debug_ + * interface, rarely used. Instead of that, we'll use + * kmap() and get small overhead in this access function. + */ + if (p) { + /* + * we can expect USER0 is not used (see vread/vwrite's + * function description) + */ + void *map = kmap_atomic(p); + memcpy(map + offset, buf, length); + kunmap_atomic(map); + } + addr += length; + buf += length; + copied += length; + count -= length; + } + return copied; +} + +/** + * vread() - read vmalloc area in a safe way. + * @buf: buffer for reading data + * @addr: vm address. + * @count: number of bytes to be read. + * + * Returns # of bytes which addr and buf should be increased. + * (same number to @count). Returns 0 if [addr...addr+count) doesn't + * includes any intersect with alive vmalloc area. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from that area to a given buffer. 
If the given memory range + * of [addr...addr+count) includes some valid address, data is copied to + * proper area of @buf. If there are memory holes, they'll be zero-filled. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. @buf should be kernel's buffer. + * + * Note: In usual ops, vread() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + * + */ + +long vread(char *buf, char *addr, unsigned long count) +{ + struct vmap_area *va; + struct vm_struct *vm; + char *vaddr, *buf_start = buf; + unsigned long buflen = count; + unsigned long n; + + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &vmap_area_list, list) { + if (!count) + break; + + if (!(va->flags & VM_VM_AREA)) + continue; + + vm = va->vm; + vaddr = (char *) vm->addr; + if (addr >= vaddr + get_vm_area_size(vm)) + continue; + while (addr < vaddr) { + if (count == 0) + goto finished; + *buf = '\0'; + buf++; + addr++; + count--; + } + n = vaddr + get_vm_area_size(vm) - addr; + if (n > count) + n = count; + if (!(vm->flags & VM_IOREMAP)) + aligned_vread(buf, addr, n); + else /* IOREMAP area is treated as memory hole */ + memset(buf, 0, n); + buf += n; + addr += n; + count -= n; + } +finished: + spin_unlock(&vmap_area_lock); + + if (buf == buf_start) + return 0; + /* zero-fill memory holes */ + if (buf != buf_start + buflen) + memset(buf, 0, buflen - (buf - buf_start)); + + return buflen; +} + +/** + * vwrite() - write vmalloc area in a safe way. + * @buf: buffer for source data + * @addr: vm address. + * @count: number of bytes to be read. + * + * Returns # of bytes which addr and buf should be incresed. 
+ * (same number to @count). + * If [addr...addr+count) doesn't includes any intersect with valid + * vmalloc area, returns 0. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from a buffer to the given addr. If specified range of + * [addr...addr+count) includes some valid address, data is copied from + * proper area of @buf. If there are memory holes, no copy to hole. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. @buf should be kernel's buffer. + * + * Note: In usual ops, vwrite() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + */ + +long vwrite(char *buf, char *addr, unsigned long count) +{ + struct vmap_area *va; + struct vm_struct *vm; + char *vaddr; + unsigned long n, buflen; + int copied = 0; + + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + buflen = count; + + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &vmap_area_list, list) { + if (!count) + break; + + if (!(va->flags & VM_VM_AREA)) + continue; + + vm = va->vm; + vaddr = (char *) vm->addr; + if (addr >= vaddr + get_vm_area_size(vm)) + continue; + while (addr < vaddr) { + if (count == 0) + goto finished; + buf++; + addr++; + count--; + } + n = vaddr + get_vm_area_size(vm) - addr; + if (n > count) + n = count; + if (!(vm->flags & VM_IOREMAP)) { + aligned_vwrite(buf, addr, n); + copied++; + } + buf += n; + addr += n; + count -= n; + } +finished: + spin_unlock(&vmap_area_lock); + if (!copied) + return 0; + return buflen; +} + +/** + * remap_vmalloc_range_partial - map vmalloc pages to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc kernel memory + * @size: size of 
map area + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, + void *kaddr, unsigned long size) +{ + struct vm_struct *area; + + size = PAGE_ALIGN(size); + + if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) + return -EINVAL; + + area = find_vm_area(kaddr); + if (!area) + return -EINVAL; + + if (!(area->flags & VM_USERMAP)) + return -EINVAL; + + if (kaddr + size > area->addr + area->size) + return -EINVAL; + + do { + struct page *page = vmalloc_to_page(kaddr); + int ret; + + ret = vm_insert_page(vma, uaddr, page); + if (ret) + return ret; + + uaddr += PAGE_SIZE; + kaddr += PAGE_SIZE; + size -= PAGE_SIZE; + } while (size > 0); + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_range_partial); + +/** + * remap_vmalloc_range - map vmalloc pages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of pages into addr before first page to map + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + return remap_vmalloc_range_partial(vma, vma->vm_start, + addr + (pgoff << PAGE_SHIFT), + vma->vm_end - vma->vm_start); +} +EXPORT_SYMBOL(remap_vmalloc_range); + +/* + * Implement a stub for vmalloc_sync_all() if the architecture chose not to + * have one. 
+ */ +void __weak vmalloc_sync_all(void) +{ +} + + +static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) +{ + pte_t ***p = data; + + if (p) { + *(*p) = pte; + (*p)++; + } + return 0; +} + +/** + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * @ptes: returns the PTEs for the address space + * + * Returns: NULL on failure, vm_struct on success + * + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. + * + * If @ptes is non-NULL, pointers to the PTEs (in init_mm) + * allocated for the VM area are returned. + */ +struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) +{ + struct vm_struct *area; + + area = get_vm_area_caller(size, VM_IOREMAP, + __builtin_return_address(0)); + if (area == NULL) + return NULL; + + /* + * This ensures that page tables are constructed for this region + * of kernel virtual address space and mapped into init_mm. + */ + if (apply_to_page_range(&init_mm, (unsigned long)area->addr, + size, f, ptes ? &ptes : NULL)) { + free_vm_area(area); + return NULL; + } + + return area; +} +EXPORT_SYMBOL_GPL(alloc_vm_area); + +void free_vm_area(struct vm_struct *area) +{ + struct vm_struct *ret; + ret = remove_vm_area(area->addr); + BUG_ON(ret != area); + kfree(area); +} +EXPORT_SYMBOL_GPL(free_vm_area); + +#ifdef CONFIG_SMP +static struct vmap_area *node_to_va(struct rb_node *n) +{ + return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; +} + +/** + * pvm_find_next_prev - find the next and prev vmap_area surrounding @end + * @end: target address + * @pnext: out arg for the next vmap_area + * @pprev: out arg for the previous vmap_area + * + * Returns: %true if either or both of next and prev are found, + * %false if no vmap_area exists + * + * Find vmap_areas end addresses of which enclose @end. ie. if not + * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. 
+ */
+static bool pvm_find_next_prev(unsigned long end,
+			       struct vmap_area **pnext,
+			       struct vmap_area **pprev)
+{
+	struct rb_node *node;
+	struct vmap_area *last = NULL;
+
+	/* descend by va_end; 'last' is the final node that was visited */
+	for (node = vmap_area_root.rb_node; node; ) {
+		last = rb_entry(node, struct vmap_area, rb_node);
+		if (end == last->va_end)
+			break;
+		node = end < last->va_end ? node->rb_left : node->rb_right;
+	}
+
+	/* empty tree */
+	if (!last)
+		return false;
+
+	if (last->va_end > end) {
+		*pnext = last;
+		*pprev = node_to_va(rb_prev(&last->rb_node));
+	} else {
+		*pprev = last;
+		*pnext = node_to_va(rb_next(&last->rb_node));
+	}
+	return true;
+}
+
+/**
+ * pvm_determine_end - find the highest aligned address between two vmap_areas
+ * @pnext: in/out arg for the next vmap_area
+ * @pprev: in/out arg for the previous vmap_area
+ * @align: alignment
+ *
+ * Returns: determined end address
+ *
+ * Find the highest aligned address between *@pnext and *@pprev below
+ * VMALLOC_END.  *@pnext and *@pprev are adjusted so that the aligned
+ * down address is between the end addresses of the two vmap_areas.
+ *
+ * Please note that the address returned by this function may fall
+ * inside *@pnext vmap_area.  The caller is responsible for checking
+ * that.
+ */
+static unsigned long pvm_determine_end(struct vmap_area **pnext,
+				       struct vmap_area **pprev,
+				       unsigned long align)
+{
+	const unsigned long align_mask = ~(align - 1);
+	struct vmap_area *next = *pnext;
+	struct vmap_area *prev = *pprev;
+	unsigned long addr = VMALLOC_END & align_mask;
+
+	/* never go above the aligned-down start of the next area */
+	if (next)
+		addr = min(next->va_start & align_mask, addr);
+
+	/* step down until prev ends at or below the chosen address */
+	while (prev && prev->va_end > addr) {
+		next = prev;
+		prev = node_to_va(rb_prev(&next->rb_node));
+	}
+
+	*pnext = next;
+	*pprev = prev;
+	return addr;
+}
+
+/**
+ * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
+ * @offsets: array containing offset of each area
+ * @sizes: array containing size of each area
+ * @nr_vms: the number of areas to allocate
+ * @align: alignment, all entries in @offsets and @sizes must be aligned to this
+ *
+ * Returns: kmalloc'd vm_struct pointer array pointing to allocated
+ *	    vm_structs on success, %NULL on failure
+ *
+ * Percpu allocator wants to use congruent vm areas so that it can
+ * maintain the offsets among percpu areas.  This function allocates
+ * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
+ * be scattered pretty far, distance between two areas easily going up
+ * to gigabytes.  To avoid interacting with regular vmallocs, these
+ * areas are allocated from top.
+ *
+ * Despite its complicated look, this allocator is rather simple.  It
+ * does everything top-down and scans areas from the end looking for
+ * matching slot.  While scanning, if any of the areas overlaps with
+ * existing vmap_area, the base address is pulled down to fit the
+ * area.  Scanning is repeated till all the areas fit and then all
+ * necessary data structures are inserted and the result is returned.
+ */
+struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
+				     const size_t *sizes, int nr_vms,
+				     size_t align)
+{
+	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
+	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+	struct vmap_area **vas, *prev, *next;
+	struct vm_struct **vms;
+	int area, area2, last_area, term_area;
+	unsigned long base, start, end, last_end;
+	bool purged = false;
+
+	/*
+	 * verify parameters and allocate data structures
+	 *
+	 * Misaligned or mutually overlapping areas are treated as caller
+	 * bugs (BUG_ON), not as runtime errors.
+	 */
+	BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
+	for (last_area = 0, area = 0; area < nr_vms; area++) {
+		start = offsets[area];
+		end = start + sizes[area];
+
+		/* is everything aligned properly? */
+		BUG_ON(!IS_ALIGNED(offsets[area], align));
+		BUG_ON(!IS_ALIGNED(sizes[area], align));
+
+		/* detect the area with the highest address */
+		if (start > offsets[last_area])
+			last_area = area;
+
+		for (area2 = 0; area2 < nr_vms; area2++) {
+			unsigned long start2 = offsets[area2];
+			unsigned long end2 = start2 + sizes[area2];
+
+			if (area2 == area)
+				continue;
+
+			BUG_ON(start2 >= start && start2 < end);
+			BUG_ON(end2 <= end && end2 > start);
+		}
+	}
+	last_end = offsets[last_area] + sizes[last_area];
+
+	if (vmalloc_end - vmalloc_start < last_end) {
+		WARN_ON(true);
+		return NULL;
+	}
+
+	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
+	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
+	if (!vas || !vms)
+		goto err_free2;
+
+	for (area = 0; area < nr_vms; area++) {
+		vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
+		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
+		if (!vas[area] || !vms[area])
+			goto err_free;
+	}
+retry:
+	spin_lock(&vmap_area_lock);
+
+	/*
+	 * start scanning - we scan from the top, begin with the last area
+	 *
+	 * term_area is the area at which the backwards walk stops; it is
+	 * reset to the current area whenever base has to be moved.
+	 */
+	area = term_area = last_area;
+	start = offsets[area];
+	end = start + sizes[area];
+
+	if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
+		base = vmalloc_end - last_end;
+		goto found;
+	}
+	base = pvm_determine_end(&next, &prev, align) - end;
+
+	while (true) {
+		BUG_ON(next && next->va_end <= base + end);
+		BUG_ON(prev && prev->va_end > base + end);
+
+		/*
+		 * base might have underflowed, add last_end before
+		 * comparing.
+		 *
+		 * Running out of room drops the lock, purges lazily
+		 * freed areas once and retries; a second failure gives
+		 * up.
+		 */
+		if (base + last_end < vmalloc_start + last_end) {
+			spin_unlock(&vmap_area_lock);
+			if (!purged) {
+				purge_vmap_area_lazy();
+				purged = true;
+				goto retry;
+			}
+			goto err_free;
+		}
+
+		/*
+		 * If next overlaps, move base downwards so that it's
+		 * right below next and then recheck.
+		 */
+		if (next && next->va_start < base + end) {
+			base = pvm_determine_end(&next, &prev, align) - end;
+			term_area = area;
+			continue;
+		}
+
+		/*
+		 * If prev overlaps, shift down next and prev and move
+		 * base so that it's right below new next and then
+		 * recheck.
+		 */
+		if (prev && prev->va_end > base + start)  {
+			next = prev;
+			prev = node_to_va(rb_prev(&next->rb_node));
+			base = pvm_determine_end(&next, &prev, align) - end;
+			term_area = area;
+			continue;
+		}
+
+		/*
+		 * This area fits, move on to the previous one.  If
+		 * the previous one is the terminal one, we're done.
+		 */
+		area = (area + nr_vms - 1) % nr_vms;
+		if (area == term_area)
+			break;
+		start = offsets[area];
+		end = start + sizes[area];
+		pvm_find_next_prev(base + end, &next, &prev);
+	}
+found:
+	/* we've found a fitting base, insert all va's */
+	for (area = 0; area < nr_vms; area++) {
+		struct vmap_area *va = vas[area];
+
+		va->va_start = base + offsets[area];
+		va->va_end = va->va_start + sizes[area];
+		__insert_vmap_area(va);
+	}
+
+	vmap_area_pcpu_hole = base + offsets[last_area];
+
+	spin_unlock(&vmap_area_lock);
+
+	/* insert all vm's */
+	for (area = 0; area < nr_vms; area++)
+		setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
+				 pcpu_get_vm_areas);
+
+	/* only the temporary pointer array is freed, the vas stay in use */
+	kfree(vas);
+	return vms;
+
+err_free:
+	for (area = 0; area < nr_vms; area++) {
+		kfree(vas[area]);
+		kfree(vms[area]);
+	}
+err_free2:
+	kfree(vas);
+	kfree(vms);
+	return NULL;
+}
+
+/**
+ * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
+ * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
+ * @nr_vms: the number of allocated areas
+ *
+ * Free vm_structs and the array allocated by pcpu_get_vm_areas().
+ */
+void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
+{
+	int i;
+
+	for (i = 0; i < nr_vms; i++)
+		free_vm_area(vms[i]);
+	kfree(vms);
+}
+#endif	/* CONFIG_SMP */
+
+#ifdef CONFIG_PROC_FS
+/*
+ * seq_file ->start: take vmap_area_lock and return the *pos'th entry of
+ * vmap_area_list, or NULL when the list has fewer entries.  The lock is
+ * not dropped here; s_stop() releases it.
+ */
+static void *s_start(struct seq_file *m, loff_t *pos)
+	__acquires(&vmap_area_lock)
+{
+	loff_t n = *pos;
+	struct vmap_area *va;
+
+	spin_lock(&vmap_area_lock);
+	va = list_entry((&vmap_area_list)->next, typeof(*va), list);
+	while (n > 0 && &va->list != &vmap_area_list) {
+		n--;
+		va = list_entry(va->list.next, typeof(*va), list);
+	}
+	if (!n && &va->list != &vmap_area_list)
+		return va;
+
+	return NULL;
+
+}
+
+/* seq_file ->next: advance *pos and return the following entry or NULL. */
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct vmap_area *va = p, *next;
+
+	++*pos;
+	next = list_entry(va->list.next, typeof(*va), list);
+	if (&next->list != &vmap_area_list)
+		return next;
+
+	return NULL;
+}
+
+/* seq_file ->stop: release the lock taken in s_start(). */
+static void s_stop(struct seq_file *m, void *p)
+	__releases(&vmap_area_lock)
+{
+	spin_unlock(&vmap_area_lock);
+}
+
+/*
+ * Print " N<node>=<pages>" for every node that backs pages of @v.  Does
+ * nothing without CONFIG_NUMA, without a counter buffer in m->private,
+ * or while @v is still flagged VM_UNINITIALIZED.
+ */
+static void show_numa_info(struct seq_file *m, struct vm_struct *v)
+{
+	if (IS_ENABLED(CONFIG_NUMA)) {
+		unsigned int nr, *counters = m->private;
+
+		if (!counters)
+			return;
+
+		if (v->flags & VM_UNINITIALIZED)
+			return;
+		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+		smp_rmb();
+
+		memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+
+		for (nr = 0; nr < v->nr_pages; nr++)
+			counters[page_to_nid(v->pages[nr])]++;
+
+		for_each_node_state(nr, N_HIGH_MEMORY)
+			if (counters[nr])
+				seq_printf(m, " N%u=%u", nr, counters[nr]);
+	}
+}
+
+/* seq_file ->show: emit one line describing the vm_struct behind @p. */
+static int s_show(struct seq_file *m, void *p)
+{
+	struct vmap_area *va = p;
+	struct vm_struct *v;
+
+	/*
+	 * s_show can race with remove_vm_area(): !VM_VM_AREA means the
+	 * vmap area is being torn down or is a vm_map_ram allocation.
+	 * Such entries are skipped.
+	 */
+	if (!(va->flags & VM_VM_AREA))
+		return 0;
+
+	v = va->vm;
+
+	seq_printf(m, "0x%pK-0x%pK %7ld",
+		v->addr, v->addr + v->size, v->size);
+
+	if (v->caller)
+		seq_printf(m, " %pS", v->caller);
+
+	if (v->nr_pages)
+		seq_printf(m, " pages=%d", v->nr_pages);
+
+	if (v->phys_addr)
+		seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
+
+	if (v->flags & VM_IOREMAP)
+		seq_puts(m, " ioremap");
+
+	if (v->flags & VM_ALLOC)
+		seq_puts(m, " vmalloc");
+
+	if (v->flags & VM_MAP)
+		seq_puts(m, " vmap");
+
+	if (v->flags & VM_USERMAP)
+		seq_puts(m, " user");
+
+	if (v->flags & VM_VPAGES)
+		seq_puts(m, " vpages");
+
+	show_numa_info(m, v);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static const struct seq_operations vmalloc_op = {
+	.start = s_start,
+	.next = s_next,
+	.stop = s_stop,
+	.show = s_show,
+};
+
+/*
+ * With CONFIG_NUMA the seq_file gets a private buffer holding one
+ * counter per node id; show_numa_info() uses it as scratch space.
+ */
+static int vmalloc_open(struct inode *inode, struct file *file)
+{
+	if (IS_ENABLED(CONFIG_NUMA))
+		return seq_open_private(file, &vmalloc_op,
+					nr_node_ids * sizeof(unsigned int));
+	else
+		return seq_open(file, &vmalloc_op);
+}
+
+static const struct file_operations proc_vmalloc_operations = {
+	.open		= vmalloc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+
+/* Create /proc/vmallocinfo, readable by its owner only (S_IRUSR). */
+static int __init proc_vmalloc_init(void)
+{
+	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
+	return 0;
+}
+module_init(proc_vmalloc_init);
+
+/*
+ * Fill @vmi with the number of bytes in use between VMALLOC_START and
+ * VMALLOC_END and with the largest gap found between areas there.
+ * Areas flagged VM_LAZY_FREE or VM_LAZY_FREEING are neither counted as
+ * used nor treated as the end of a gap.  The list is walked under
+ * rcu_read_lock() only.
+ */
+void get_vmalloc_info(struct vmalloc_info *vmi)
+{
+	struct vmap_area *va;
+	unsigned long free_area_size;
+	unsigned long prev_end;
+
+	vmi->used = 0;
+	vmi->largest_chunk = 0;
+
+	prev_end = VMALLOC_START;
+
+	rcu_read_lock();
+
+	if (list_empty(&vmap_area_list)) {
+		vmi->largest_chunk = VMALLOC_TOTAL;
+		goto out;
+	}
+
+	list_for_each_entry_rcu(va, &vmap_area_list, list) {
+		unsigned long addr = va->va_start;
+
+		/*
+		 * Some archs keep another range for modules in vmalloc space
+		 */
+		if (addr < VMALLOC_START)
+			continue;
+		if (addr >= VMALLOC_END)
+			break;
+
+		if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
+			continue;
+
+		vmi->used += (va->va_end - va->va_start);
+
+		free_area_size = addr - prev_end;
+		if (vmi->largest_chunk < free_area_size)
+			vmi->largest_chunk = free_area_size;
+
+		prev_end = va->va_end;
+	}
+
+	/* the gap between the last counted area and VMALLOC_END */
+	if (VMALLOC_END - prev_end > vmi->largest_chunk)
+		vmi->largest_chunk = VMALLOC_END - prev_end;
+
+out:
+	rcu_read_unlock();
+}
+#endif
+
diff --git a/kernel/mm/vmpressure.c b/kernel/mm/vmpressure.c
new file mode 100644
index 000000000..c5afd573d
--- /dev/null
+++ b/kernel/mm/vmpressure.c
@@ -0,0 +1,382 @@
+/*
+ * Linux VM pressure
+ *
+ * Copyright 2012 Linaro Ltd.
+ *		  Anton Vorontsov
+ *
+ * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
+ * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * The window size (vmpressure_win) is the number of scanned pages before
+ * we try to analyze scanned/reclaimed ratio. So the window is used as a
+ * rate-limit tunable for the "low" level notification, and also for
+ * averaging the ratio for medium/critical levels. Using small window
+ * sizes can cause lot of false positives, but too big window size will
+ * delay the notifications.
+ *
+ * As the vmscan reclaimer logic works with chunks which are multiple of
+ * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
+ *
+ * TODO: Make the window size depend on machine size, as we do for vmstat
+ * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
+ */
+static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+
+/*
+ * These thresholds are used when we account memory pressure through
+ * scanned/reclaimed ratio. The current values were chosen empirically.
In
+ * essence, they are percents: the higher the value, the greater the
+ * number of unsuccessful reclaims there were.
+ */
+static const unsigned int vmpressure_level_med = 60;
+static const unsigned int vmpressure_level_critical = 95;
+
+/*
+ * When there are too few pages left to scan, vmpressure() may miss the
+ * critical pressure as number of pages will be less than "window size".
+ * However, in that case the vmscan priority will rise fast as the
+ * reclaimer will try to scan LRUs more deeply.
+ *
+ * The vmscan logic considers these special priorities:
+ *
+ * prio == DEF_PRIORITY (12): reclaimer starts with that value
+ * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
+ * prio == 0                : close to OOM, kernel scans every page in an lru
+ *
+ * Any value in this range is acceptable for this tunable (i.e. from 12 to
+ * 0). Current value for the vmpressure_level_critical_prio is chosen
+ * empirically, but the number, in essence, means that we consider
+ * critical level when scanning depth is ~10% of the lru size (vmscan
+ * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
+ * eighth).
+ */ +static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); + +static struct vmpressure *work_to_vmpressure(struct work_struct *work) +{ + return container_of(work, struct vmpressure, work); +} + +static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) +{ + struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + memcg = parent_mem_cgroup(memcg); + if (!memcg) + return NULL; + return memcg_to_vmpressure(memcg); +} + +enum vmpressure_levels { + VMPRESSURE_LOW = 0, + VMPRESSURE_MEDIUM, + VMPRESSURE_CRITICAL, + VMPRESSURE_NUM_LEVELS, +}; + +static const char * const vmpressure_str_levels[] = { + [VMPRESSURE_LOW] = "low", + [VMPRESSURE_MEDIUM] = "medium", + [VMPRESSURE_CRITICAL] = "critical", +}; + +static enum vmpressure_levels vmpressure_level(unsigned long pressure) +{ + if (pressure >= vmpressure_level_critical) + return VMPRESSURE_CRITICAL; + else if (pressure >= vmpressure_level_med) + return VMPRESSURE_MEDIUM; + return VMPRESSURE_LOW; +} + +static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, + unsigned long reclaimed) +{ + unsigned long scale = scanned + reclaimed; + unsigned long pressure; + + /* + * We calculate the ratio (in percents) of how many pages were + * scanned vs. reclaimed in a given time frame (window). Note that + * time is in VM reclaimer's "ticks", i.e. number of pages + * scanned. This makes it possible to set desired reaction time + * and serves as a ratelimit. 
+ */ + pressure = scale - (reclaimed * scale / scanned); + pressure = pressure * 100 / scale; + + pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, + scanned, reclaimed); + + return vmpressure_level(pressure); +} + +struct vmpressure_event { + struct eventfd_ctx *efd; + enum vmpressure_levels level; + struct list_head node; +}; + +static bool vmpressure_event(struct vmpressure *vmpr, + unsigned long scanned, unsigned long reclaimed) +{ + struct vmpressure_event *ev; + enum vmpressure_levels level; + bool signalled = false; + + level = vmpressure_calc_level(scanned, reclaimed); + + mutex_lock(&vmpr->events_lock); + + list_for_each_entry(ev, &vmpr->events, node) { + if (level >= ev->level) { + eventfd_signal(ev->efd, 1); + signalled = true; + } + } + + mutex_unlock(&vmpr->events_lock); + + return signalled; +} + +static void vmpressure_work_fn(struct work_struct *work) +{ + struct vmpressure *vmpr = work_to_vmpressure(work); + unsigned long scanned; + unsigned long reclaimed; + + spin_lock(&vmpr->sr_lock); + /* + * Several contexts might be calling vmpressure(), so it is + * possible that the work was rescheduled again before the old + * work context cleared the counters. In that case we will run + * just after the old work returns, but then scanned might be zero + * here. No need for any locks here since we don't care if + * vmpr->reclaimed is in sync. + */ + scanned = vmpr->scanned; + if (!scanned) { + spin_unlock(&vmpr->sr_lock); + return; + } + + reclaimed = vmpr->reclaimed; + vmpr->scanned = 0; + vmpr->reclaimed = 0; + spin_unlock(&vmpr->sr_lock); + + do { + if (vmpressure_event(vmpr, scanned, reclaimed)) + break; + /* + * If not handled, propagate the event upward into the + * hierarchy. 
+ */ + } while ((vmpr = vmpressure_parent(vmpr))); +} + +/** + * vmpressure() - Account memory pressure through scanned/reclaimed ratio + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @scanned: number of pages scanned + * @reclaimed: number of pages reclaimed + * + * This function should be called from the vmscan reclaim path to account + * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw + * pressure index is then further refined and averaged over time. + * + * This function does not return any value. + */ +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, + unsigned long scanned, unsigned long reclaimed) +{ + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + + /* + * Here we only want to account pressure that userland is able to + * help us with. For example, suppose that DMA zone is under + * pressure; if we notify userland about that kind of pressure, + * then it will be mostly a waste as it will trigger unnecessary + * freeing of memory by userland (since userland is more likely to + * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That + * is why we include only movable, highmem and FS/IO pages. + * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so + * we account it too. + */ + if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) + return; + + /* + * If we got here with no pages scanned, then that is an indicator + * that reclaimer was unable to find any shrinkable LRUs at the + * current scanning depth. But it does not mean that we should + * report the critical pressure, yet. If the scanning priority + * (scanning depth) goes too high (deep), we will be notified + * through vmpressure_prio(). But so far, keep calm. 
+ */ + if (!scanned) + return; + + spin_lock(&vmpr->sr_lock); + vmpr->scanned += scanned; + vmpr->reclaimed += reclaimed; + scanned = vmpr->scanned; + spin_unlock(&vmpr->sr_lock); + + if (scanned < vmpressure_win) + return; + schedule_work(&vmpr->work); +} + +/** + * vmpressure_prio() - Account memory pressure through reclaimer priority level + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @prio: reclaimer's priority + * + * This function should be called from the reclaim path every time when + * the vmscan's reclaiming priority (scanning depth) changes. + * + * This function does not return any value. + */ +void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) +{ + /* + * We only use prio for accounting critical level. For more info + * see comment for vmpressure_level_critical_prio variable above. + */ + if (prio > vmpressure_level_critical_prio) + return; + + /* + * OK, the prio is below the threshold, updating vmpressure + * information before shrinker dives into long shrinking of long + * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 + * to the vmpressure() basically means that we signal 'critical' + * level. + */ + vmpressure(gfp, memcg, vmpressure_win, 0); +} + +/** + * vmpressure_register_event() - Bind vmpressure notifications to an eventfd + * @memcg: memcg that is interested in vmpressure notifications + * @eventfd: eventfd context to link notifications with + * @args: event arguments (used to set up a pressure level threshold) + * + * This function associates eventfd context with the vmpressure + * infrastructure, so that the notifications will be delivered to the + * @eventfd. The @args parameter is a string that denotes pressure level + * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or + * "critical"). + * + * To be used as memcg event method. 
+ */
+int vmpressure_register_event(struct mem_cgroup *memcg,
+			      struct eventfd_ctx *eventfd, const char *args)
+{
+	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+	struct vmpressure_event *entry;
+	int level = 0;
+
+	/* look @args up in the table of level names */
+	while (level < VMPRESSURE_NUM_LEVELS &&
+	       strcmp(vmpressure_str_levels[level], args))
+		level++;
+
+	if (level >= VMPRESSURE_NUM_LEVELS)
+		return -EINVAL;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->efd = eventfd;
+	entry->level = level;
+
+	mutex_lock(&vmpr->events_lock);
+	list_add(&entry->node, &vmpr->events);
+	mutex_unlock(&vmpr->events_lock);
+
+	return 0;
+}
+
+/**
+ * vmpressure_unregister_event() - Unbind eventfd from vmpressure
+ * @memcg:	memcg handle
+ * @eventfd:	eventfd context that was used to link vmpressure with the @cg
+ *
+ * Removes the first listener registered for @eventfd from the
+ * vmpressure notification list and frees the bookkeeping entry (the
+ * @eventfd itself is not freed).
+ *
+ * To be used as memcg event method.
+ */
+void vmpressure_unregister_event(struct mem_cgroup *memcg,
+				 struct eventfd_ctx *eventfd)
+{
+	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+	struct vmpressure_event *entry;
+
+	mutex_lock(&vmpr->events_lock);
+	list_for_each_entry(entry, &vmpr->events, node) {
+		if (entry->efd == eventfd) {
+			list_del(&entry->node);
+			kfree(entry);
+			break;
+		}
+	}
+	mutex_unlock(&vmpr->events_lock);
+}
+
+/**
+ * vmpressure_init() - Initialize vmpressure control structure
+ * @vmpr:	Structure to be initialized
+ *
+ * This function should be called on every allocated vmpressure structure
+ * before any usage.
+ */
+void vmpressure_init(struct vmpressure *vmpr)
+{
+	spin_lock_init(&vmpr->sr_lock);
+	mutex_init(&vmpr->events_lock);
+	INIT_LIST_HEAD(&vmpr->events);
+	INIT_WORK(&vmpr->work, vmpressure_work_fn);
+}
+
+/**
+ * vmpressure_cleanup() - shuts down vmpressure control structure
+ * @vmpr:	Structure to be cleaned up
+ *
+ * This function should be called before the structure in which it is
+ * embedded is cleaned up.
+ */
+void vmpressure_cleanup(struct vmpressure *vmpr)
+{
+	/*
+	 * Make sure there is no pending work before eventfd infrastructure
+	 * goes away.
+	 */
+	flush_work(&vmpr->work);
+}
diff --git a/kernel/mm/vmscan.c b/kernel/mm/vmscan.c
new file mode 100644
index 000000000..5e8eadd71
--- /dev/null
+++ b/kernel/mm/vmscan.c
@@ -0,0 +1,3828 @@
+/*
+ *  linux/mm/vmscan.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  Swap reorganised 29.12.95, Stephen Tweedie.
+ *  kswapd added: 7.1.96  sct
+ *  Removed kswapd_ctl limits, and swap out as many pages as needed
+ *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
+ *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
+ *  Multiqueue VM started 5.8.00, Rik van Riel.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include	/* for try_to_release_page(),
+					buffer_heads_over_limit */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+#include
+
+/* Parameters and running totals of a single reclaim invocation. */
+struct scan_control {
+	/* How many pages shrink_list() should reclaim */
+	unsigned long nr_to_reclaim;
+
+	/* This context's GFP mask */
+	gfp_t gfp_mask;
+
+	/* Allocation order */
+	int order;
+
+	/*
+	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
+	 * are scanned.
+	 */
+	nodemask_t	*nodemask;
+
+	/*
+	 * The memory cgroup that hit its limit and as a result is the
+	 * primary target of this reclaim invocation.
+	 */
+	struct mem_cgroup *target_mem_cgroup;
+
+	/* Scan (total_size >> priority) pages at once */
+	int priority;
+
+	unsigned int may_writepage:1;
+
+	/* Can mapped pages be reclaimed? */
+	unsigned int may_unmap:1;
+
+	/* Can pages be swapped as part of reclaim? */
+	unsigned int may_swap:1;
+
+	/* Can cgroups be reclaimed below their normal consumption range? */
+	unsigned int may_thrash:1;
+
+	unsigned int hibernation_mode:1;
+
+	/* One of the zones is ready for compaction */
+	unsigned int compaction_ready:1;
+
+	/* Incremented by the number of inactive pages that were scanned */
+	unsigned long nr_scanned;
+
+	/* Number of pages freed so far during a call to shrink_zones() */
+	unsigned long nr_reclaimed;
+};
+
+/* The page at the tail (->prev) of the given list head. */
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+
+/*
+ * Prefetch @_field of the page preceding @_page on its list unless
+ * @_page is the first entry after @_base; compiles to nothing when the
+ * architecture does not define ARCH_HAS_PREFETCH.
+ */
+#ifdef ARCH_HAS_PREFETCH
+#define prefetch_prev_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.prev != _base) {			\
+			struct page *prev;				\
+									\
+			prev = lru_to_page(&(_page->lru));		\
+			prefetch(&prev->_field);			\
+		}							\
+	} while (0)
+#else
+#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+/* Same as above, using prefetchw() and gated by ARCH_HAS_PREFETCHW. */
+#ifdef ARCH_HAS_PREFETCHW
+#define prefetchw_prev_lru_page(_page, _base, _field)			\
+	do {								\
+		if ((_page)->lru.prev != _base) {			\
+			struct page *prev;				\
+									\
+			prev = lru_to_page(&(_page->lru));		\
+			prefetchw(&prev->_field);			\
+		}							\
+	} while (0)
+#else
+#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
+#endif
+
+/*
+ * From 0 .. 100.  Higher means more swappy.
+ */
+int vm_swappiness = 60;
+/*
+ * The total number of pages which are beyond the high watermark within all
+ * zones.
+ */ +unsigned long vm_total_pages; + +static LIST_HEAD(shrinker_list); +static DECLARE_RWSEM(shrinker_rwsem); + +#ifdef CONFIG_MEMCG +static bool global_reclaim(struct scan_control *sc) +{ + return !sc->target_mem_cgroup; +} +#else +static bool global_reclaim(struct scan_control *sc) +{ + return true; +} +#endif + +static unsigned long zone_reclaimable_pages(struct zone *zone) +{ + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (get_nr_swap_pages() > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; +} + +bool zone_reclaimable(struct zone *zone) +{ + return zone_page_state(zone, NR_PAGES_SCANNED) < + zone_reclaimable_pages(zone) * 6; +} + +static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) +{ + if (!mem_cgroup_disabled()) + return mem_cgroup_get_lru_size(lruvec, lru); + + return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); +} + +/* + * Add a shrinker callback to be called from the vm. + */ +int register_shrinker(struct shrinker *shrinker) +{ + size_t size = sizeof(*shrinker->nr_deferred); + + /* + * If we only have one possible node in the system anyway, save + * ourselves the trouble and disable NUMA aware behavior. This way we + * will save memory and some small loop time later. 
+ */ + if (nr_node_ids == 1) + shrinker->flags &= ~SHRINKER_NUMA_AWARE; + + if (shrinker->flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + return -ENOMEM; + + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + up_write(&shrinker_rwsem); + return 0; +} +EXPORT_SYMBOL(register_shrinker); + +/* + * Remove one + */ +void unregister_shrinker(struct shrinker *shrinker) +{ + down_write(&shrinker_rwsem); + list_del(&shrinker->list); + up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); +} +EXPORT_SYMBOL(unregister_shrinker); + +#define SHRINK_BATCH 128 + +static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, + struct shrinker *shrinker, + unsigned long nr_scanned, + unsigned long nr_eligible) +{ + unsigned long freed = 0; + unsigned long long delta; + long total_scan; + long freeable; + long nr; + long new_nr; + int nid = shrinkctl->nid; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; + + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0) + return 0; + + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + + total_scan = nr; + delta = (4 * nr_scanned) / shrinker->seeks; + delta *= freeable; + do_div(delta, nr_eligible + 1); + total_scan += delta; + if (total_scan < 0) { + pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->scan_objects, total_scan); + total_scan = freeable; + } + + /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. 
This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * freeable. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < freeable / 4) + total_scan = min(total_scan, freeable / 2); + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (total_scan > freeable * 2) + total_scan = freeable * 2; + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + nr_scanned, nr_eligible, + freeable, delta, total_scan); + + /* + * Normally, we should not scan less than batch_size objects in one + * pass to avoid too frequent shrinker calls, but if the slab has less + * than batch_size objects in total and we are really tight on memory, + * we will try to reclaim all available objects, otherwise we can end + * up failing allocations although there are plenty of reclaimable + * objects spread over several slabs with usage less than the + * batch_size. + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater + * than the total number of objects on slab (freeable), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. + */ + while (total_scan >= batch_size || + total_scan >= freeable) { + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + + shrinkctl->nr_to_scan = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; + + count_vm_events(SLABS_SCANNED, nr_to_scan); + total_scan -= nr_to_scan; + + cond_resched(); + } + + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. 
If we exhausted the + * scan, there is no need to do an update. + */ + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_deferred[nid]); + else + new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + + trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); + return freed; +} + +/** + * shrink_slab - shrink slab caches + * @gfp_mask: allocation context + * @nid: node whose slab caches to target + * @memcg: memory cgroup whose slab caches to target + * @nr_scanned: pressure numerator + * @nr_eligible: pressure denominator + * + * Call the shrink functions to age shrinkable caches. + * + * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, + * unaware shrinkers will receive a node id of 0 instead. + * + * @memcg specifies the memory cgroup to target. If it is not NULL, + * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan + * objects from the memory cgroup specified. Otherwise all shrinkers + * are called, and memcg aware shrinkers are supposed to scan the + * global list then. + * + * @nr_scanned and @nr_eligible form a ratio that indicates how much of + * the available objects should be scanned. Page reclaim for example + * passes the number of pages scanned and the number of pages on the + * LRU lists that it considered on @nid, plus a bias in @nr_scanned + * when it encountered mapped pages. The ratio is further biased by + * the ->seeks setting of the shrink function, which indicates the + * cost to recreate an object relative to that of an LRU page. + * + * Returns the number of reclaimed slab objects. 
+ */ +static unsigned long shrink_slab(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, + unsigned long nr_scanned, + unsigned long nr_eligible) +{ + struct shrinker *shrinker; + unsigned long freed = 0; + + if (memcg && !memcg_kmem_is_active(memcg)) + return 0; + + if (nr_scanned == 0) + nr_scanned = SWAP_CLUSTER_MAX; + + if (!down_read_trylock(&shrinker_rwsem)) { + /* + * If we would return 0, our callers would understand that we + * have nothing else to shrink and give up trying. By returning + * 1 we keep it going and assume we'll be able to shrink next + * time. + */ + freed = 1; + goto out; + } + + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + + if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) + continue; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + sc.nid = 0; + + freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); + } + + up_read(&shrinker_rwsem); +out: + cond_resched(); + return freed; +} + +void drop_slab_node(int nid) +{ + unsigned long freed; + + do { + struct mem_cgroup *memcg = NULL; + + freed = 0; + do { + freed += shrink_slab(GFP_KERNEL, nid, memcg, + 1000, 1000); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + } while (freed > 10); +} + +void drop_slab(void) +{ + int nid; + + for_each_online_node(nid) + drop_slab_node(nid); +} + +static inline int is_page_cache_freeable(struct page *page) +{ + /* + * A freeable page cache page is referenced only by the caller + * that isolated the page, the page cache radix tree and + * optional buffer heads at page->private. 
+ */ + return page_count(page) - page_has_private(page) == 2; +} + +static int may_write_to_queue(struct backing_dev_info *bdi, + struct scan_control *sc) +{ + if (current->flags & PF_SWAPWRITE) + return 1; + if (!bdi_write_congested(bdi)) + return 1; + if (bdi == current->backing_dev_info) + return 1; + return 0; +} + +/* + * We detected a synchronous write error writing a page out. Probably + * -ENOSPC. We need to propagate that into the address_space for a subsequent + * fsync(), msync() or close(). + * + * The tricky part is that after writepage we cannot touch the mapping: nothing + * prevents it from being freed up. But we have a ref on the page and once + * that page is locked, the mapping is pinned. + * + * We're allowed to run sleeping lock_page() here because we know the caller has + * __GFP_FS. + */ +static void handle_write_error(struct address_space *mapping, + struct page *page, int error) +{ + lock_page(page); + if (page_mapping(page) == mapping) + mapping_set_error(mapping, error); + unlock_page(page); +} + +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + +/* + * pageout is called by shrink_page_list() for each dirty page. + * Calls ->writepage(). + */ +static pageout_t pageout(struct page *page, struct address_space *mapping, + struct scan_control *sc) +{ + /* + * If the page is dirty, only perform writeback if that write + * will be non-blocking. To prevent this allocation from being + * stalled by pagecache activity. But note that there may be + * stalls if we need to run get_block(). We could test + * PagePrivate for that. 
+ * + * If this process is currently in __generic_file_write_iter() against + * this page's queue, we can perform writeback even if that + * will block. + * + * If the page is swapcache, write it back even if that would + * block, for some throttling. This happens by accident, because + * swap_backing_dev_info is bust: it doesn't reflect the + * congestion state of the swapdevs. Easy to fix, if needed. + */ + if (!is_page_cache_freeable(page)) + return PAGE_KEEP; + if (!mapping) { + /* + * Some data journaling orphaned pages can have + * page->mapping == NULL while being dirty with clean buffers. + */ + if (page_has_private(page)) { + if (try_to_free_buffers(page)) { + ClearPageDirty(page); + pr_info("%s: orphaned page\n", __func__); + return PAGE_CLEAN; + } + } + return PAGE_KEEP; + } + if (mapping->a_ops->writepage == NULL) + return PAGE_ACTIVATE; + if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) + return PAGE_KEEP; + + if (clear_page_dirty_for_io(page)) { + int res; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = SWAP_CLUSTER_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + .for_reclaim = 1, + }; + + SetPageReclaim(page); + res = mapping->a_ops->writepage(page, &wbc); + if (res < 0) + handle_write_error(mapping, page, res); + if (res == AOP_WRITEPAGE_ACTIVATE) { + ClearPageReclaim(page); + return PAGE_ACTIVATE; + } + + if (!PageWriteback(page)) { + /* synchronous write or broken a_ops? */ + ClearPageReclaim(page); + } + trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); + inc_zone_page_state(page, NR_VMSCAN_WRITE); + return PAGE_SUCCESS; + } + + return PAGE_CLEAN; +} + +/* + * Same as remove_mapping, but if the page is removed from the mapping, it + * gets returned with a refcount of 0. 
+ */ +static int __remove_mapping(struct address_space *mapping, struct page *page, + bool reclaimed) +{ + BUG_ON(!PageLocked(page)); + BUG_ON(mapping != page_mapping(page)); + + spin_lock_irq(&mapping->tree_lock); + /* + * The non racy check for a busy page. + * + * Must be careful with the order of the tests. When someone has + * a ref to the page, it may be possible that they dirty it then + * drop the reference. So if PageDirty is tested before page_count + * here, then the following race may occur: + * + * get_user_pages(&page); + * [user mapping goes away] + * write_to(page); + * !PageDirty(page) [good] + * SetPageDirty(page); + * put_page(page); + * !page_count(page) [good, discard it] + * + * [oops, our write_to data is lost] + * + * Reversing the order of the tests ensures such a situation cannot + * escape unnoticed. The smp_rmb is needed to ensure the page->flags + * load is not satisfied before that of page->_count. + * + * Note that if SetPageDirty is always performed via set_page_dirty, + * and thus under tree_lock, then this ordering is not required. + */ + if (!page_freeze_refs(page, 2)) + goto cannot_free; + /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ + if (unlikely(PageDirty(page))) { + page_unfreeze_refs(page, 2); + goto cannot_free; + } + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; + mem_cgroup_swapout(page, swap); + __delete_from_swap_cache(page); + spin_unlock_irq(&mapping->tree_lock); + swapcache_free(swap); + } else { + void (*freepage)(struct page *); + void *shadow = NULL; + + freepage = mapping->a_ops->freepage; + /* + * Remember a shadow entry for reclaimed file cache in + * order to detect refaults, thus thrashing, later on. + * + * But don't store shadows in an address space that is + * already exiting. This is not just an optimization, + * inode reclaim needs to empty out the radix tree or + * the nodes are lost. Don't plant shadows behind its + * back. 
+ */ + if (reclaimed && page_is_file_cache(page) && + !mapping_exiting(mapping)) + shadow = workingset_eviction(mapping, page); + __delete_from_page_cache(page, shadow); + spin_unlock_irq(&mapping->tree_lock); + + if (freepage != NULL) + freepage(page); + } + + return 1; + +cannot_free: + spin_unlock_irq(&mapping->tree_lock); + return 0; +} + +/* + * Attempt to detach a locked page from its ->mapping. If it is dirty or if + * someone else has a ref on the page, abort and return 0. If it was + * successfully detached, return 1. Assumes the caller has a single ref on + * this page. + */ +int remove_mapping(struct address_space *mapping, struct page *page) +{ + if (__remove_mapping(mapping, page, false)) { + /* + * Unfreezing the refcount with 1 rather than 2 effectively + * drops the pagecache ref for us without requiring another + * atomic operation. + */ + page_unfreeze_refs(page, 1); + return 1; + } + return 0; +} + +/** + * putback_lru_page - put previously isolated page onto appropriate LRU list + * @page: page to be put back to appropriate lru list + * + * Add previously isolated @page to appropriate LRU list. + * Page may still be unevictable for other reasons. + * + * lru_lock must not be held, interrupts must be enabled. + */ +void putback_lru_page(struct page *page) +{ + bool is_unevictable; + int was_unevictable = PageUnevictable(page); + + VM_BUG_ON_PAGE(PageLRU(page), page); + +redo: + ClearPageUnevictable(page); + + if (page_evictable(page)) { + /* + * For evictable pages, we can use the cache. + * In event of a race, worst case is we end up with an + * unevictable page on [in]active list. + * We know how to handle that. + */ + is_unevictable = false; + lru_cache_add(page); + } else { + /* + * Put unevictable pages directly on zone's unevictable + * list. 
+ */ + is_unevictable = true; + add_page_to_unevictable_list(page); + /* + * When racing with an mlock or AS_UNEVICTABLE clearing + * (page is unlocked) make sure that if the other thread + * does not observe our setting of PG_lru and fails + * isolation/check_move_unevictable_pages, + * we see PG_mlocked/AS_UNEVICTABLE cleared below and move + * the page back to the evictable list. + * + * The other side is TestClearPageMlocked() or shmem_lock(). + */ + smp_mb(); + } + + /* + * page's status can change while we move it among lru. If an evictable + * page is on unevictable list, it will never be freed. To avoid that, + * check after we added it to the list, again. + */ + if (is_unevictable && page_evictable(page)) { + if (!isolate_lru_page(page)) { + put_page(page); + goto redo; + } + /* This means someone else dropped this page from LRU. + * So, it will be freed or putback to LRU again. There is + * nothing to do here. + */ + } + + if (was_unevictable && !is_unevictable) + count_vm_event(UNEVICTABLE_PGRESCUED); + else if (!was_unevictable && is_unevictable) + count_vm_event(UNEVICTABLE_PGCULLED); + + put_page(page); /* drop ref from isolate */ +} + +enum page_references { + PAGEREF_RECLAIM, + PAGEREF_RECLAIM_CLEAN, + PAGEREF_KEEP, + PAGEREF_ACTIVATE, +}; + +static enum page_references page_check_references(struct page *page, + struct scan_control *sc) +{ + int referenced_ptes, referenced_page; + unsigned long vm_flags; + + referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, + &vm_flags); + referenced_page = TestClearPageReferenced(page); + + /* + * Mlock lost the isolation race with us. Let try_to_unmap() + * move the page to the unevictable list. 
+ */ + if (vm_flags & VM_LOCKED) + return PAGEREF_RECLAIM; + + if (referenced_ptes) { + if (PageSwapBacked(page)) + return PAGEREF_ACTIVATE; + /* + * All mapped pages start out with page table + * references from the instantiating fault, so we need + * to look twice if a mapped file page is used more + * than once. + * + * Mark it and spare it for another trip around the + * inactive list. Another page table reference will + * lead to its activation. + * + * Note: the mark is set for activated pages as well + * so that recently deactivated but used pages are + * quickly recovered. + */ + SetPageReferenced(page); + + if (referenced_page || referenced_ptes > 1) + return PAGEREF_ACTIVATE; + + /* + * Activate file-backed executable pages after first usage. + */ + if (vm_flags & VM_EXEC) + return PAGEREF_ACTIVATE; + + return PAGEREF_KEEP; + } + + /* Reclaim if clean, defer dirty pages to writeback */ + if (referenced_page && !PageSwapBacked(page)) + return PAGEREF_RECLAIM_CLEAN; + + return PAGEREF_RECLAIM; +} + +/* Check if a page is dirty or under writeback */ +static void page_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct address_space *mapping; + + /* + * Anonymous pages are not handled by flushers and must be written + * from reclaim context. 
Do not stall reclaim based on them + */ + if (!page_is_file_cache(page)) { + *dirty = false; + *writeback = false; + return; + } + + /* By default assume that the page flags are accurate */ + *dirty = PageDirty(page); + *writeback = PageWriteback(page); + + /* Verify dirty/writeback state if the filesystem supports it */ + if (!page_has_private(page)) + return; + + mapping = page_mapping(page); + if (mapping && mapping->a_ops->is_dirty_writeback) + mapping->a_ops->is_dirty_writeback(page, dirty, writeback); +} + +/* + * shrink_page_list() returns the number of reclaimed pages + */ +static unsigned long shrink_page_list(struct list_head *page_list, + struct zone *zone, + struct scan_control *sc, + enum ttu_flags ttu_flags, + unsigned long *ret_nr_dirty, + unsigned long *ret_nr_unqueued_dirty, + unsigned long *ret_nr_congested, + unsigned long *ret_nr_writeback, + unsigned long *ret_nr_immediate, + bool force_reclaim) +{ + LIST_HEAD(ret_pages); + LIST_HEAD(free_pages); + int pgactivate = 0; + unsigned long nr_unqueued_dirty = 0; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; + unsigned long nr_reclaimed = 0; + unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; + + cond_resched(); + + while (!list_empty(page_list)) { + struct address_space *mapping; + struct page *page; + int may_enter_fs; + enum page_references references = PAGEREF_RECLAIM_CLEAN; + bool dirty, writeback; + + cond_resched(); + + page = lru_to_page(page_list); + list_del(&page->lru); + + if (!trylock_page(page)) + goto keep; + + VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_PAGE(page_zone(page) != zone, page); + + sc->nr_scanned++; + + if (unlikely(!page_evictable(page))) + goto cull_mlocked; + + if (!sc->may_unmap && page_mapped(page)) + goto keep_locked; + + /* Double the slab pressure for mapped and swapcache pages */ + if (page_mapped(page) || PageSwapCache(page)) + sc->nr_scanned++; + + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && 
(sc->gfp_mask & __GFP_IO)); + + /* + * The number of dirty pages determines if a zone is marked + * reclaim_congested which affects wait_iff_congested. kswapd + * will stall and start writing pages if the tail of the LRU + * is all dirty unqueued pages. + */ + page_check_dirty_writeback(page, &dirty, &writeback); + if (dirty || writeback) + nr_dirty++; + + if (dirty && !writeback) + nr_unqueued_dirty++; + + /* + * Treat this page as congested if the underlying BDI is or if + * pages are cycling through the LRU so quickly that the + * pages marked for immediate reclaim are making it to the + * end of the LRU a second time. + */ + mapping = page_mapping(page); + if (((dirty || writeback) && mapping && + bdi_write_congested(inode_to_bdi(mapping->host))) || + (writeback && PageReclaim(page))) + nr_congested++; + + /* + * If a page at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number of pages + * under writeback and this page is both under writeback and + * PageReclaim then it indicates that pages are being queued + * for IO but are being recycled through the LRU before the + * IO can complete. Waiting on the page itself risks an + * indefinite stall if it is impossible to writeback the + * page due to IO error or disconnected storage so instead + * note that the LRU is being scanned too quickly and the + * caller can stall after page list has been processed. + * + * 2) Global reclaim encounters a page, memcg encounters a + * page that is not marked for immediate reclaim or + * the caller does not have __GFP_IO. In this case mark + * the page for immediate reclaim and continue scanning. + * + * __GFP_IO is checked because a loop driver thread might + * enter reclaim, and deadlock if it waits on a page for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. 
+ * + * Don't require __GFP_FS, since we're not going into the + * FS, just waiting on its writeback completion. Worryingly, + * ext4 gfs2 and xfs allocate pages with + * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing + * may_enter_fs here is liable to OOM on them. + * + * 3) memcg encounters a page that is not already marked + * PageReclaim. memcg does not have any dirty pages + * throttling so we could easily OOM just because too many + * pages are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + */ + if (PageWriteback(page)) { + /* Case 1 above */ + if (current_is_kswapd() && + PageReclaim(page) && + test_bit(ZONE_WRITEBACK, &zone->flags)) { + nr_immediate++; + goto keep_locked; + + /* Case 2 above */ + } else if (global_reclaim(sc) || + !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { + /* + * This is slightly racy - end_page_writeback() + * might have just cleared PageReclaim, then + * setting PageReclaim here end up interpreted + * as PageReadahead - but that does not matter + * enough to care. What we do want is for this + * page to have PageReclaim set next time memcg + * reclaim reaches the tests above, so it will + * then wait_on_page_writeback() to avoid OOM; + * and it's also appropriate in global reclaim. + */ + SetPageReclaim(page); + nr_writeback++; + + goto keep_locked; + + /* Case 3 above */ + } else { + wait_on_page_writeback(page); + } + } + + if (!force_reclaim) + references = page_check_references(page, sc); + + switch (references) { + case PAGEREF_ACTIVATE: + goto activate_locked; + case PAGEREF_KEEP: + goto keep_locked; + case PAGEREF_RECLAIM: + case PAGEREF_RECLAIM_CLEAN: + ; /* try to reclaim the page below */ + } + + /* + * Anonymous process memory has backing store? + * Try to allocate it some swap space here. 
+ */ + if (PageAnon(page) && !PageSwapCache(page)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (!add_to_swap(page, page_list)) + goto activate_locked; + may_enter_fs = 1; + + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } + + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. + */ + if (page_mapped(page) && mapping) { + switch (try_to_unmap(page, ttu_flags)) { + case SWAP_FAIL: + goto activate_locked; + case SWAP_AGAIN: + goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; + case SWAP_SUCCESS: + ; /* try to free the page below */ + } + } + + if (PageDirty(page)) { + /* + * Only kswapd can writeback filesystem pages to + * avoid risk of stack overflow but only writeback + * if many dirty pages have been encountered. + */ + if (page_is_file_cache(page) && + (!current_is_kswapd() || + !test_bit(ZONE_DIRTY, &zone->flags))) { + /* + * Immediately reclaim when written back. + * Similar in principle to deactivate_page() + * except we already have the page isolated + * and know it's dirty + */ + inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); + SetPageReclaim(page); + + goto keep_locked; + } + + if (references == PAGEREF_RECLAIM_CLEAN) + goto keep_locked; + if (!may_enter_fs) + goto keep_locked; + if (!sc->may_writepage) + goto keep_locked; + + /* Page is dirty, try to write it out here */ + switch (pageout(page, mapping, sc)) { + case PAGE_KEEP: + goto keep_locked; + case PAGE_ACTIVATE: + goto activate_locked; + case PAGE_SUCCESS: + if (PageWriteback(page)) + goto keep; + if (PageDirty(page)) + goto keep; + + /* + * A synchronous write - probably a ramdisk. Go + * ahead and try to reclaim the page. 
+ */ + if (!trylock_page(page)) + goto keep; + if (PageDirty(page) || PageWriteback(page)) + goto keep_locked; + mapping = page_mapping(page); + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + /* + * If the page has buffers, try to free the buffer mappings + * associated with this page. If we succeed we try to free + * the page as well. + * + * We do this even if the page is PageDirty(). + * try_to_release_page() does not perform I/O, but it is + * possible for a page to have PageDirty set, but it is actually + * clean (all its buffers are clean). This happens if the + * buffers were written out directly, with submit_bh(). ext3 + * will do this, as well as the blockdev mapping. + * try_to_release_page() will discover that cleanness and will + * drop the buffers and mark the page clean - it can be freed. + * + * Rarely, pages can have buffers and no ->mapping. These are + * the pages which were not successfully invalidated in + * truncate_complete_page(). We try to drop those buffers here + * and if that worked, and the page is no longer mapped into + * process address space (page_count == 1) it can be freed. + * Otherwise, leave the page on the LRU so it is swappable. + */ + if (page_has_private(page)) { + if (!try_to_release_page(page, sc->gfp_mask)) + goto activate_locked; + if (!mapping && page_count(page) == 1) { + unlock_page(page); + if (put_page_testzero(page)) + goto free_it; + else { + /* + * rare race with speculative reference. + * the speculative reference will free + * this page shortly, so we may + * increment nr_reclaimed here (and + * leave it off the LRU). + */ + nr_reclaimed++; + continue; + } + } + } + + if (!mapping || !__remove_mapping(mapping, page, true)) + goto keep_locked; + + /* + * At this point, we have no other references and there is + * no way to pick any more up (removed from LRU, removed + * from pagecache). 
Can use non-atomic bitops now (and + * we obviously don't have to worry about waking up a process + * waiting on the page lock, because there are no references. + */ + __clear_page_locked(page); +free_it: + nr_reclaimed++; + + /* + * Is there need to periodically free_page_list? It would + * appear not as the counts should be low + */ + list_add(&page->lru, &free_pages); + continue; + +cull_mlocked: + if (PageSwapCache(page)) + try_to_free_swap(page); + unlock_page(page); + putback_lru_page(page); + continue; + +activate_locked: + /* Not a candidate for swapping, so reclaim swap space. */ + if (PageSwapCache(page) && vm_swap_full()) + try_to_free_swap(page); + VM_BUG_ON_PAGE(PageActive(page), page); + SetPageActive(page); + pgactivate++; +keep_locked: + unlock_page(page); +keep: + list_add(&page->lru, &ret_pages); + VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); + } + + mem_cgroup_uncharge_list(&free_pages); + free_hot_cold_page_list(&free_pages, true); + + list_splice(&ret_pages, page_list); + count_vm_events(PGACTIVATE, pgactivate); + + *ret_nr_dirty += nr_dirty; + *ret_nr_congested += nr_congested; + *ret_nr_unqueued_dirty += nr_unqueued_dirty; + *ret_nr_writeback += nr_writeback; + *ret_nr_immediate += nr_immediate; + return nr_reclaimed; +} + +unsigned long reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_unmap = 1, + }; + unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; + struct page *page, *next; + LIST_HEAD(clean_pages); + + list_for_each_entry_safe(page, next, page_list, lru) { + if (page_is_file_cache(page) && !PageDirty(page) && + !isolated_balloon_page(page)) { + ClearPageActive(page); + list_move(&page->lru, &clean_pages); + } + } + + ret = shrink_page_list(&clean_pages, zone, &sc, + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); + list_splice(&clean_pages, page_list); + 
mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + return ret; +} + +/* + * Attempt to remove the specified page from its LRU. Only take this page + * if it is of the appropriate PageActive status. Pages which are being + * freed elsewhere are also ignored. + * + * page: page to consider + * mode: one of the LRU isolation modes defined above + * + * returns 0 on success, -ve errno on failure. + */ +int __isolate_lru_page(struct page *page, isolate_mode_t mode) +{ + int ret = -EINVAL; + + /* Only take pages on the LRU. */ + if (!PageLRU(page)) + return ret; + + /* Compaction should not handle unevictable pages but CMA can do so */ + if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) + return ret; + + ret = -EBUSY; + + /* + * To minimise LRU disruption, the caller can indicate that it only + * wants to isolate pages it will be able to operate on without + * blocking - clean pages for the most part. + * + * ISOLATE_CLEAN means that only clean pages should be isolated. This + * is used by reclaim when it cannot write to backing storage. + * + * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages + * that it is possible to migrate without blocking. + */ + if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + /* All the caller can do on PageWriteback is block */ + if (PageWriteback(page)) + return ret; + + if (PageDirty(page)) { + struct address_space *mapping; + + /* ISOLATE_CLEAN means only clean pages */ + if (mode & ISOLATE_CLEAN) + return ret; + + /* + * Only pages without mappings or that have a + * ->migratepage callback are possible to migrate + * without blocking + */ + mapping = page_mapping(page); + if (mapping && !mapping->a_ops->migratepage) + return ret; + } + } + + if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) + return ret; + + if (likely(get_page_unless_zero(page))) { + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. 
+ */ + ClearPageLRU(page); + ret = 0; + } + + return ret; +} + +/* + * zone->lru_lock is heavily contended. Some of the functions that + * shrink the lists perform better by taking out a batch of pages + * and working on them outside the LRU lock. + * + * For pagecache intensive workloads, this function is the hottest + * spot in the kernel (apart from copy_*_user functions). + * + * Appropriate locks must be held before calling this function. + * + * @nr_to_scan: The number of pages to look through on the list. + * @lruvec: The LRU vector to pull pages from. + * @dst: The temp list to put pages on to. + * @nr_scanned: The number of pages that were scanned. + * @sc: The scan_control struct for this reclaim session + * @mode: One of the LRU isolation modes + * @lru: LRU list id for isolating + * + * returns how many pages were moved onto *@dst. + */ +static unsigned long isolate_lru_pages(unsigned long nr_to_scan, + struct lruvec *lruvec, struct list_head *dst, + unsigned long *nr_scanned, struct scan_control *sc, + isolate_mode_t mode, enum lru_list lru) +{ + struct list_head *src = &lruvec->lists[lru]; + unsigned long nr_taken = 0; + unsigned long scan; + + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + struct page *page; + int nr_pages; + + page = lru_to_page(src); + prefetchw_prev_lru_page(page, src, flags); + + VM_BUG_ON_PAGE(!PageLRU(page), page); + + switch (__isolate_lru_page(page, mode)) { + case 0: + nr_pages = hpage_nr_pages(page); + mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); + list_move(&page->lru, dst); + nr_taken += nr_pages; + break; + + case -EBUSY: + /* else it is being freed elsewhere */ + list_move(&page->lru, src); + continue; + + default: + BUG(); + } + } + + *nr_scanned = scan; + trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, + nr_taken, mode, is_file_lru(lru)); + return nr_taken; +} + +/** + * isolate_lru_page - tries to isolate a page from its LRU list + * @page: page to isolate from its LRU list + * + * 
Isolates a @page from an LRU list, clears PageLRU and adjusts the + * vmstat statistic corresponding to whatever LRU list the page was on. + * + * Returns 0 if the page was removed from an LRU list. + * Returns -EBUSY if the page was not on an LRU list. + * + * The returned page will have PageLRU() cleared. If it was found on + * the active list, it will have PageActive set. If it was found on + * the unevictable list, it will have the PageUnevictable bit set. That flag + * may need to be cleared by the caller before letting the page go. + * + * The vmstat statistic corresponding to the list on which the page was + * found will be decremented. + * + * Restrictions: + * (1) Must be called with an elevated refcount on the page. This is a + * fundamental difference from isolate_lru_pages (which is called + * without a stable reference). + * (2) the lru_lock must not be held. + * (3) interrupts must be enabled. + */ +int isolate_lru_page(struct page *page) +{ + int ret = -EBUSY; + + VM_BUG_ON_PAGE(!page_count(page), page); + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + struct lruvec *lruvec; + + spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(page, zone); + if (PageLRU(page)) { + int lru = page_lru(page); + get_page(page); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, lru); + ret = 0; + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + +/* + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get rescheduled. When there are massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will go small and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM. 
+ */ +static int too_many_isolated(struct zone *zone, int file, + struct scan_control *sc) +{ + unsigned long inactive, isolated; + + if (current_is_kswapd()) + return 0; + + if (!global_reclaim(sc)) + return 0; + + if (file) { + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + isolated = zone_page_state(zone, NR_ISOLATED_FILE); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_ANON); + } + + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + inactive >>= 3; + + return isolated > inactive; +} + +static noinline_for_stack void +putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) +{ + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + struct zone *zone = lruvec_zone(lruvec); + LIST_HEAD(pages_to_free); + + /* + * Put back any unfreeable pages. 
+ */ + while (!list_empty(page_list)) { + struct page *page = lru_to_page(page_list); + int lru; + + VM_BUG_ON_PAGE(PageLRU(page), page); + list_del(&page->lru); + if (unlikely(!page_evictable(page))) { + spin_unlock_irq(&zone->lru_lock); + putback_lru_page(page); + spin_lock_irq(&zone->lru_lock); + continue; + } + + lruvec = mem_cgroup_page_lruvec(page, zone); + + SetPageLRU(page); + lru = page_lru(page); + add_page_to_lru_list(page, lruvec, lru); + + if (is_active_lru(lru)) { + int file = is_file_lru(lru); + int numpages = hpage_nr_pages(page); + reclaim_stat->recent_rotated[file] += numpages; + } + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(page, lruvec, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, &pages_to_free); + } + } + + /* + * To save our caller's stack, now use input list for pages to free. + */ + list_splice(&pages_to_free, page_list); +} + +/* + * If a kernel thread (such as nfsd for loop-back mounts) services + * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * In that case we should only throttle if the backing device it is + * writing to is congested. In other cases it is safe to throttle. + */ +static int current_may_throttle(void) +{ + return !(current->flags & PF_LESS_THROTTLE) || + current->backing_dev_info == NULL || + bdi_write_congested(current->backing_dev_info); +} + +/* + * shrink_inactive_list() is a helper for shrink_zone(). 
It returns the number + * of reclaimed pages + */ +static noinline_for_stack unsigned long +shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru) +{ + LIST_HEAD(page_list); + unsigned long nr_scanned; + unsigned long nr_reclaimed = 0; + unsigned long nr_taken; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; + unsigned long nr_unqueued_dirty = 0; + unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; + isolate_mode_t isolate_mode = 0; + int file = is_file_lru(lru); + struct zone *zone = lruvec_zone(lruvec); + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + + while (unlikely(too_many_isolated(zone, file, sc))) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } + + lru_add_drain(); + + if (!sc->may_unmap) + isolate_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + isolate_mode |= ISOLATE_CLEAN; + + spin_lock_irq(&zone->lru_lock); + + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, + &nr_scanned, sc, isolate_mode, lru); + + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + + if (global_reclaim(sc)) { + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + if (current_is_kswapd()) + __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); + else + __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); + } + spin_unlock_irq(&zone->lru_lock); + + if (nr_taken == 0) + return 0; + + nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + &nr_dirty, &nr_unqueued_dirty, &nr_congested, + &nr_writeback, &nr_immediate, + false); + + spin_lock_irq(&zone->lru_lock); + + reclaim_stat->recent_scanned[file] += nr_taken; + + if (global_reclaim(sc)) { + if (current_is_kswapd()) + __count_zone_vm_events(PGSTEAL_KSWAPD, zone, + nr_reclaimed); + else 
+ __count_zone_vm_events(PGSTEAL_DIRECT, zone, + nr_reclaimed); + } + + putback_inactive_pages(lruvec, &page_list); + + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); + + spin_unlock_irq(&zone->lru_lock); + + mem_cgroup_uncharge_list(&page_list); + free_hot_cold_page_list(&page_list, true); + + /* + * If reclaim is isolating dirty pages under writeback, it implies + * that the long-lived page allocation rate is exceeding the page + * laundering rate. Either the global limits are not being effective + * at throttling processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing device. The + * only option is to throttle from reclaim context which is not ideal + * as there is no guarantee the dirtying process is throttled in the + * same way balance_dirty_pages() manages. + * + * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number + * of pages under writeback flagged for immediate reclaim and stall if any + * are encountered in the nr_immediate check below. + */ + if (nr_writeback && nr_writeback == nr_taken) + set_bit(ZONE_WRITEBACK, &zone->flags); + + /* + * memcg will stall in page writeback so only consider forcibly + * stalling for global reclaim + */ + if (global_reclaim(sc)) { + /* + * Tag a zone as congested if all the dirty pages scanned were + * backed by a congested BDI and wait_iff_congested will stall. + */ + if (nr_dirty && nr_dirty == nr_congested) + set_bit(ZONE_CONGESTED, &zone->flags); + + /* + * If dirty pages are scanned that are not queued for IO, it + * implies that flushers are not keeping up. In this case, flag + * the zone ZONE_DIRTY and kswapd will start writing pages from + * reclaim context. 
+ */ + if (nr_unqueued_dirty == nr_taken) + set_bit(ZONE_DIRTY, &zone->flags); + + /* + * If kswapd scans pages marked for immediate + * reclaim and under writeback (nr_immediate), it implies + * that pages are cycling through the LRU faster than + * they are written so also forcibly stall. + */ + if (nr_immediate && current_may_throttle()) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Stall direct reclaim for IO completions if underlying BDIs or zone + * is congested. Allow kswapd to continue until it starts encountering + * unqueued dirty pages or cycling through the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle()) + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, + zone_idx(zone), + nr_scanned, nr_reclaimed, + sc->priority, + trace_shrink_flags(file)); + return nr_reclaimed; +} + +/* + * This moves pages from the active list to the inactive list. + * + * We move them the other way if the page is referenced by one or more + * processes, from rmap. + * + * If the pages are mostly unmapped, the processing is fast and it is + * appropriate to hold zone->lru_lock across the whole operation. But if + * the pages are mapped, the processing is slow (page_referenced()) so we + * should drop zone->lru_lock around each page. It's impossible to balance + * this, so instead we remove the pages from the LRU while processing them. + * It is safe to rely on PG_active against the non-LRU pages in here because + * nobody will play with that bit on a non-LRU page. + * + * The downside is that we have to touch page->_count against each page. + * But we had to alter page->flags anyway. 
+ */ + +static void move_active_pages_to_lru(struct lruvec *lruvec, + struct list_head *list, + struct list_head *pages_to_free, + enum lru_list lru) +{ + struct zone *zone = lruvec_zone(lruvec); + unsigned long pgmoved = 0; + struct page *page; + int nr_pages; + + while (!list_empty(list)) { + page = lru_to_page(list); + lruvec = mem_cgroup_page_lruvec(page, zone); + + VM_BUG_ON_PAGE(PageLRU(page), page); + SetPageLRU(page); + + nr_pages = hpage_nr_pages(page); + mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + list_move(&page->lru, &lruvec->lists[lru]); + pgmoved += nr_pages; + + if (put_page_testzero(page)) { + __ClearPageLRU(page); + __ClearPageActive(page); + del_page_from_lru_list(page, lruvec, lru); + + if (unlikely(PageCompound(page))) { + spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); + (*get_compound_page_dtor(page))(page); + spin_lock_irq(&zone->lru_lock); + } else + list_add(&page->lru, pages_to_free); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) + __count_vm_events(PGDEACTIVATE, pgmoved); +} + +static void shrink_active_list(unsigned long nr_to_scan, + struct lruvec *lruvec, + struct scan_control *sc, + enum lru_list lru) +{ + unsigned long nr_taken; + unsigned long nr_scanned; + unsigned long vm_flags; + LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_active); + LIST_HEAD(l_inactive); + struct page *page; + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + unsigned long nr_rotated = 0; + isolate_mode_t isolate_mode = 0; + int file = is_file_lru(lru); + struct zone *zone = lruvec_zone(lruvec); + + lru_add_drain(); + + if (!sc->may_unmap) + isolate_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + isolate_mode |= ISOLATE_CLEAN; + + spin_lock_irq(&zone->lru_lock); + + nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, + &nr_scanned, sc, isolate_mode, lru); + if (global_reclaim(sc)) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, 
nr_scanned); + + reclaim_stat->recent_scanned[file] += nr_taken; + + __count_zone_vm_events(PGREFILL, zone, nr_scanned); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + spin_unlock_irq(&zone->lru_lock); + + while (!list_empty(&l_hold)) { + cond_resched(); + page = lru_to_page(&l_hold); + list_del(&page->lru); + + if (unlikely(!page_evictable(page))) { + putback_lru_page(page); + continue; + } + + if (unlikely(buffer_heads_over_limit)) { + if (page_has_private(page) && trylock_page(page)) { + if (page_has_private(page)) + try_to_release_page(page, 0); + unlock_page(page); + } + } + + if (page_referenced(page, 0, sc->target_mem_cgroup, + &vm_flags)) { + nr_rotated += hpage_nr_pages(page); + /* + * Identify referenced, file-backed active pages and + * give them one more trip around the active list. So + * that executable code gets a better chance to stay in + * memory under moderate memory pressure. Anon pages + * are not likely to be evicted by use-once streaming + * IO, plus JVM can create lots of anon VM_EXEC pages, + * so we ignore them here. + */ + if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { + list_add(&page->lru, &l_active); + continue; + } + } + + ClearPageActive(page); /* we are de-activating */ + list_add(&page->lru, &l_inactive); + } + + /* + * Move pages back to the lru list. + */ + spin_lock_irq(&zone->lru_lock); + /* + * Count referenced pages from currently used mappings as rotated, + * even though only some of them are actually re-activated. This + * helps balance scan pressure between file and anonymous pages in + * get_scan_count. 
+ */ + reclaim_stat->recent_rotated[file] += nr_rotated; + + move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); + move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); + spin_unlock_irq(&zone->lru_lock); + + mem_cgroup_uncharge_list(&l_hold); + free_hot_cold_page_list(&l_hold, true); +} + +#ifdef CONFIG_SWAP +static int inactive_anon_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_ANON); + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + + if (inactive * zone->inactive_ratio < active) + return 1; + + return 0; +} + +/** + * inactive_anon_is_low - check if anonymous pages need to be deactivated + * @lruvec: LRU vector to check + * + * Returns true if the zone does not have enough inactive anon pages, + * meaning some active anon pages need to be deactivated. + */ +static int inactive_anon_is_low(struct lruvec *lruvec) +{ + /* + * If we don't have swap space, anonymous page deactivation + * is pointless. + */ + if (!total_swap_pages) + return 0; + + if (!mem_cgroup_disabled()) + return mem_cgroup_inactive_anon_is_low(lruvec); + + return inactive_anon_is_low_global(lruvec_zone(lruvec)); +} +#else +static inline int inactive_anon_is_low(struct lruvec *lruvec) +{ + return 0; +} +#endif + +/** + * inactive_file_is_low - check if file pages need to be deactivated + * @lruvec: LRU vector to check + * + * When the system is doing streaming IO, memory pressure here + * ensures that active file pages get deactivated, until more + * than half of the file pages are on the inactive list. + * + * Once we get to that situation, protect the system's working + * set from being evicted by disabling active file page aging. + * + * This uses a different ratio than the anonymous pages, because + * the page cache uses a use-once replacement algorithm. 
+ */ +static int inactive_file_is_low(struct lruvec *lruvec) +{ + unsigned long inactive; + unsigned long active; + + inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); + active = get_lru_size(lruvec, LRU_ACTIVE_FILE); + + return active > inactive; +} + +static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) +{ + if (is_file_lru(lru)) + return inactive_file_is_low(lruvec); + else + return inactive_anon_is_low(lruvec); +} + +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc) +{ + if (is_active_lru(lru)) { + if (inactive_list_is_low(lruvec, lru)) + shrink_active_list(nr_to_scan, lruvec, sc, lru); + return 0; + } + + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); +} + +enum scan_balance { + SCAN_EQUAL, + SCAN_FRACT, + SCAN_ANON, + SCAN_FILE, +}; + +/* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined + * by looking at the fraction of the pages scanned we did rotate back + * onto the active list instead of evicting. + * + * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan + * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan + */ +static void get_scan_count(struct lruvec *lruvec, int swappiness, + struct scan_control *sc, unsigned long *nr, + unsigned long *lru_pages) +{ + struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + u64 fraction[2]; + u64 denominator = 0; /* gcc */ + struct zone *zone = lruvec_zone(lruvec); + unsigned long anon_prio, file_prio; + enum scan_balance scan_balance; + unsigned long anon, file; + bool force_scan = false; + unsigned long ap, fp; + enum lru_list lru; + bool some_scanned; + int pass; + + /* + * If the zone or memcg is small, nr[l] can be 0. This + * results in no scanning on this priority and a potential + * priority drop. 
Global direct reclaim can go to the next + * zone and tends to have no problems. Global kswapd is for + * zone balancing and it needs to scan a minimum amount. When + * reclaiming for a memcg, a priority drop can cause high + * latencies, so it's better to scan a minimum amount there as + * well. + */ + if (current_is_kswapd()) { + if (!zone_reclaimable(zone)) + force_scan = true; + if (!mem_cgroup_lruvec_online(lruvec)) + force_scan = true; + } + if (!global_reclaim(sc)) + force_scan = true; + + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Global reclaim will swap to prevent OOM even with no + * swappiness, but memcg users want to use this knob to + * disable swapping for individual groups completely when + * using the memory controller's swap limit feature would be + * too expensive. + */ + if (!global_reclaim(sc) && !swappiness) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally + * (unless the swappiness setting disagrees with swapping). + */ + if (!sc->priority && swappiness) { + scan_balance = SCAN_EQUAL; + goto out; + } + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. 
+ */ + if (global_reclaim(sc)) { + unsigned long zonefile; + unsigned long zonefree; + + zonefree = zone_page_state(zone, NR_FREE_PAGES); + zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { + scan_balance = SCAN_ANON; + goto out; + } + } + + /* + * There is enough inactive page cache, do not reclaim + * anything from the anonymous working set right now. + */ + if (!inactive_file_is_low(lruvec)) { + scan_balance = SCAN_FILE; + goto out; + } + + scan_balance = SCAN_FRACT; + + /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = swappiness; + file_prio = 200 - anon_prio; + + /* + * OK, so we have swap space and a fair amount of page cache + * pages. We use the recently rotated / recently scanned + * ratios to determine how valuable each cache is. + * + * Because workloads change over time (and to avoid overflow) + * we keep these statistics as a floating average, which ends + * up weighing recent references more than old ones. + * + * anon in [0], file in [1] + */ + + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + + spin_lock_irq(&zone->lru_lock); + if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { + reclaim_stat->recent_scanned[0] /= 2; + reclaim_stat->recent_rotated[0] /= 2; + } + + if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { + reclaim_stat->recent_scanned[1] /= 2; + reclaim_stat->recent_rotated[1] /= 2; + } + + /* + * The amount of pressure on anon vs file pages is inversely + * proportional to the fraction of recently scanned pages on + * each list that were recently referenced and in active use. 
+ */ + ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); + ap /= reclaim_stat->recent_rotated[0] + 1; + + fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); + fp /= reclaim_stat->recent_rotated[1] + 1; + spin_unlock_irq(&zone->lru_lock); + + fraction[0] = ap; + fraction[1] = fp; + denominator = ap + fp + 1; +out: + some_scanned = false; + /* Only use force_scan on second pass. */ + for (pass = 0; !some_scanned && pass < 2; pass++) { + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; + + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; + + if (!scan && pass && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); + + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + */ + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) { + size = 0; + scan = 0; + } + break; + default: + /* Look ma, no brain */ + BUG(); + } + + *lru_pages += size; + nr[lru] = scan; + + /* + * Skip the second pass and don't force_scan, + * if we found something to scan. + */ + some_scanned |= !!scan; + } + } +} + +/* + * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
+ */ +static void shrink_lruvec(struct lruvec *lruvec, int swappiness, + struct scan_control *sc, unsigned long *lru_pages) +{ + unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; + struct blk_plug plug; + bool scan_adjusted; + + get_scan_count(lruvec, swappiness, sc, nr, lru_pages); + + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + + blk_start_plug(&plug); + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + + for_each_evictable_lru(lru) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + + nr_reclaimed += shrink_list(lru, nr_to_scan, + lruvec, sc); + } + } + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + + /* + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs are scanned + * proportionally to what was requested by get_scan_count(). 
We + * stop reclaiming one LRU and reduce the amount of scanning + * proportional to the original scan target. + */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. + */ + if (!nr_file || !nr_anon) + break; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRUs */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. 
+ */ + if (inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + + throttle_vm_writeout(sc->gfp_mask); +} + +/* Use reclaim/compaction for costly allocs or under memory pressure */ +static bool in_reclaim_compaction(struct scan_control *sc) +{ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + (sc->order > PAGE_ALLOC_COSTLY_ORDER || + sc->priority < DEF_PRIORITY - 2)) + return true; + + return false; +} + +/* + * Reclaim/compaction is used for high-order allocation requests. It reclaims + * order-0 pages before compacting the zone. should_continue_reclaim() returns + * true if more pages should be reclaimed such that when the page allocator + * calls try_to_compact_zone() it will have enough free pages to succeed. + * It will give up earlier than that if there is difficulty reclaiming pages. + */ +static inline bool should_continue_reclaim(struct zone *zone, + unsigned long nr_reclaimed, + unsigned long nr_scanned, + struct scan_control *sc) +{ + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + + /* If not in reclaim/compaction mode, stop */ + if (!in_reclaim_compaction(sc)) + return false; + + /* Consider stopping depending on scan and reclaim activity */ + if (sc->gfp_mask & __GFP_REPEAT) { + /* + * For __GFP_REPEAT allocations, stop reclaiming if the + * full LRU list has been scanned and we are still failing + * to reclaim pages. This full LRU scan is potentially + * expensive but a __GFP_REPEAT caller really wants to succeed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + } else { + /* + * For non-__GFP_REPEAT allocations which can presumably + * fail without consequence, stop if we failed to reclaim + * any pages from the last SWAP_CLUSTER_MAX number of + * pages that were scanned. 
This will return to the + * caller faster at the risk that reclaim/compaction and + * the resulting allocation attempt fail + */ + if (!nr_reclaimed) + return false; + } + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); + inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); + if (get_nr_swap_pages() > 0) + inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; + + /* If compaction would go ahead or the allocation would succeed, stop */ + switch (compaction_suitable(zone, sc->order, 0, 0)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + return true; + } +} + +static bool shrink_zone(struct zone *zone, struct scan_control *sc, + bool is_classzone) +{ + struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long nr_reclaimed, nr_scanned; + bool reclaimable = false; + + do { + struct mem_cgroup *root = sc->target_mem_cgroup; + struct mem_cgroup_reclaim_cookie reclaim = { + .zone = zone, + .priority = sc->priority, + }; + unsigned long zone_lru_pages = 0; + struct mem_cgroup *memcg; + + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + + memcg = mem_cgroup_iter(root, NULL, &reclaim); + do { + unsigned long lru_pages; + unsigned long scanned; + struct lruvec *lruvec; + int swappiness; + + if (mem_cgroup_low(root, memcg)) { + if (!sc->may_thrash) + continue; + mem_cgroup_events(memcg, MEMCG_LOW, 1); + } + + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + swappiness = mem_cgroup_swappiness(memcg); + scanned = sc->nr_scanned; + + shrink_lruvec(lruvec, swappiness, sc, &lru_pages); + zone_lru_pages += lru_pages; + + if (memcg && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), + memcg, sc->nr_scanned - scanned, + lru_pages); + + /* + * Direct reclaim 
and kswapd have to scan all memory + * cgroups to fulfill the overall scan target for the + * zone. + * + * Limit reclaim, on the other hand, only cares about + * nr_to_reclaim pages to be reclaimed and it will + * retry with decreasing priority if one round over the + * whole hierarchy is not sufficient. + */ + if (!global_reclaim(sc) && + sc->nr_reclaimed >= sc->nr_to_reclaim) { + mem_cgroup_iter_break(root, memcg); + break; + } + } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); + + /* + * Shrink the slab caches in the same proportion that + * the eligible LRU pages were scanned. + */ + if (global_reclaim(sc) && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, + sc->nr_scanned - nr_scanned, + zone_lru_pages); + + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)); + + return reclaimable; +} + +/* + * Returns true if compaction should go ahead for a high-order request, or + * the high-order allocation would succeed without compaction. + */ +static inline bool compaction_ready(struct zone *zone, int order) +{ + unsigned long balance_gap, watermark; + bool watermark_ok; + + /* + * Compaction takes time to run and there are potentially other + * callers using the pages just freed. 
Continue reclaiming until + * there is a buffer of free pages available to give compaction + * a reasonable chance of completing and allocating the page + */ + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + + /* + * If compaction is deferred, reclaim up to a point where + * compaction will have a chance of success when re-enabled + */ + if (compaction_deferred(zone, order)) + return watermark_ok; + + /* + * If compaction is not ready to start and allocation is not likely + * to succeed without it, then keep reclaiming. + */ + if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED) + return false; + + return watermark_ok; +} + +/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. + * + * We reclaim from a zone even if that zone is over high_wmark_pages(zone). + * Because: + * a) The caller may be trying to free *extra* pages to satisfy a higher-order + * allocation or + * b) The target zone may be at high_wmark_pages(zone) but the lower zones + * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' + * zone defense algorithm. + * + * If a zone is deemed to be full of pinned pages then just give it a light + * scan then give up on it. + * + * Returns true if a zone was reclaimable. 
+ */ +static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +{ + struct zoneref *z; + struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; + gfp_t orig_mask; + enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + bool reclaimable = false; + + /* + * If the number of buffer_heads in the machine exceeds the maximum + * allowed level, force direct reclaim to scan the highmem zone as + * highmem pages could be pinning lowmem pages storing buffer_heads + */ + orig_mask = sc->gfp_mask; + if (buffer_heads_over_limit) + sc->gfp_mask |= __GFP_HIGHMEM; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + requested_highidx, sc->nodemask) { + enum zone_type classzone_idx; + + if (!populated_zone(zone)) + continue; + + classzone_idx = requested_highidx; + while (!populated_zone(zone->zone_pgdat->node_zones + + classzone_idx)) + classzone_idx--; + + /* + * Take care that memory controller reclaim has a small influence + * on the global LRU. + */ + if (global_reclaim(sc)) { + if (!cpuset_zone_allowed(zone, + GFP_KERNEL | __GFP_HARDWALL)) + continue; + + if (sc->priority != DEF_PRIORITY && + !zone_reclaimable(zone)) + continue; /* Let kswapd poll it */ + + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER && + zonelist_zone_idx(z) <= requested_highidx && + compaction_ready(zone, sc->order)) { + sc->compaction_ready = true; + continue; + } + + /* + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and + * scanned pages. This works for global memory pressure + * and balancing, not for a memcg's limit. 
+ */ + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; + if (nr_soft_reclaimed) + reclaimable = true; + /* need some check to avoid more shrink_zone() */ + } + + if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) + reclaimable = true; + + if (global_reclaim(sc) && + !reclaimable && zone_reclaimable(zone)) + reclaimable = true; + } + + /* + * Restore to original mask to avoid the impact on the caller if we + * promoted it to __GFP_HIGHMEM. + */ + sc->gfp_mask = orig_mask; + + return reclaimable; +} + +/* + * This is the main entry point to direct page reclaim. + * + * If a full scan of the inactive list fails to free enough memory then we + * are "out of memory" and something needs to be killed. + * + * If the caller is !__GFP_FS then the probability of a failure is reasonably + * high - the zone may be full of dirty or under-writeback pages, which this + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. 
+ * + * returns: 0, if no pages reclaimed + * else, the number of pages reclaimed + */ +static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + struct scan_control *sc) +{ + int initial_priority = sc->priority; + unsigned long total_scanned = 0; + unsigned long writeback_threshold; + bool zones_reclaimable; +retry: + delayacct_freepages_start(); + + if (global_reclaim(sc)) + count_vm_event(ALLOCSTALL); + + do { + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); + sc->nr_scanned = 0; + zones_reclaimable = shrink_zones(zonelist, sc); + + total_scanned += sc->nr_scanned; + if (sc->nr_reclaimed >= sc->nr_to_reclaim) + break; + + if (sc->compaction_ready) + break; + + /* + * If we're having trouble reclaiming, start doing + * writepage even in laptop mode. + */ + if (sc->priority < DEF_PRIORITY - 2) + sc->may_writepage = 1; + + /* + * Try to write back as many pages as we just scanned. This + * tends to cause slow streaming writers to write data to the + * disk smoothly, at the dirtying rate, which is nice. But + * that's undesirable in laptop mode, where we *want* lumpy + * writeout. So in laptop mode, write out the whole world. + */ + writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; + if (total_scanned > writeback_threshold) { + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, + WB_REASON_TRY_TO_FREE_PAGES); + sc->may_writepage = 1; + } + } while (--sc->priority >= 0); + + delayacct_freepages_end(); + + if (sc->nr_reclaimed) + return sc->nr_reclaimed; + + /* Aborted reclaim to try compaction? don't OOM, then */ + if (sc->compaction_ready) + return 1; + + /* Untapped cgroup reserves? Don't OOM, retry. */ + if (!sc->may_thrash) { + sc->priority = initial_priority; + sc->may_thrash = 1; + goto retry; + } + + /* Any of the zones still reclaimable? Don't OOM. 
*/ + if (zones_reclaimable) + return 1; + + return 0; +} + +static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +{ + struct zone *zone; + unsigned long pfmemalloc_reserve = 0; + unsigned long free_pages = 0; + int i; + bool wmark_ok; + + for (i = 0; i <= ZONE_NORMAL; i++) { + zone = &pgdat->node_zones[i]; + if (!populated_zone(zone)) + continue; + + pfmemalloc_reserve += min_wmark_pages(zone); + free_pages += zone_page_state(zone, NR_FREE_PAGES); + } + + /* If there are no reserves (unexpected config) then do not throttle */ + if (!pfmemalloc_reserve) + return true; + + wmark_ok = free_pages > pfmemalloc_reserve / 2; + + /* kswapd must be awake if processes are being throttled */ + if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { + pgdat->classzone_idx = min(pgdat->classzone_idx, + (enum zone_type)ZONE_NORMAL); + wake_up_interruptible(&pgdat->kswapd_wait); + } + + return wmark_ok; +} + +/* + * Throttle direct reclaimers if backing storage is backed by the network + * and the PFMEMALLOC reserve for the preferred node is getting dangerously + * depleted. kswapd will continue to make progress and wake the processes + * when the low watermark is reached. + * + * Returns true if a fatal signal was delivered during throttling. If this + * happens, the page allocator should not consider triggering the OOM killer. + */ +static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, + nodemask_t *nodemask) +{ + struct zoneref *z; + struct zone *zone; + pg_data_t *pgdat = NULL; + + /* + * Kernel threads should not be throttled as they may be indirectly + * responsible for cleaning pages necessary for reclaim to make forward + * progress. kjournald for example may enter direct reclaim while + * committing a transaction where throttling it could force other + * processes to block on log_wait_commit(). + */ + if (current->flags & PF_KTHREAD) + goto out; + + /* + * If a fatal signal is pending, this process should not throttle. 
+ * It should return quickly so it can exit and free its memory + */ + if (fatal_signal_pending(current)) + goto out; + + /* + * Check if the pfmemalloc reserves are ok by finding the first node + * with a usable ZONE_NORMAL or lower zone. The expectation is that + * GFP_KERNEL will be required for allocating network buffers when + * swapping over the network so ZONE_HIGHMEM is unusable. + * + * Throttling is based on the first usable node and throttled processes + * wait on a queue until kswapd makes progress and wakes them. There + * is an affinity then between processes waking up and where reclaim + * progress has been made assuming the process wakes on the same node. + * More importantly, processes running on remote nodes will not compete + * for remote pfmemalloc reserves and processes on different nodes + * should make reasonable progress. + */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(gfp_mask), nodemask) { + if (zone_idx(zone) > ZONE_NORMAL) + continue; + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; + if (pfmemalloc_watermark_ok(pgdat)) + goto out; + break; + } + + /* If no zone was usable by the allocation flags then do not throttle */ + if (!pgdat) + goto out; + + /* Account for the throttling */ + count_vm_event(PGSCAN_DIRECT_THROTTLE); + + /* + * If the caller cannot enter the filesystem, it's possible that it + * is due to the caller holding an FS lock or performing a journal + * transaction in the case of a filesystem like ext[3|4]. In this case, + * it is not safe to block on pfmemalloc_wait as kswapd could be + * blocked waiting on the same lock. Instead, throttle for up to a + * second before continuing. 
+ */ + if (!(gfp_mask & __GFP_FS)) { + wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, + pfmemalloc_watermark_ok(pgdat), HZ); + + goto check_pending; + } + + /* Throttle until kswapd wakes the process */ + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, + pfmemalloc_watermark_ok(pgdat)); + +check_pending: + if (fatal_signal_pending(current)) + return true; + +out: + return false; +} + +unsigned long try_to_free_pages(struct zonelist *zonelist, int order, + gfp_t gfp_mask, nodemask_t *nodemask) +{ + unsigned long nr_reclaimed; + struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, + .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .order = order, + .nodemask = nodemask, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = 1, + }; + + /* + * Do not enter reclaim if fatal signal was delivered while throttled. + * 1 is returned so that the page allocator does not OOM kill at this + * point. + */ + if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) + return 1; + + trace_mm_vmscan_direct_reclaim_begin(order, + sc.may_writepage, + gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + + return nr_reclaimed; +} + +#ifdef CONFIG_MEMCG + +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, + gfp_t gfp_mask, bool noswap, + struct zone *zone, + unsigned long *nr_scanned) +{ + struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, + .target_mem_cgroup = memcg, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, + }; + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + int swappiness = mem_cgroup_swappiness(memcg); + unsigned long lru_pages; + + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + + trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, + sc.may_writepage, + sc.gfp_mask); + + /* + * NOTE: Although we can get the 
priority field, using it + * here is not a good idea, since it limits the pages we can scan. + * if we don't reclaim here, the shrink_zone from balance_pgdat + * will pick up pages from other mem cgroup's as well. We hack + * the priority and make it zero. + */ + shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); + + trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + + *nr_scanned = sc.nr_scanned; + return sc.nr_reclaimed; +} + +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, + gfp_t gfp_mask, + bool may_swap) +{ + struct zonelist *zonelist; + unsigned long nr_reclaimed; + int nid; + struct scan_control sc = { + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .target_mem_cgroup = memcg, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = may_swap, + }; + + /* + * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't + * take care of from where we get pages. So the node where we start the + * scan does not need to be the current node. 
+ */ + nid = mem_cgroup_select_victim_node(memcg); + + zonelist = NODE_DATA(nid)->node_zonelists; + + trace_mm_vmscan_memcg_reclaim_begin(0, + sc.may_writepage, + sc.gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + + return nr_reclaimed; +} +#endif + +static void age_active_anon(struct zone *zone, struct scan_control *sc) +{ + struct mem_cgroup *memcg; + + if (!total_swap_pages) + return; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + + if (inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + + memcg = mem_cgroup_iter(NULL, memcg, NULL); + } while (memcg); +} + +static bool zone_balanced(struct zone *zone, int order, + unsigned long balance_gap, int classzone_idx) +{ + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + + balance_gap, classzone_idx, 0)) + return false; + + if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, + order, 0, classzone_idx) == COMPACT_SKIPPED) + return false; + + return true; +} + +/* + * pgdat_balanced() is used when checking if a node is balanced. + * + * For order-0, all zones must be balanced! + * + * For high-order allocations only zones that meet watermarks and are in a + * zone allowed by the callers classzone_idx are added to balanced_pages. The + * total of balanced pages must be at least 25% of the zones allowed by + * classzone_idx for the node to be considered balanced. Forcing all zones to + * be balanced for high orders can cause excessive reclaim when there are + * imbalanced zones. + * The choice of 25% is due to + * o a 16M DMA zone that is balanced will not balance a zone on any + * reasonable sized machine + * o On all other machines, the top zone must be at least a reasonable + * percentage of the middle zones. 
For example, on 32-bit x86, highmem
+ *     would need to be at least 256M for it to be balance a whole node.
+ *     Similarly, on x86-64 the Normal zone would need to be at least 1G
+ *     to balance a node on its own. These seemed like reasonable ratios.
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
+{
+	unsigned long managed_pages = 0;
+	unsigned long balanced_pages = 0;
+	int i;
+
+	/* Check the watermark levels */
+	for (i = 0; i <= classzone_idx; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		if (!populated_zone(zone))
+			continue;
+
+		managed_pages += zone->managed_pages;
+
+		/*
+		 * A special case here:
+		 *
+		 * balance_pgdat() skips over all_unreclaimable after
+		 * DEF_PRIORITY. Effectively, it considers them balanced so
+		 * they must be considered balanced here as well!
+		 */
+		if (!zone_reclaimable(zone)) {
+			balanced_pages += zone->managed_pages;
+			continue;
+		}
+
+		if (zone_balanced(zone, order, 0, i))
+			balanced_pages += zone->managed_pages;
+		else if (!order)	/* order-0: one unbalanced zone fails the node */
+			return false;
+	}
+
+	if (order)
+		return balanced_pages >= (managed_pages >> 2);	/* >= 25% of managed pages */
+	else
+		return true;
+}
+
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * A non-zero @remaining makes this return false right away, before any
+ * waiter is woken.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
+					int classzone_idx)
+{
+	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
+	if (remaining)
+		return false;
+
+	/*
+	 * The throttled processes are normally woken up in balance_pgdat() as
+	 * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+	 * race between when kswapd checks the watermarks and a process gets
+	 * throttled. There is also a potential race if processes get
+	 * throttled, kswapd wakes, a large process exits thereby balancing the
+	 * zones, which causes kswapd to exit balance_pgdat() before reaching
+	 * the wake up checks. If kswapd is going to sleep, no process should
+	 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
+	 * the wake up is premature, processes will wake kswapd and get
+	 * throttled again. The difference from wake ups in balance_pgdat() is
+	 * that here we are under prepare_to_wait().
+	 */
+	if (waitqueue_active(&pgdat->pfmemalloc_wait))
+		wake_up_all(&pgdat->pfmemalloc_wait);
+
+	return pgdat_balanced(pgdat, order, classzone_idx);
+}
+
+/*
+ * kswapd shrinks the zone by the number of pages required to reach
+ * the high watermark.
+ *
+ * Returns true if kswapd scanned at least the requested number of pages to
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
+ *
+ * NOTE(review): in the code below true is returned either early, when the
+ * zone needs no reclaim, or when sc->nr_scanned >= sc->nr_to_reclaim after
+ * shrink_zone(); no writeback-specific test is visible in this function.
+ *
+ * sc->nr_to_reclaim is overwritten here and then added to *@nr_attempted.
+ */
+static bool kswapd_shrink_zone(struct zone *zone,
+			       int classzone_idx,
+			       struct scan_control *sc,
+			       unsigned long *nr_attempted)
+{
+	int testorder = sc->order;
+	unsigned long balance_gap;
+	bool lowmem_pressure;
+
+	/* Reclaim above the high watermark. */
+	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+
+	/*
+	 * Kswapd reclaims only single pages with compaction enabled. Trying
+	 * too hard to reclaim until contiguous free pages have become
+	 * available can hurt performance by evicting too much useful data
+	 * from memory. Do not reclaim more than needed for compaction.
+	 */
+	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+			compaction_suitable(zone, sc->order, 0, classzone_idx)
+							!= COMPACT_SKIPPED)
+		testorder = 0;
+
+	/*
+	 * We put equal pressure on every zone, unless one zone has way too
+	 * many pages free already. The "too many pages" is defined as the
+	 * high wmark plus a "gap" where the gap is either the low
+	 * watermark or 1% of the zone, whichever is smaller.
+	 */
+	balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
+			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
+
+	/*
+	 * If there is no low memory pressure and the zone is balanced then no
+	 * reclaim is necessary
+	 */
+	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
+	if (!lowmem_pressure && zone_balanced(zone, testorder,
+						balance_gap, classzone_idx))
+		return true;
+
+	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+
+	/* Account for the number of pages attempted to reclaim */
+	*nr_attempted += sc->nr_to_reclaim;
+
+	clear_bit(ZONE_WRITEBACK, &zone->flags);
+
+	/*
+	 * If a zone reaches its high watermark, consider it to be no longer
+	 * congested. It's possible there are dirty pages backed by congested
+	 * BDIs but as pressure is relieved, speculatively avoid congestion
+	 * waits.
+	 */
+	if (zone_reclaimable(zone) &&
+	    zone_balanced(zone, testorder, 0, classzone_idx)) {
+		clear_bit(ZONE_CONGESTED, &zone->flags);
+		clear_bit(ZONE_DIRTY, &zone->flags);
+	}
+
+	return sc->nr_scanned >= sc->nr_to_reclaim;
+}
+
+/*
+ * For kswapd, balance_pgdat() will work across all this node's zones until
+ * they are all at high_wmark_pages(zone).
+ *
+ * Returns the final order kswapd was reclaiming at
+ *
+ * There is special handling here for zones which are full of pinned pages.
+ * This can happen if the pages are all mlocked, or if they are all used by
+ * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
+ * What we do is to detect the case where all pages in the zone have been
+ * scanned twice and there has been zero successful reclaim.  Mark the zone as
+ * dead and from now on, only perform a short scan.  Basically we're polling
+ * the zone for when the problem goes away.
+ *
+ * kswapd scans the zones in the highmem->normal->dma direction.
It skips
+ * zones which have free_pages > high_wmark_pages(zone), but once a zone is
+ * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
+ * lower zones regardless of the number of free pages in the lower zones. This
+ * interoperates with the page allocator fallback scheme to ensure that aging
+ * of pages is balanced across the zones.
+ */
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
+							int *classzone_idx)
+{
+	int i;
+	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
+	unsigned long nr_soft_reclaimed;
+	unsigned long nr_soft_scanned;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.order = order,
+		.priority = DEF_PRIORITY,
+		.may_writepage = !laptop_mode,
+		.may_unmap = 1,
+		.may_swap = 1,
+	};
+	count_vm_event(PAGEOUTRUN);
+
+	do {
+		unsigned long nr_attempted = 0;
+		bool raise_priority = true;
+		bool pgdat_needs_compaction = (order > 0);
+
+		sc.nr_reclaimed = 0;
+
+		/*
+		 * Scan in the highmem->dma direction for the highest
+		 * zone which needs scanning
+		 */
+		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+			struct zone *zone = pgdat->node_zones + i;
+
+			if (!populated_zone(zone))
+				continue;
+
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
+				continue;
+
+			/*
+			 * Do some background aging of the anon list, to give
+			 * pages a chance to be referenced before reclaiming.
+			 */
+			age_active_anon(zone, &sc);
+
+			/*
+			 * If the number of buffer_heads in the machine
+			 * exceeds the maximum allowed level and this node
+			 * has a highmem zone, force kswapd to reclaim from
+			 * it to relieve lowmem pressure.
+			 */
+			if (buffer_heads_over_limit && is_highmem_idx(i)) {
+				end_zone = i;
+				break;
+			}
+
+			if (!zone_balanced(zone, order, 0, 0)) {
+				end_zone = i;
+				break;
+			} else {
+				/*
+				 * If balanced, clear the dirty and congested
+				 * flags
+				 */
+				clear_bit(ZONE_CONGESTED, &zone->flags);
+				clear_bit(ZONE_DIRTY, &zone->flags);
+			}
+		}
+
+		if (i < 0)	/* loop ran to completion: no zone needs scanning */
+			goto out;
+
+		for (i = 0; i <= end_zone; i++) {
+			struct zone *zone = pgdat->node_zones + i;
+
+			if (!populated_zone(zone))
+				continue;
+
+			/*
+			 * If any zone is currently balanced then kswapd will
+			 * not call compaction as it is expected that the
+			 * necessary pages are already available.
+			 */
+			if (pgdat_needs_compaction &&
+					zone_watermark_ok(zone, order,
+						low_wmark_pages(zone),
+						*classzone_idx, 0))
+				pgdat_needs_compaction = false;
+		}
+
+		/*
+		 * If we're getting trouble reclaiming, start doing writepage
+		 * even in laptop mode.
+		 */
+		if (sc.priority < DEF_PRIORITY - 2)
+			sc.may_writepage = 1;
+
+		/*
+		 * Now scan the zone in the dma->highmem direction, stopping
+		 * at the last zone which needs scanning.
+		 *
+		 * We do this because the page allocator works in the opposite
+		 * direction.  This prevents the page allocator from allocating
+		 * pages behind kswapd's direction of progress, which would
+		 * cause too much scanning of the lower zones.
+		 */
+		for (i = 0; i <= end_zone; i++) {
+			struct zone *zone = pgdat->node_zones + i;
+
+			if (!populated_zone(zone))
+				continue;
+
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
+				continue;
+
+			sc.nr_scanned = 0;
+
+			nr_soft_scanned = 0;
+			/*
+			 * Call soft limit reclaim before calling shrink_zone.
+			 */
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+							order, sc.gfp_mask,
+							&nr_soft_scanned);
+			sc.nr_reclaimed += nr_soft_reclaimed;
+
+			/*
+			 * There should be no need to raise the scanning
+			 * priority if enough pages are already being scanned
+			 * that the high watermark would be met at 100%
+			 * efficiency.
+			 */
+			if (kswapd_shrink_zone(zone, end_zone,
+					       &sc, &nr_attempted))
+				raise_priority = false;
+		}
+
+		/*
+		 * If the low watermark is met there is no need for processes
+		 * to be throttled on pfmemalloc_wait as they should now be
+		 * able to safely make forward progress. Wake them
+		 */
+		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+				pfmemalloc_watermark_ok(pgdat))
+			wake_up_all(&pgdat->pfmemalloc_wait);
+
+		/*
+		 * Fragmentation may mean that the system cannot be rebalanced
+		 * for high-order allocations in all zones. If twice the
+		 * allocation size has been reclaimed and the zones are still
+		 * not balanced then recheck the watermarks at order-0 to
+		 * prevent kswapd reclaiming excessively. Assume that a
+		 * process that requested a high-order allocation can direct
+		 * reclaim/compact.
+		 */
+		if (order && sc.nr_reclaimed >= 2UL << order)
+			order = sc.order = 0;
+
+		/* Check if kswapd should be suspending */
+		if (try_to_freeze() || kthread_should_stop())
+			break;
+
+		/*
+		 * Compact if necessary and kswapd is reclaiming at least the
+		 * high watermark number of pages as requested
+		 */
+		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
+			compact_pgdat(pgdat, order);
+
+		/*
+		 * Raise priority if scanning rate is too low or there was no
+		 * progress in reclaiming pages
+		 */
+		if (raise_priority || !sc.nr_reclaimed)
+			sc.priority--;
+	} while (sc.priority >= 1 &&
+		 !pgdat_balanced(pgdat, order, *classzone_idx));
+
+out:
+	/*
+	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
+	 * makes a decision on the order we were last reclaiming at. However,
+	 * if another caller entered the allocator slow path while kswapd
+	 * was awake, order will remain at the higher level
+	 *
+	 * *classzone_idx is an in/out parameter: it is overwritten with
+	 * end_zone, the highest zone index that was scanned.
+	 */
+	*classzone_idx = end_zone;
+	return order;
+}
+/*
+ * Put kswapd to sleep on pgdat->kswapd_wait if the node is balanced.
+ *
+ * Returns immediately if the task is freezing or should stop.  Otherwise a
+ * first, short sleep of HZ/10 is attempted; only if that sleep ran its full
+ * length and prepare_kswapd_sleep() still agrees does kswapd call
+ * schedule() without a timeout.  If the second check fails, one of the
+ * KSWAPD_*_WMARK_HIT_QUICKLY events is counted instead.
+ */
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+{
+	long remaining = 0;
+	DEFINE_WAIT(wait);
+
+	if (freezing(current) || kthread_should_stop())
+		return;
+
+	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+
+	/* Try to sleep for a short interval */
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+		remaining = schedule_timeout(HZ/10);
+		finish_wait(&pgdat->kswapd_wait, &wait);
+		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+	}
+
+	/*
+	 * After a short sleep, check if it was a premature sleep. If not, then
+	 * go fully to sleep until explicitly woken up.
+	 */
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+
+		/*
+		 * vmstat counters are not perfectly accurate and the estimated
+		 * value for counters such as NR_FREE_PAGES can deviate from the
+		 * true value by nr_online_cpus * threshold. To avoid the zone
+		 * watermarks being breached while under pressure, we reduce the
+		 * per-cpu vmstat threshold while kswapd is awake and restore
+		 * them before going back to sleep.
+		 */
+		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+
+		/*
+		 * Compaction records what page blocks it recently failed to
+		 * isolate pages from and skips them in the future scanning.
+		 * When kswapd is going to sleep, it is reasonable to assume
+		 * that pages and compaction may succeed so reset the cache.
+		 */
+		reset_isolation_suitable(pgdat);
+
+		if (!kthread_should_stop())
+			schedule();
+
+		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
+	} else {
+		if (remaining)
+			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+		else
+			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
+	}
+	finish_wait(&pgdat->kswapd_wait, &wait);
+}
+
+/*
+ * The background pageout daemon, started as a kernel thread
+ * from the init process.
+ *
+ * This basically trickles out pages so that we have _some_
+ * free memory available even if there is no other activity
+ * that frees anything up. This is needed for things like routing
+ * etc, where we otherwise might have all activity going on in
+ * asynchronous contexts that cannot page things out.
+ *
+ * If there are applications that are active memory-allocators
+ * (most normal use), this basically shouldn't matter.
+ *
+ * @p is the pg_data_t of the node this thread serves.  The loop only ends
+ * when kthread_should_stop() is true; the function then returns 0.
+ */
+static int kswapd(void *p)
+{
+	unsigned long order, new_order;
+	unsigned balanced_order;
+	int classzone_idx, new_classzone_idx;
+	int balanced_classzone_idx;
+	pg_data_t *pgdat = (pg_data_t*)p;
+	struct task_struct *tsk = current;
+
+	struct reclaim_state reclaim_state = {
+		.reclaimed_slab = 0,
+	};
+	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+	lockdep_set_current_reclaim_state(GFP_KERNEL);
+
+	if (!cpumask_empty(cpumask))
+		set_cpus_allowed_ptr(tsk, cpumask);
+	current->reclaim_state = &reclaim_state;
+
+	/*
+	 * Tell the memory management that we're a "memory allocator",
+	 * and that if we need more memory we should get access to it
+	 * regardless (see "__alloc_pages()"). "kswapd" should
+	 * never get caught in the normal page freeing logic.
+	 *
+	 * (Kswapd normally doesn't need memory anyway, but sometimes
+	 * you need a small amount of memory in order to be able to
+	 * page out something else, and this flag essentially protects
+	 * us from recursively trying to free more memory as we're
+	 * trying to free the first piece of memory in the first place).
+	 */
+	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+	set_freezable();
+
+	order = new_order = 0;
+	balanced_order = 0;
+	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+	balanced_classzone_idx = classzone_idx;
+	for ( ; ; ) {
+		bool ret;
+
+		/*
+		 * If the last balance_pgdat was unsuccessful it's unlikely a
+		 * new request of a similar or harder type will succeed soon
+		 * so consider going to sleep on the basis we reclaimed at
+		 */
+		if (balanced_classzone_idx >= new_classzone_idx &&
+					balanced_order == new_order) {
+			new_order = pgdat->kswapd_max_order;
+			new_classzone_idx = pgdat->classzone_idx;
+			pgdat->kswapd_max_order =  0;
+			pgdat->classzone_idx = pgdat->nr_zones - 1;
+		}
+
+		if (order < new_order || classzone_idx > new_classzone_idx) {
+			/*
+			 * Don't sleep if someone wants a larger 'order'
+			 * allocation or has tighter zone constraints
+			 */
+			order = new_order;
+			classzone_idx = new_classzone_idx;
+		} else {
+			kswapd_try_to_sleep(pgdat, balanced_order,
+						balanced_classzone_idx);
+			order = pgdat->kswapd_max_order;
+			classzone_idx = pgdat->classzone_idx;
+			new_order = order;
+			new_classzone_idx = classzone_idx;
+			pgdat->kswapd_max_order = 0;
+			pgdat->classzone_idx = pgdat->nr_zones - 1;
+		}
+
+		ret = try_to_freeze();
+		if (kthread_should_stop())
+			break;
+
+		/*
+		 * We can speed up thawing tasks if we don't call balance_pgdat
+		 * after returning from the refrigerator
+		 */
+		if (!ret) {
+			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
+			balanced_classzone_idx = classzone_idx;
+			balanced_order = balance_pgdat(pgdat, order,
+						&balanced_classzone_idx);
+		}
+	}
+
+	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
+	current->reclaim_state = NULL;
+	lockdep_clear_current_reclaim_state();
+
+	return 0;
+}
+
+/*
+ * A zone is low on free memory, so wake its kswapd task to service it.
+ */
+void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
+{
+	pg_data_t *pgdat;
+
+	if (!populated_zone(zone))
+		return;
+
+	if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
+		return;
+	pgdat = zone->zone_pgdat;
+	if (pgdat->kswapd_max_order < order) {	/* only a larger order updates the pending request */
+		pgdat->kswapd_max_order = order;
+		pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
+	}
+	if (!waitqueue_active(&pgdat->kswapd_wait))	/* nobody waiting on kswapd_wait: nothing to wake */
+		return;
+	if (zone_balanced(zone, order, 0, 0))	/* zone_balanced() already holds for this order: no wakeup */
+		return;
+
+	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+	wake_up_interruptible(&pgdat->kswapd_wait);
+}
+
+#ifdef CONFIG_HIBERNATION
+/*
+ * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
+ *
+ * The reclaim runs through do_try_to_free_pages() on the zonelist of the
+ * current NUMA node with sc.hibernation_mode set.  PF_MEMALLOC and a local
+ * reclaim_state are installed on the current task for the duration of the
+ * call and removed again before returning.
+ */
+unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
+{
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.nr_to_reclaim = nr_to_reclaim,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE,
+		.priority = DEF_PRIORITY,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+		.hibernation_mode = 1,
+	};
+	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+	struct task_struct *p = current;
+	unsigned long nr_reclaimed;
+
+	p->flags |= PF_MEMALLOC;
+	lockdep_set_current_reclaim_state(sc.gfp_mask);
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+	p->reclaim_state = NULL;
+	lockdep_clear_current_reclaim_state();
+	p->flags &= ~PF_MEMALLOC;
+
+	return nr_reclaimed;
+}
+#endif /* CONFIG_HIBERNATION */
+
+/* It's optimal to keep kswapds on the same CPUs as their memory, but
+   not required for correctness.  So if the last cpu in a node goes
+   away, we get changed to run anywhere: as the first one comes back,
+   restore their cpu bindings.
*/
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+			void *hcpu)
+{
+	int nid;
+
+	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+		for_each_node_state(nid, N_MEMORY) {
+			pg_data_t *pgdat = NODE_DATA(nid);
+			const struct cpumask *mask;
+
+			mask = cpumask_of_node(pgdat->node_id);
+
+			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+				/* One of our CPUs online: restore mask */
+				set_cpus_allowed_ptr(pgdat->kswapd, mask);
+		}
+	}
+	return NOTIFY_OK;	/* returned for every action, handled or not */
+}
+
+/*
+ * This kswapd start function will be called by init and node-hot-add.
+ * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
+ *
+ * Returns 0 if a kswapd thread already exists for @nid or was started
+ * successfully.  If kthread_run() fails, pgdat->kswapd is reset to NULL and
+ * the PTR_ERR() value is returned; during SYSTEM_BOOTING such a failure
+ * hits the BUG_ON() instead.
+ */
+int kswapd_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	int ret = 0;
+
+	if (pgdat->kswapd)
+		return 0;
+
+	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+	if (IS_ERR(pgdat->kswapd)) {
+		/* failure at boot is fatal */
+		BUG_ON(system_state == SYSTEM_BOOTING);
+		pr_err("Failed to start kswapd on node %d\n", nid);
+		ret = PTR_ERR(pgdat->kswapd);
+		pgdat->kswapd = NULL;
+	}
+	return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined.  Caller must
+ * hold mem_hotplug_begin/end().
+ *
+ * Does nothing if no kswapd thread is recorded for @nid.
+ */
+void kswapd_stop(int nid)
+{
+	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+
+	if (kswapd) {
+		kthread_stop(kswapd);
+		NODE_DATA(nid)->kswapd = NULL;
+	}
+}
+/*
+ * Init-time setup: calls swap_setup(), starts one kswapd per node in the
+ * N_MEMORY state and registers cpu_callback() as a hotcpu notifier.  The
+ * return values of kswapd_run() are not checked here.  Always returns 0.
+ */
+static int __init kswapd_init(void)
+{
+	int nid;
+
+	swap_setup();
+	for_each_node_state(nid, N_MEMORY)
+ 		kswapd_run(nid);
+	hotcpu_notifier(cpu_callback, 0);
+	return 0;
+}
+
+module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ */
+int zone_reclaim_mode __read_mostly;
+
+#define RECLAIM_OFF 0
+#define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
+#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
+
+/*
+ * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * of a node considered for each zone_reclaim. 4 scans 1/16th of
+ * a zone.
+ */
+#define ZONE_RECLAIM_PRIORITY 4
+
+/*
+ * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * occur.
+ */
+int sysctl_min_unmapped_ratio = 1;
+
+/*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+/*
+ * Number of file LRU pages (NR_INACTIVE_FILE + NR_ACTIVE_FILE) of @zone in
+ * excess of NR_FILE_MAPPED, clamped at 0.
+ */
+static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+{
+	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
+	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
+		zone_page_state(zone, NR_ACTIVE_FILE);
+
+	/*
+	 * It's possible for there to be more file mapped pages than
+	 * accounted for by the pages on the file LRU lists because
+	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
+	 */
+	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
+}
+
+/* Work out how many page cache pages we can reclaim in this reclaim_mode */
+static long zone_pagecache_reclaimable(struct zone *zone)
+{
+	long nr_pagecache_reclaimable;
+	long delta = 0;
+
+	/*
+	 * If RECLAIM_SWAP is set, then all file pages are considered
+	 * potentially reclaimable. Otherwise, we have to worry about
+	 * pages like swapcache and zone_unmapped_file_pages() provides
+	 * a better estimate
+	 */
+	if (zone_reclaim_mode & RECLAIM_SWAP)
+		nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
+	else
+		nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
+
+	/* If we can't clean pages, remove dirty pages from consideration */
+	if (!(zone_reclaim_mode & RECLAIM_WRITE))
+		delta += zone_page_state(zone, NR_FILE_DIRTY);
+
+	/* Watch for any possible underflows due to delta */
+	if (unlikely(delta > nr_pagecache_reclaimable))
+		delta = nr_pagecache_reclaimable;
+
+	return nr_pagecache_reclaimable - delta;
+}
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ *
+ * shrink_zone() is only called when zone_pagecache_reclaimable() exceeds
+ * zone->min_unmapped_pages.  Returns 1 if at least 1 << @order pages were
+ * reclaimed, 0 otherwise.
+ */
+static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	/* Minimum pages needed in order to stay on node */
+	const unsigned long nr_pages = 1 << order;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+		.order = order,
+		.priority = ZONE_RECLAIM_PRIORITY,
+		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+		.may_swap = 1,
+	};
+
+	cond_resched();
+	/*
+	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
+	 * and we also need to be able to write out pages for RECLAIM_WRITE
+	 * and RECLAIM_SWAP.
+	 */
+	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+	lockdep_set_current_reclaim_state(gfp_mask);
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+		/*
+		 * Free memory by calling shrink zone with increasing
+		 * priorities until we have enough memory freed.
+		 * (Numerically sc.priority counts down from
+		 * ZONE_RECLAIM_PRIORITY to 0.)
+		 */
+		do {
+			shrink_zone(zone, &sc, true);
+		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
+	}
+
+	p->reclaim_state = NULL;
+	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+	lockdep_clear_current_reclaim_state();
+	return sc.nr_reclaimed >= nr_pages;
+}
+/*
+ * Gatekeeper in front of __zone_reclaim().
+ *
+ * Returns ZONE_RECLAIM_FULL when the zone is at or below both the
+ * min_unmapped_pages and min_slab_pages limits or is not reclaimable, and
+ * ZONE_RECLAIM_NOSCAN when the caller must not be delayed, the zone belongs
+ * to another node that has CPUs, or ZONE_RECLAIM_LOCKED is already set.
+ * Otherwise returns the result of __zone_reclaim(), run with
+ * ZONE_RECLAIM_LOCKED held; a zero result is counted as
+ * PGSCAN_ZONE_RECLAIM_FAILED.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	int node_id;
+	int ret;
+
+	/*
+	 * Zone reclaim reclaims unmapped file backed pages and
+	 * slab pages if we are over the defined limits.
+	 *
+	 * A small portion of unmapped file backed pages is needed for
+	 * file I/O otherwise pages read by file I/O will be immediately
+	 * thrown out if the zone is overallocated. So we do not reclaim
+	 * if less than a specified percentage of the zone is used by
+	 * unmapped file backed pages.
+	 */
+	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
+	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
+		return ZONE_RECLAIM_FULL;
+
+	if (!zone_reclaimable(zone))
+		return ZONE_RECLAIM_FULL;
+
+	/*
+	 * Do not scan if the allocation should not be delayed.
+	 */
+	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
+		return ZONE_RECLAIM_NOSCAN;
+
+	/*
+	 * Only run zone reclaim on the local zone or on zones that do not
+	 * have associated processors. This will favor the local processor
+	 * over remote processors and spread off node memory allocations
+	 * as wide as possible.
+	 */
+	node_id = zone_to_nid(zone);
+	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
+		return ZONE_RECLAIM_NOSCAN;
+
+	if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
+		return ZONE_RECLAIM_NOSCAN;
+
+	ret = __zone_reclaim(zone, gfp_mask, order);
+	clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+
+	if (!ret)
+		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
+
+	return ret;
+}
+#endif
+
+/*
+ * page_evictable - test whether a page is evictable
+ * @page: the page to test
+ *
+ * Test whether page is evictable--i.e., should be placed on active/inactive
+ * lists vs unevictable list.
+ *
+ * Reasons page might not be evictable:
+ * (1) page's mapping marked unevictable
+ * (2) page is part of an mlocked VMA
+ *
+ * Returns non-zero only if neither of the two reasons applies.
+ */
+int page_evictable(struct page *page)
+{
+	return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+}
+
+#ifdef CONFIG_SHMEM
+/**
+ * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
+ * @pages:	array of pages to check
+ * @nr_pages:	number of pages to check
+ *
+ * Checks pages for evictability and moves them to the appropriate lru list.
+ *
+ * This function is only used for SysV IPC SHM_UNLOCK.
+ */
+void check_move_unevictable_pages(struct page **pages, int nr_pages)
+{
+	struct lruvec *lruvec;
+	struct zone *zone = NULL;
+	int pgscanned = 0;
+	int pgrescued = 0;
+	int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = pages[i];
+		struct zone *pagezone;
+		/* keep one zone's lru_lock held; swap locks only when the zone changes */
+		pgscanned++;
+		pagezone = page_zone(page);
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock_irq(&zone->lru_lock);
+		}
+		lruvec = mem_cgroup_page_lruvec(page, zone);
+		/* only pages on the LRU and flagged unevictable are candidates */
+		if (!PageLRU(page) || !PageUnevictable(page))
+			continue;
+
+		if (page_evictable(page)) {
+			enum lru_list lru = page_lru_base_type(page);
+
+			VM_BUG_ON_PAGE(PageActive(page), page);
+			ClearPageUnevictable(page);
+			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
+			add_page_to_lru_list(page, lruvec, lru);
+			pgrescued++;
+		}
+	}
+	/* zone stays NULL only if no page was visited; otherwise its lru_lock is still held */
+	if (zone) {
+		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
+		spin_unlock_irq(&zone->lru_lock);
+	}
+}
+#endif /* CONFIG_SHMEM */
diff --git a/kernel/mm/vmstat.c b/kernel/mm/vmstat.c
new file mode 100644
index 000000000..86f0e2e3f
--- /dev/null
+++ b/kernel/mm/vmstat.c
@@ -0,0 +1,1705 @@
+/*
+ *  linux/mm/vmstat.c
+ *
+ *  Manages VM statistics
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  zoned VM statistics
+ *  Copyright (C) 2006 Silicon Graphics, Inc.,
+ *		Christoph Lameter
+ *  Copyright (C) 2008-2014 Christoph Lameter
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "internal.h"
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
+EXPORT_PER_CPU_SYMBOL(vm_event_states);
+/*
+ * Sum the per-cpu event counters of every online CPU into @ret.
+ * @ret is zeroed first and must have room for NR_VM_EVENT_ITEMS entries.
+ */
+static void sum_vm_events(unsigned long *ret)
+{
+	int cpu;
+	int i;
+
+	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
+
+	for_each_online_cpu(cpu) {
+		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
+
+		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+			ret[i] += this->event[i];
+	}
+}
+
+/*
+ * Accumulate the vm event counters across all CPUs.
+ * The result is unavoidably approximate - it can change
+ * during and after execution of this function.
+ *
+ * @ret must have room for NR_VM_EVENT_ITEMS entries. The summation runs
+ * between get_online_cpus() and put_online_cpus().
+ */
+void all_vm_events(unsigned long *ret)
+{
+	get_online_cpus();
+	sum_vm_events(ret);
+	put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(all_vm_events);
+
+/*
+ * Fold the foreign cpu events into our own.
+ *
+ * This is adding to the events on one processor
+ * but keeps the global counts constant.
+ *
+ * Every counter of @cpu is passed to count_vm_events() and then reset to 0.
+ */
+void vm_events_fold_cpu(int cpu)
+{
+	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
+	int i;
+
+	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+		count_vm_events(i, fold_state->event[i]);
+		fold_state->event[i] = 0;
+	}
+}
+
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
+/*
+ * Manage combined zone based / global counters
+ *
+ * vm_stat contains the global counters
+ */
+atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+EXPORT_SYMBOL(vm_stat);
+
+#ifdef CONFIG_SMP
+/*
+ * Compute the reduced ("pressure") per-cpu stat threshold for @zone:
+ * (low watermark - min watermark) / online CPUs, clamped to [1, 125].
+ */
+int calculate_pressure_threshold(struct zone *zone)
+{
+	int threshold;
+	int watermark_distance;
+
+	/*
+	 * As vmstats are not up to date, there is drift between the estimated
+	 * and real values. For high thresholds and a high number of CPUs, it
+	 * is possible for the min watermark to be breached while the estimated
+	 * value looks fine. The pressure threshold is a reduced value such
+	 * that even the maximum amount of drift will not accidentally breach
+	 * the min watermark
+	 */
+	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
+/*
+ * Compute the normal per-cpu stat threshold for @zone:
+ * 2 * fls(online CPUs) * (1 + fls(managed_pages in 128 MB units)),
+ * capped at 125.
+ */
+int calculate_normal_threshold(struct zone *zone)
+{
+	int threshold;
+	int mem;	/* memory in 128 MB units */
+
+	/*
+	 * The threshold scales with the number of processors and the amount
+	 * of memory per zone. More memory means that we can defer updates for
+	 * longer, more processors could lead to more contention.
+	 * fls() is used to have a cheap way of logarithmic scaling.
+	 *
+	 * Some sample thresholds:
+	 *
+	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
+	 * ------------------------------------------------------------------
+	 * 8		1		1	0.9-1 GB	4
+	 * 16		2		2	0.9-1 GB	4
+	 * 20		2		2	1-2 GB		5
+	 * 24		2		2	2-4 GB		6
+	 * 28		2		2	4-8 GB		7
+	 * 32		2		2	8-16 GB		8
+	 * 4		2		2	<128M		1
+	 * 30		4		3	2-4 GB		5
+	 * 48		4		3	8-16 GB		8
+	 * 32		8		4	1-2 GB		4
+	 * 32		8		4	0.9-1GB		4
+	 * 10		16		5	<128M		1
+	 * 40		16		5	900M		4
+	 * 70		64		7	2-4 GB		5
+	 * 84		64		7	4-8 GB		6
+	 * 108		512		9	4-8 GB		6
+	 * 125		1024		10	8-16 GB		8
+	 * 125		1024		10	16-32 GB	9
+	 */
+
+	mem = zone->managed_pages >> (27 - PAGE_SHIFT);
+
+	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
+
+/*
+ * Refresh the thresholds for each zone.
+ */ +void refresh_zone_stat_thresholds(void) +{ + struct zone *zone; + int cpu; + int threshold; + + for_each_populated_zone(zone) { + unsigned long max_drift, tolerate_drift; + + threshold = calculate_normal_threshold(zone); + + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + + /* + * Only set percpu_drift_mark if there is a danger that + * NR_FREE_PAGES reports the low watermark is ok when in fact + * the min watermark could be breached by an allocation + */ + tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone); + max_drift = num_online_cpus() * threshold; + if (max_drift > tolerate_drift) + zone->percpu_drift_mark = high_wmark_pages(zone) + + max_drift; + } +} + +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = (*calculate_pressure)(zone); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } +} + +/* + * For use when we know that interrupts are disabled, + * or when we know that preemption is disabled and that + * particular counter cannot be updated from interrupt context. + */ +void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; + long x; + long t; + + preempt_disable_rt(); + x = delta + __this_cpu_read(*p); + + t = __this_cpu_read(pcp->stat_threshold); + + if (unlikely(x > t || x < -t)) { + zone_page_state_add(x, zone, item); + x = 0; + } + __this_cpu_write(*p, x); + preempt_enable_rt(); +} +EXPORT_SYMBOL(__mod_zone_page_state); + +/* + * Optimized increment and decrement functions. 
+ *
+ * These are only for a single page and therefore can take a struct page *
+ * argument instead of struct zone *. This allows the inclusion of the code
+ * generated for page_zone(page) into the optimized functions.
+ *
+ * No overflow check is necessary and therefore the differential can be
+ * incremented or decremented in place which may allow the compilers to
+ * generate better code.
+ * The increment or decrement is known and therefore one boundary check can
+ * be omitted.
+ *
+ * NOTE: These functions are very performance sensitive. Change only
+ * with care.
+ *
+ * Some processors have inc/dec instructions that are atomic vs an interrupt.
+ * However, the code must first determine the differential location in a zone
+ * based on the processor number and then inc/dec the counter. There is no
+ * guarantee without disabling preemption that the processor will not change
+ * in between and therefore the atomicity vs. interrupt cannot be exploited
+ * in a useful way here.
+ */
+void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	s8 v, t;
+
+	preempt_disable_rt();
+	v = __this_cpu_inc_return(*p);
+	t = __this_cpu_read(pcp->stat_threshold);
+	if (unlikely(v > t)) {
+		s8 overstep = t >> 1;
+		/*
+		 * Flush the differential plus half a threshold in advance and
+		 * restart the per-cpu counter at -overstep.
+		 */
+		zone_page_state_add(v + overstep, zone, item);
+		__this_cpu_write(*p, -overstep);
+	}
+	preempt_enable_rt();
+}
+/* Page variant: looks up the zone with page_zone() and calls __inc_zone_state(). */
+void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	__inc_zone_state(page_zone(page), item);
+}
+EXPORT_SYMBOL(__inc_zone_page_state);
+/*
+ * Decrement counterpart of __inc_zone_state(): once the differential drops
+ * below -threshold it is flushed together with an extra half threshold and
+ * the per-cpu counter restarts at +overstep.
+ */
+void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	s8 v, t;
+
+	preempt_disable_rt();
+	v = __this_cpu_dec_return(*p);
+	t = __this_cpu_read(pcp->stat_threshold);
+	if (unlikely(v < - t)) {
+		s8 overstep = t >> 1;
+
+		zone_page_state_add(v - overstep, zone, item);
+		__this_cpu_write(*p, overstep);
+	}
+	preempt_enable_rt();
+}
+/* Page variant: looks up the zone with page_zone() and calls __dec_zone_state(). */
+void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	__dec_zone_state(page_zone(page), item);
+}
+EXPORT_SYMBOL(__dec_zone_page_state);
+
+#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * @delta is added to the per-cpu differential for @item. If the result
+ * leaves [-threshold, threshold], the result plus the overstep is moved to
+ * the zone counters and the differential is set to minus the overstep.
+ *
+ * Overstep mode specifies how overstep should be handled:
+ *     0       No overstepping
+ *     1       Overstepping half of threshold
+ *     -1      Overstepping minus half of threshold
+ */
+static inline void mod_state(struct zone *zone,
+       enum zone_stat_item item, int delta, int overstep_mode)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_stat_diff + item;
+	long o, n, t, z;
+
+	do {
+		z = 0;  /* overflow to zone counters */
+
+		/*
+		 * The fetching of the stat_threshold is racy. We may apply
+		 * a counter threshold to the wrong cpu if we get
+		 * rescheduled while executing here. However, the next
+		 * counter update will apply the threshold again and
+		 * therefore bring the counter under the threshold again.
+		 *
+		 * Most of the time the thresholds are the same anyways
+		 * for all cpus in a zone.
+		 */
+		t = this_cpu_read(pcp->stat_threshold);
+
+		o = this_cpu_read(*p);
+		n = delta + o;
+
+		if (n > t || n < -t) {
+			int os = overstep_mode * (t >> 1) ;
+
+			/* Overflow must be added to zone counters */
+			z = n + os;
+			n = -os;
+		}
+	} while (this_cpu_cmpxchg(*p, o, n) != o);	/* retry if *p changed since it was read */
+
+	if (z)
+		zone_page_state_add(z, zone, item);
+}
+/* Add @delta with no overstepping. */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+				int delta)
+{
+	mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+/* Increment by one, overstepping by half a threshold on overflow. */
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	mod_state(zone, item, 1, 1);
+}
+/* Same as inc_zone_state(), for the zone that @page belongs to. */
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+/* Decrement by one, overstepping by minus half a threshold on overflow. */
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ *
+ * Each function below wraps its __-prefixed counterpart in
+ * local_irq_save()/local_irq_restore().
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+					int delta)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__mod_zone_page_state(zone, item, delta);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__inc_zone_state(zone, item);
+	local_irq_restore(flags);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	unsigned long flags;
+	struct zone *zone;
+
+	zone = page_zone(page);		/* resolved before interrupts are disabled */
+	local_irq_save(flags);
+	__inc_zone_state(zone, item);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__dec_zone_page_state(page, item);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+#endif
+
+
+/*
+ * Fold a differential into the global counters.
+ * Returns the number of counters updated.
+ */
+static int fold_diff(int *diff)
+{
+	int i;
+	int changes = 0;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (diff[i]) {
+			atomic_long_add(diff[i], &vm_stat[i]);
+			changes++;
+		}
+	return changes;
+}
+
+/*
+ * Update the zone counters for the current cpu.
+ *
+ * Note that refresh_cpu_vm_stats strives to only access
+ * node local memory. The per cpu pagesets on remote zones are placed
+ * in the memory local to the processor using that pageset. So the
+ * loop over all zones will access a series of cachelines local to
+ * the processor.
+ *
+ * The call to zone_page_state_add updates the cachelines with the
+ * statistics in the remote zone struct as well as the global cachelines
+ * with the global counters. These could cause remote node cache line
+ * bouncing and will have to be only done when necessary.
+ *
+ * The function returns the number of global counters updated.
+ * With CONFIG_NUMA, each remote pageset drained through drain_zone_pages()
+ * is added to that count as well.
+ */
+static int refresh_cpu_vm_stats(void)
+{
+	struct zone *zone;
+	int i;
+	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+	int changes = 0;
+
+	for_each_populated_zone(zone) {
+		struct per_cpu_pageset __percpu *p = zone->pageset;
+
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+			int v;
+
+			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
+			if (v) {
+				/* zone counter is updated now; the global one is batched in global_diff[] */
+				atomic_long_add(v, &zone->vm_stat[i]);
+				global_diff[i] += v;
+#ifdef CONFIG_NUMA
+				/* 3 seconds idle till flush */
+				__this_cpu_write(p->expire, 3);
+#endif
+			}
+		}
+		cond_resched();
+#ifdef CONFIG_NUMA
+		/*
+		 * Deal with draining the remote pageset of this
+		 * processor
+		 *
+		 * Check if there are pages remaining in this pageset
+		 * if not then there is nothing to expire.
+		 */
+		if (!__this_cpu_read(p->expire) ||
+			       !__this_cpu_read(p->pcp.count))
+			continue;
+
+		/*
+		 * We never drain zones local to this processor.
+		 */
+		if (zone_to_nid(zone) == numa_node_id()) {
+			__this_cpu_write(p->expire, 0);
+			continue;
+		}
+		/* count down; the pageset is drained only once expire reaches zero */
+		if (__this_cpu_dec_return(p->expire))
+			continue;
+
+		if (__this_cpu_read(p->pcp.count)) {
+			drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+			changes++;
+		}
+#endif
+	}
+	changes += fold_diff(global_diff);
+	return changes;
+}
+
+/*
+ * Fold the data for an offline cpu into the global array.
+ * There cannot be any access by the offline cpu and therefore
+ * synchronization is simplified.
+ *
+ * The differentials of @cpu are read and cleared with plain loads and
+ * stores, added to the zone counters, and passed to fold_diff() in one
+ * batch for the global counters.
+ */
+void cpu_vm_stats_fold(int cpu)
+{
+	struct zone *zone;
+	int i;
+	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+
+	for_each_populated_zone(zone) {
+		struct per_cpu_pageset *p;
+
+		p = per_cpu_ptr(zone->pageset, cpu);
+
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+			if (p->vm_stat_diff[i]) {
+				int v;
+
+				v = p->vm_stat_diff[i];
+				p->vm_stat_diff[i] = 0;
+				atomic_long_add(v, &zone->vm_stat[i]);
+				global_diff[i] += v;
+			}
+	}
+
+	fold_diff(global_diff);
+}
+
+/*
+ * This is only called if !populated_zone(zone), which implies no other users
+ * of pset->vm_stat_diff[] exist.
+ *
+ * Every non-zero differential in @pset is cleared and added to both the
+ * zone counter and the global counter.
+ */
+void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+{
+	int i;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (pset->vm_stat_diff[i]) {
+			int v = pset->vm_stat_diff[i];
+			pset->vm_stat_diff[i] = 0;
+			atomic_long_add(v, &zone->vm_stat[i]);
+			atomic_long_add(v, &vm_stat[i]);
+		}
+}
+#endif
+
+#ifdef CONFIG_NUMA
+/*
+ * zonelist = the list of zones passed to the allocator
+ * z 	    = the zone from which the allocation occurred.
+ *
+ * Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set assume the node of the preferred
+ * zone is the local node. This is useful for daemons who allocate
+ * memory on behalf of other processes.
+ */ +void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) +{ + if (z->zone_pgdat == preferred_zone->zone_pgdat) { + __inc_zone_state(z, NUMA_HIT); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(preferred_zone, NUMA_FOREIGN); + } + if (z->node == ((flags & __GFP_OTHER_NODE) ? + preferred_zone->node : numa_node_id())) + __inc_zone_state(z, NUMA_LOCAL); + else + __inc_zone_state(z, NUMA_OTHER); +} +#endif + +#ifdef CONFIG_COMPACTION + +struct contig_page_info { + unsigned long free_pages; + unsigned long free_blocks_total; + unsigned long free_blocks_suitable; +}; + +/* + * Calculate the number of free pages in a zone, how many contiguous + * pages are free and how many are large enough to satisfy an allocation of + * the target size. Note that this function makes no attempt to estimate + * how many suitable free blocks there *might* be if MOVABLE pages were + * migrated. Calculating that is possible, but expensive and can be + * figured out from userspace + */ +static void fill_contig_page_info(struct zone *zone, + unsigned int suitable_order, + struct contig_page_info *info) +{ + unsigned int order; + + info->free_pages = 0; + info->free_blocks_total = 0; + info->free_blocks_suitable = 0; + + for (order = 0; order < MAX_ORDER; order++) { + unsigned long blocks; + + /* Count number of free blocks */ + blocks = zone->free_area[order].nr_free; + info->free_blocks_total += blocks; + + /* Count free base pages */ + info->free_pages += blocks << order; + + /* Count the suitable free blocks */ + if (order >= suitable_order) + info->free_blocks_suitable += blocks << + (order - suitable_order); + } +} + +/* + * A fragmentation index only makes sense if an allocation of a requested + * size would fail. If that is true, the fragmentation index indicates + * whether external fragmentation or a lack of memory was the problem. 
+ * The value can be used to determine if page reclaim or compaction
+ * should be used
+ */
+static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
+{
+	unsigned long requested = 1UL << order;
+
+	if (!info->free_blocks_total)
+		return 0;
+
+	/* Fragmentation index only makes sense when a request would fail */
+	if (info->free_blocks_suitable)
+		return -1000;
+
+	/*
+	 * Index is between 0 and 1 so return within 3 decimal places
+	 *
+	 * 0 => allocation would fail due to lack of memory
+	 * 1 => allocation would fail due to fragmentation
+	 *
+	 * Computed in integer arithmetic as
+	 *   1000 - (1000 + free_pages * 1000 / requested) / free_blocks_total
+	 */
+	return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
+}
+
+/* Same as __fragmentation_index() but keeps the contig_page_info on the stack */
+int fragmentation_index(struct zone *zone, unsigned int order)
+{
+	struct contig_page_info info;
+
+	fill_contig_page_info(zone, order, &info);
+	return __fragmentation_index(order, &info);
+}
+#endif
+
+#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
+#ifdef CONFIG_ZONE_DMA
+#define TEXT_FOR_DMA(xx) xx "_dma",
+#else
+#define TEXT_FOR_DMA(xx)
+#endif
+
+#ifdef CONFIG_ZONE_DMA32
+#define TEXT_FOR_DMA32(xx) xx "_dma32",
+#else
+#define TEXT_FOR_DMA32(xx)
+#endif
+
+#ifdef CONFIG_HIGHMEM
+#define TEXT_FOR_HIGHMEM(xx) xx "_high",
+#else
+#define TEXT_FOR_HIGHMEM(xx)
+#endif
+/*
+ * Expands to one comma-terminated string literal per configured zone type:
+ * xx_dma, xx_dma32, xx_normal, xx_high, xx_movable.
+ */
+#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
+					TEXT_FOR_HIGHMEM(xx) xx "_movable",
+/*
+ * Printable counter names. The table is indexed by counter number, so the
+ * entries and their #ifdefs have to stay in step with the enums named in
+ * the section comments below.
+ */
+const char * const vmstat_text[] = {
+	/* enum zone_stat_item counters */
+	"nr_free_pages",
+	"nr_alloc_batch",
+	"nr_inactive_anon",
+	"nr_active_anon",
+	"nr_inactive_file",
+	"nr_active_file",
+	"nr_unevictable",
+	"nr_mlock",
+	"nr_anon_pages",
+	"nr_mapped",
+	"nr_file_pages",
+	"nr_dirty",
+	"nr_writeback",
+	"nr_slab_reclaimable",
+	"nr_slab_unreclaimable",
+	"nr_page_table_pages",
+	"nr_kernel_stack",
+	"nr_unstable",
+	"nr_bounce",
+	"nr_vmscan_write",
+	"nr_vmscan_immediate_reclaim",
+	"nr_writeback_temp",
+	"nr_isolated_anon",
+	"nr_isolated_file",
+	"nr_shmem",
+	"nr_dirtied",
+	"nr_written",
+	"nr_pages_scanned",
+
+#ifdef CONFIG_NUMA
+	"numa_hit",
+	"numa_miss",
+	"numa_foreign",
+	"numa_interleave",
+	"numa_local",
+	"numa_other",
+#endif
+	"workingset_refault",
+	"workingset_activate",
+	"workingset_nodereclaim",
+	"nr_anon_transparent_hugepages",
+	"nr_free_cma",
+
+	/* enum writeback_stat_item counters */
+	"nr_dirty_threshold",
+	"nr_dirty_background_threshold",
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	/* enum vm_event_item counters */
+	"pgpgin",
+	"pgpgout",
+	"pswpin",
+	"pswpout",
+
+	TEXTS_FOR_ZONES("pgalloc")
+
+	"pgfree",
+	"pgactivate",
+	"pgdeactivate",
+
+	"pgfault",
+	"pgmajfault",
+
+	TEXTS_FOR_ZONES("pgrefill")
+	TEXTS_FOR_ZONES("pgsteal_kswapd")
+	TEXTS_FOR_ZONES("pgsteal_direct")
+	TEXTS_FOR_ZONES("pgscan_kswapd")
+	TEXTS_FOR_ZONES("pgscan_direct")
+	"pgscan_direct_throttle",
+
+#ifdef CONFIG_NUMA
+	"zone_reclaim_failed",
+#endif
+	"pginodesteal",
+	"slabs_scanned",
+	"kswapd_inodesteal",
+	"kswapd_low_wmark_hit_quickly",
+	"kswapd_high_wmark_hit_quickly",
+	"pageoutrun",
+	"allocstall",
+
+	"pgrotated",
+
+	"drop_pagecache",
+	"drop_slab",
+
+#ifdef CONFIG_NUMA_BALANCING
+	"numa_pte_updates",
+	"numa_huge_pte_updates",
+	"numa_hint_faults",
+	"numa_hint_faults_local",
+	"numa_pages_migrated",
+#endif
+#ifdef CONFIG_MIGRATION
+	"pgmigrate_success",
+	"pgmigrate_fail",
+#endif
+#ifdef CONFIG_COMPACTION
+	"compact_migrate_scanned",
+	"compact_free_scanned",
+	"compact_isolated",
+	"compact_stall",
+	"compact_fail",
+	"compact_success",
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+	"htlb_buddy_alloc_success",
+	"htlb_buddy_alloc_fail",
+#endif
+	"unevictable_pgs_culled",
+	"unevictable_pgs_scanned",
+	"unevictable_pgs_rescued",
+	"unevictable_pgs_mlocked",
+	"unevictable_pgs_munlocked",
+	"unevictable_pgs_cleared",
+	"unevictable_pgs_stranded",
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	"thp_fault_alloc",
+	"thp_fault_fallback",
+	"thp_collapse_alloc",
+	"thp_collapse_alloc_failed",
+	"thp_split",
+	"thp_zero_page_alloc",
+	"thp_zero_page_alloc_failed",
+#endif
+#ifdef CONFIG_MEMORY_BALLOON
+	"balloon_inflate",
+	"balloon_deflate",
+#ifdef CONFIG_BALLOON_COMPACTION
+	"balloon_migrate",
+#endif
+#endif /* CONFIG_MEMORY_BALLOON */
+#ifdef CONFIG_DEBUG_TLBFLUSH
+#ifdef CONFIG_SMP
+	"nr_tlb_remote_flush",
+	"nr_tlb_remote_flush_received",
+#endif /* CONFIG_SMP */
+	"nr_tlb_local_flush_all",
+	"nr_tlb_local_flush_one",
+#endif /* CONFIG_DEBUG_TLBFLUSH */
+
+#ifdef CONFIG_DEBUG_VM_VMACACHE
+	"vmacache_find_calls",
+	"vmacache_find_hits",
+	"vmacache_full_flushes",
+#endif
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+};
+#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
+
+
+#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
+	defined(CONFIG_PROC_FS)
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+	pg_data_t *pgdat;
+	loff_t node = *pos;
+	/* step over *pos online nodes; the walk also stops once pgdat becomes NULL */
+	for (pgdat = first_online_pgdat();
+	     pgdat && node;
+	     pgdat = next_online_pgdat(pgdat))
+		--node;
+
+	return pgdat;
+}
+/* seq_file ->next: advance *pos and move on to the next online node */
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	pg_data_t *pgdat = (pg_data_t *)arg;
+
+	(*pos)++;
+	return next_online_pgdat(pgdat);
+}
+/* seq_file ->stop: frag_start() acquires nothing, so there is nothing to undo */
+static void frag_stop(struct seq_file *m, void *arg)
+{
+}
+
+/*
+ * Walk all the zones in a node and print using a callback
+ *
+ * Unpopulated zones are skipped. @print runs with zone->lock held via
+ * spin_lock_irqsave().
+ */
+static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
+{
+	struct zone *zone;
+	struct zone *node_zones = pgdat->node_zones;
+	unsigned long flags;
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+		if (!populated_zone(zone))
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		print(m, pgdat, zone);
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+static char * const migratetype_names[MIGRATE_TYPES] = {	/* indexed by migratetype */
+	"Unmovable",
+	"Reclaimable",
+	"Movable",
+	"Reserve",
+#ifdef CONFIG_CMA
+	"CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+	"Isolate",
+#endif
+};
+/* One output line per zone: free_area[order].nr_free for every order */
+static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
+						struct zone *zone)
+{
+	int order;
+
+	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+	for (order = 0; order < MAX_ORDER; ++order)
+		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+	seq_putc(m, '\n');
+}
+
+/*
+ * This walks the free areas for each zone.
+ */
+static int frag_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = (pg_data_t *)arg;
+	walk_zones_in_node(m, pgdat, frag_show_print);
+	return 0;
+}
+/*
+ * One output line per migratetype of @zone, giving the number of entries
+ * on free_list[mtype] at every order.
+ */
+static void pagetypeinfo_showfree_print(struct seq_file *m,
+					pg_data_t *pgdat, struct zone *zone)
+{
+	int order, mtype;
+
+	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
+		seq_printf(m, "Node %4d, zone %8s, type %12s ",
+					pgdat->node_id,
+					zone->name,
+					migratetype_names[mtype]);
+		for (order = 0; order < MAX_ORDER; ++order) {
+			unsigned long freecount = 0;
+			struct free_area *area;
+			struct list_head *curr;
+
+			area = &(zone->free_area[order]);
+			/* the count is obtained by walking the list */
+			list_for_each(curr, &area->free_list[mtype])
+				freecount++;
+			seq_printf(m, "%6lu ", freecount);
+		}
+		seq_putc(m, '\n');
+	}
+}
+
+/* Print out the free pages at each order for each migratetype */
+static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
+{
+	int order;
+	pg_data_t *pgdat = (pg_data_t *)arg;
+
+	/* Print header */
+	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
+	for (order = 0; order < MAX_ORDER; ++order)
+		seq_printf(m, "%6d ", order);
+	seq_putc(m, '\n');
+
+	walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
+
+	return 0;
+}
+/*
+ * Count the pageblocks of each migratetype in @zone by sampling the first
+ * pfn of every pageblock_nr_pages step, then print one line of counts.
+ */
+static void pagetypeinfo_showblockcount_print(struct seq_file *m,
+					pg_data_t *pgdat, struct zone *zone)
+{
+	int mtype;
+	unsigned long pfn;
+	unsigned long start_pfn = zone->zone_start_pfn;
+	unsigned long end_pfn = zone_end_pfn(zone);
+	unsigned long count[MIGRATE_TYPES] = { 0, };
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+		struct page *page;
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+
+		/* Watch for unexpected holes punched in the memmap */
+		if (!memmap_valid_within(pfn, page, zone))
+			continue;
+
+		mtype = get_pageblock_migratetype(page);
+		/* values outside the table are not counted */
+		if (mtype < MIGRATE_TYPES)
+			count[mtype]++;
+	}
+
+	/* Print counts */
+	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
+		seq_printf(m, "%12lu ", count[mtype]);
+	seq_putc(m, '\n');
+}
+
+/* Print out the number of pageblocks for each migratetype */
+static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
+{
+	int mtype;
+	pg_data_t *pgdat = (pg_data_t *)arg;
+
+	seq_printf(m, "\n%-23s", "Number of blocks type ");
+	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
+		seq_printf(m, "%12s ", migratetype_names[mtype]);
+	seq_putc(m, '\n');
+	walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
+
+	return 0;
+}
+
+#ifdef CONFIG_PAGE_OWNER
+static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
+							pg_data_t *pgdat,
+							struct zone *zone)
+{
+	struct page *page;
+	struct page_ext *page_ext;
+	unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
+	unsigned long end_pfn = pfn + zone->spanned_pages;
+	unsigned long count[MIGRATE_TYPES] = { 0, };
+	int pageblock_mt, page_mt;
+	int i;
+
+	/*
+	 * Scan block by block. First and last block may be incomplete.
+	 * NOTE(review): pfn already holds zone_start_pfn from its
+	 * declaration, so the assignment below is redundant.
+	 */
+	pfn = zone->zone_start_pfn;
+
+	/*
+	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
+	 * a zone boundary, it will be double counted between zones. This does
+	 * not matter as the mixed block count will still be correct
+	 */
+	for (; pfn < end_pfn; ) {
+		if (!pfn_valid(pfn)) {
+			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+			continue;
+		}
+
+		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+		block_end_pfn = min(block_end_pfn, end_pfn);
+
+		page = pfn_to_page(pfn);
+		pageblock_mt = get_pfnblock_migratetype(page, pfn);
+
+		for (; pfn < block_end_pfn; pfn++) {
+			if (!pfn_valid_within(pfn))
+				continue;
+
+			page = pfn_to_page(pfn);
+			if (PageBuddy(page)) {
+				pfn += (1UL << page_order(page)) - 1;	/* skip the rest of the free block */
+				continue;
+			}
+
+			if (PageReserved(page))
+				continue;
+
+			page_ext = lookup_page_ext(page);
+			/* pages without the PAGE_EXT_OWNER bit are skipped */
+			if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+				continue;
+
+			page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
+			if (pageblock_mt != page_mt) {
+				/* a block is counted once; CMA blocks go under MIGRATE_MOVABLE */
+				if (is_migrate_cma(pageblock_mt))
+					count[MIGRATE_MOVABLE]++;
+				else
+					count[pageblock_mt]++;
+
+				pfn = block_end_pfn;
+				break;
+			}
+			pfn += (1UL << page_ext->order) - 1;	/* skip the rest of this allocation */
+		}
+	}
+
+	/* Print counts */
+	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+	for (i = 0; i < MIGRATE_TYPES; i++)
+		seq_printf(m, "%12lu ", count[i]);
+	seq_putc(m, '\n');
+}
+#endif /* CONFIG_PAGE_OWNER */
+
+/*
+ * Print out the number of pageblocks for each migratetype that contain pages
+ * of other types. This gives an indication of how well fallbacks are being
+ * contained by rmqueue_fallback().
It requires information from PAGE_OWNER
+ * to determine what is going on
+ */
+static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
+{
+#ifdef CONFIG_PAGE_OWNER
+	int mtype;
+	/* print nothing, not even the header, while page_owner_inited is false */
+	if (!page_owner_inited)
+		return;
+
+	drain_all_pages(NULL);
+
+	seq_printf(m, "\n%-23s", "Number of mixed blocks ");
+	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
+		seq_printf(m, "%12s ", migratetype_names[mtype]);
+	seq_putc(m, '\n');
+
+	walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
+#endif /* CONFIG_PAGE_OWNER */
+}
+
+/*
+ * This prints out statistics in relation to grouping pages by mobility.
+ * It is expensive to collect so do not constantly read the file.
+ *
+ * Nodes without the N_MEMORY state produce no output. Always returns 0.
+ */
+static int pagetypeinfo_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = (pg_data_t *)arg;
+
+	/* check memoryless node */
+	if (!node_state(pgdat->node_id, N_MEMORY))
+		return 0;
+
+	seq_printf(m, "Page block order: %d\n", pageblock_order);
+	seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
+	seq_putc(m, '\n');
+	pagetypeinfo_showfree(m, pgdat);
+	pagetypeinfo_showblockcount(m, pgdat);
+	pagetypeinfo_showmixedcount(m, pgdat);
+
+	return 0;
+}
+/* seq_file iterator over online nodes; frag_show() prints one node per step */
+static const struct seq_operations fragmentation_op = {
+	.start	= frag_start,
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= frag_show,
+};
+
+static int fragmentation_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &fragmentation_op);
+}
+
+static const struct file_operations fragmentation_file_operations = {
+	.open		= fragmentation_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+/* same node iterator as fragmentation_op, showing with pagetypeinfo_show() */
+static const struct seq_operations pagetypeinfo_op = {
+	.start	= frag_start,
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= pagetypeinfo_show,
+};
+
+static int pagetypeinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &pagetypeinfo_op);
+}
+
+static const struct file_operations pagetypeinfo_file_ops = {
+	.open		= pagetypeinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+/*
+ * Print everything reported for one zone: free pages and watermarks, every
+ * zone stat item by its vmstat_text[] name, the lowmem_reserve[] values,
+ * the pcp state of each online CPU, and a few zone fields.
+ */
+static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
+							struct zone *zone)
+{
+	int i;
+	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+	seq_printf(m,
+		   "\n pages free %lu"
+		   "\n min %lu"
+		   "\n low %lu"
+		   "\n high %lu"
+		   "\n scanned %lu"
+		   "\n spanned %lu"
+		   "\n present %lu"
+		   "\n managed %lu",
+		   zone_page_state(zone, NR_FREE_PAGES),
+		   min_wmark_pages(zone),
+		   low_wmark_pages(zone),
+		   high_wmark_pages(zone),
+		   zone_page_state(zone, NR_PAGES_SCANNED),
+		   zone->spanned_pages,
+		   zone->present_pages,
+		   zone->managed_pages);
+	/* the first NR_VM_ZONE_STAT_ITEMS names of vmstat_text[] label the zone counters */
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		seq_printf(m, "\n %-12s %lu", vmstat_text[i],
+				zone_page_state(zone, i));
+
+	seq_printf(m,
+		   "\n protection: (%ld",
+		   zone->lowmem_reserve[0]);
+	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
+	seq_printf(m,
+		   ")"
+		   "\n pagesets");
+	for_each_online_cpu(i) {
+		struct per_cpu_pageset *pageset;
+
+		pageset = per_cpu_ptr(zone->pageset, i);
+		seq_printf(m,
+			   "\n cpu: %i"
+			   "\n count: %i"
+			   "\n high: %i"
+			   "\n batch: %i",
+			   i,
+			   pageset->pcp.count,
+			   pageset->pcp.high,
+			   pageset->pcp.batch);
+#ifdef CONFIG_SMP
+		seq_printf(m, "\n vm stats threshold: %d",
+				pageset->stat_threshold);
+#endif
+	}
+	seq_printf(m,
+		   "\n all_unreclaimable: %u"
+		   "\n start_pfn: %lu"
+		   "\n inactive_ratio: %u",
+		   !zone_reclaimable(zone),
+		   zone->zone_start_pfn,
+		   zone->inactive_ratio);
+	seq_putc(m, '\n');
+}
+
+/*
+ * Output information about zones in @pgdat.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = (pg_data_t *)arg;
+	walk_zones_in_node(m, pgdat, zoneinfo_show_print);
+	return 0;
+}
+
+static const struct seq_operations zoneinfo_op = {
+	.start	= frag_start, /* iterate over all zones. The same as in
+			       * fragmentation. */
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= zoneinfo_show,
+};
+
+static int zoneinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &zoneinfo_op);
+}
+
+static const struct file_operations proc_zoneinfo_file_operations = {
+	.open		= zoneinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+/* Slots that vmstat_start() places right after the zone counters */
+enum writeback_stat_item {
+	NR_DIRTY_THRESHOLD,
+	NR_DIRTY_BG_THRESHOLD,
+	NR_VM_WRITEBACK_STAT_ITEMS,
+};
+/*
+ * seq_file ->start for the vmstat listing: snapshot every counter into one
+ * kmalloc()ed array stored in m->private and return a pointer to entry *pos.
+ *
+ * Buffer layout, in the same order as vmstat_text[]:
+ *   NR_VM_ZONE_STAT_ITEMS       values from global_page_state()
+ *   NR_VM_WRITEBACK_STAT_ITEMS  limits from global_dirty_limits()
+ *   struct vm_event_state       all_vm_events(), CONFIG_VM_EVENT_COUNTERS only
+ *
+ * Returns NULL when *pos is past the end of vmstat_text[] and
+ * ERR_PTR(-ENOMEM) when the allocation fails. vmstat_stop() frees the buffer.
+ */
+static void *vmstat_start(struct seq_file *m, loff_t *pos)
+{
+	unsigned long *v;
+	int i, stat_items_size;
+
+	if (*pos >= ARRAY_SIZE(vmstat_text))
+		return NULL;
+	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+			  NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	stat_items_size += sizeof(struct vm_event_state);
+#endif
+
+	v = kmalloc(stat_items_size, GFP_KERNEL);
+	m->private = v;		/* stored even when NULL; vmstat_stop() passes it to kfree() */
+	if (!v)
+		return ERR_PTR(-ENOMEM);
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		v[i] = global_page_state(i);
+	v += NR_VM_ZONE_STAT_ITEMS;
+
+	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
+			    v + NR_DIRTY_THRESHOLD);
+	v += NR_VM_WRITEBACK_STAT_ITEMS;
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	all_vm_events(v);
+	v[PGPGIN] /= 2;		/* sectors -> kbytes */
+	v[PGPGOUT] /= 2;
+#endif
+	return (unsigned long *)m->private + *pos;
+}
+/* seq_file ->next: step to the following buffer entry, NULL past the last name */
+static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	(*pos)++;
+	if (*pos >= ARRAY_SIZE(vmstat_text))
+		return NULL;
+	return (unsigned long *)m->private + *pos;
+}
+/* seq_file ->show: @arg points into the m->private buffer; its offset picks the name */
+static int vmstat_show(struct seq_file *m, void *arg)
+{
+	unsigned long *l = arg;
+	unsigned long off = l - (unsigned long *)m->private;
+
+	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
+	return 0;
+}
+/* seq_file ->stop: free the snapshot taken by vmstat_start() */
+static void vmstat_stop(struct seq_file *m, void *arg)
+{
+	kfree(m->private);
+	m->private = NULL;
+}
+
+static const struct seq_operations vmstat_op = {
+	.start	= vmstat_start,
+	.next	= vmstat_next,
+	.stop	= vmstat_stop,
+	.show	= vmstat_show,
+};
+ +static int vmstat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vmstat_op); +} + +static const struct file_operations proc_vmstat_file_operations = { + .open = vmstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct delayed_work, vmstat_work); +int sysctl_stat_interval __read_mostly = HZ; +static cpumask_var_t cpu_stat_off; + +static void vmstat_update(struct work_struct *w) +{ + if (refresh_cpu_vm_stats()) + /* + * Counters were updated so we expect more updates + * to occur in the future. Keep on running the + * update worker thread. + */ + schedule_delayed_work(this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + else { + /* + * We did not update any counters so the app may be in + * a mode where it does not cause counter updates. + * We may be uselessly running vmstat_update. + * Defer the checking for differentials to the + * shepherd thread on a different processor. + */ + int r; + /* + * Shepherd work thread does not race since it never + * changes the bit if its zero but the cpu + * online / off line code may race if + * worker threads are still allowed during + * shutdown / startup. + */ + r = cpumask_test_and_set_cpu(smp_processor_id(), + cpu_stat_off); + VM_BUG_ON(r); + } +} + +/* + * Check if the diffs for a certain cpu indicate that + * an update is needed. + */ +static bool need_update(int cpu) +{ + struct zone *zone; + + for_each_populated_zone(zone) { + struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); + + BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); + /* + * The fast way of checking if there are any vmstat diffs. + * This works because the diffs are byte sized items. 
+ */ + if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) + return true; + + } + return false; +} + + +/* + * Shepherd worker thread that checks the + * differentials of processors that have their worker + * threads for vm statistics updates disabled because of + * inactivity. + */ +static void vmstat_shepherd(struct work_struct *w); + +static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); + +static void vmstat_shepherd(struct work_struct *w) +{ + int cpu; + + get_online_cpus(); + /* Check processors whose vmstat worker threads have been disabled */ + for_each_cpu(cpu, cpu_stat_off) + if (need_update(cpu) && + cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) + + schedule_delayed_work_on(cpu, + &per_cpu(vmstat_work, cpu), 0); + + put_online_cpus(); + + schedule_delayed_work(&shepherd, + round_jiffies_relative(sysctl_stat_interval)); + +} + +static void __init start_shepherd_timer(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu), + vmstat_update); + + if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) + BUG(); + cpumask_copy(cpu_stat_off, cpu_online_mask); + + schedule_delayed_work(&shepherd, + round_jiffies_relative(sysctl_stat_interval)); +} + +static void vmstat_cpu_dead(int node) +{ + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + if (cpu_to_node(cpu) == node) + goto end; + + node_clear_state(node, N_CPU); +end: + put_online_cpus(); +} + +/* + * Use the cpu notifier to insure that the thresholds are recalculated + * when necessary. 
+ */ +static int vmstat_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + refresh_zone_stat_thresholds(); + node_set_state(cpu_to_node(cpu), N_CPU); + cpumask_set_cpu(cpu, cpu_stat_off); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + cpumask_clear_cpu(cpu, cpu_stat_off); + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + cpumask_set_cpu(cpu, cpu_stat_off); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + refresh_zone_stat_thresholds(); + vmstat_cpu_dead(cpu_to_node(cpu)); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block vmstat_notifier = + { &vmstat_cpuup_callback, NULL, 0 }; +#endif + +static int __init setup_vmstat(void) +{ +#ifdef CONFIG_SMP + cpu_notifier_register_begin(); + __register_cpu_notifier(&vmstat_notifier); + + start_shepherd_timer(); + cpu_notifier_register_done(); +#endif +#ifdef CONFIG_PROC_FS + proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); + proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); + proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); + proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); +#endif + return 0; +} +module_init(setup_vmstat) + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) + +/* + * Return an index indicating how much of the available free memory is + * unusable for an allocation of the requested size. + */ +static int unusable_free_index(unsigned int order, + struct contig_page_info *info) +{ + /* No free memory is interpreted as all free memory is unusable */ + if (info->free_pages == 0) + return 1000; + + /* + * Index should be a value between 0 and 1. Return a value to 3 + * decimal places. 
+ * + * 0 => no fragmentation + * 1 => high fragmentation + */ + return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages); + +} + +static void unusable_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = unusable_free_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + } + + seq_putc(m, '\n'); +} + +/* + * Display unusable free space index + * + * The unusable free space index measures how much of the available free + * memory cannot be used to satisfy an allocation of a given size and is a + * value between 0 and 1. The higher the value, the more of free memory is + * unusable and by implication, the worse the external fragmentation is. This + * can be expressed as a percentage by multiplying by 100. 
+ */ +static int unusable_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + /* check memoryless node */ + if (!node_state(pgdat->node_id, N_MEMORY)) + return 0; + + walk_zones_in_node(m, pgdat, unusable_show_print); + + return 0; +} + +static const struct seq_operations unusable_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = unusable_show, +}; + +static int unusable_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &unusable_op); +} + +static const struct file_operations unusable_file_ops = { + .open = unusable_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void extfrag_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + + /* Alloc on stack as interrupts are disabled for zone walk */ + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = __fragmentation_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + } + + seq_putc(m, '\n'); +} + +/* + * Display fragmentation index for orders that allocations would fail for + */ +static int extfrag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + walk_zones_in_node(m, pgdat, extfrag_show_print); + + return 0; +} + +static const struct seq_operations extfrag_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = extfrag_show, +}; + +static int extfrag_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &extfrag_op); +} + +static const struct file_operations extfrag_file_ops = { + .open = extfrag_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init extfrag_debug_init(void) +{ + struct dentry *extfrag_debug_root; + + extfrag_debug_root = 
debugfs_create_dir("extfrag", NULL); + if (!extfrag_debug_root) + return -ENOMEM; + + if (!debugfs_create_file("unusable_index", 0444, + extfrag_debug_root, NULL, &unusable_file_ops)) + goto fail; + + if (!debugfs_create_file("extfrag_index", 0444, + extfrag_debug_root, NULL, &extfrag_file_ops)) + goto fail; + + return 0; +fail: + debugfs_remove_recursive(extfrag_debug_root); + return -ENOMEM; +} + +module_init(extfrag_debug_init); +#endif diff --git a/kernel/mm/workingset.c b/kernel/mm/workingset.c new file mode 100644 index 000000000..263d01947 --- /dev/null +++ b/kernel/mm/workingset.c @@ -0,0 +1,416 @@ +/* + * Workingset detection + * + * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Double CLOCK lists + * + * Per zone, two clock lists are maintained for file pages: the + * inactive and the active list. Freshly faulted pages start out at + * the head of the inactive list and page reclaim scans pages from the + * tail. Pages that are accessed multiple times on the inactive list + * are promoted to the active list, to protect them from reclaim, + * whereas active pages are demoted to the inactive list when the + * active list grows too big. + * + * fault ------------------------+ + * | + * +--------------+ | +-------------+ + * reclaim <- | inactive | <-+-- demotion | active | <--+ + * +--------------+ +-------------+ | + * | | + * +-------------- promotion ------------------+ + * + * + * Access frequency and refault distance + * + * A workload is thrashing when its pages are frequently used but they + * are evicted from the inactive list every time before another access + * would have promoted them to the active list. + * + * In cases where the average access distance between thrashing pages + * is bigger than the size of memory there is nothing that can be + * done - the thrashing set could never fit into memory under any + * circumstance. 
+ * + * However, the average access distance could be bigger than the + * inactive list, yet smaller than the size of memory. In this case, + * the set could fit into memory if it weren't for the currently + * active pages - which may be used more, hopefully less frequently: + * + * +-memory available to cache-+ + * | | + * +-inactive------+-active----+ + * a b | c d e f g h i | J K L M N | + * +---------------+-----------+ + * + * It is prohibitively expensive to accurately track access frequency + * of pages. But a reasonable approximation can be made to measure + * thrashing on the inactive list, after which refaulting pages can be + * activated optimistically to compete with the existing active pages. + * + * Approximating inactive page access frequency - Observations: + * + * 1. When a page is accessed for the first time, it is added to the + * head of the inactive list, slides every existing inactive page + * towards the tail by one slot, and pushes the current tail page + * out of memory. + * + * 2. When a page is accessed for the second time, it is promoted to + * the active list, shrinking the inactive list by one slot. This + * also slides all inactive pages that were faulted into the cache + * more recently than the activated page towards the tail of the + * inactive list. + * + * Thus: + * + * 1. The sum of evictions and activations between any two points in + * time indicates the minimum number of inactive pages accessed in + * between. + * + * 2. Moving one inactive page N page slots towards the tail of the + * list requires at least N inactive page accesses. + * + * Combining these: + * + * 1. When a page is finally evicted from memory, the number of + * inactive pages accessed while the page was in cache is at least + * the number of page slots on the inactive list. + * + * 2.
In addition, measuring the sum of evictions and activations (E) + * at the time of a page's eviction, and comparing it to another + * reading (R) at the time the page faults back into memory tells + * the minimum number of accesses while the page was not cached. + * This is called the refault distance. + * + * Because the first access of the page was the fault and the second + * access the refault, we combine the in-cache distance with the + * out-of-cache distance to get the complete minimum access distance + * of this page: + * + * NR_inactive + (R - E) + * + * And knowing the minimum access distance of a page, we can easily + * tell if the page would be able to stay in cache assuming all page + * slots in the cache were available: + * + * NR_inactive + (R - E) <= NR_inactive + NR_active + * + * which can be further simplified to + * + * (R - E) <= NR_active + * + * Put into words, the refault distance (out-of-cache) can be seen as + * a deficit in inactive list space (in-cache). If the inactive list + * had (R - E) more page slots, the page would not have been evicted + * in between accesses, but activated instead. And on a full system, + * the only thing eating into inactive list space is active pages. + * + * + * Activating refaulting pages + * + * All that is known about the active list is that the pages have been + * accessed more than once in the past. This means that at any given + * time there is actually a good chance that pages on the active list + * are no longer in active use. + * + * So when a refault distance of (R - E) is observed and there are at + * least (R - E) active pages, the refaulting page is activated + * optimistically in the hope that (R - E) active pages are actually + * used less frequently than the refaulting page - or even not used at + * all anymore. 
+ * + * If this is wrong and demotion kicks in, the pages which are truly + * used more frequently will be reactivated while the less frequently + * used ones will be evicted from memory. + * + * But if this is right, the stale pages will be pushed out of memory + * and the used pages get to stay in cache. + * + * + * Implementation + * + * For each zone's file LRU lists, a counter for inactive evictions + * and activations is maintained (zone->inactive_age). + * + * On eviction, a snapshot of this counter (along with some bits to + * identify the zone) is stored in the now empty page cache radix tree + * slot of the evicted page. This is called a shadow entry. + * + * On cache misses for which there are shadow entries, an eligible + * refault distance will immediately activate the refaulting page. + */ + +static void *pack_shadow(unsigned long eviction, struct zone *zone) +{ + eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); + eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); + eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); + + return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); +} + +static void unpack_shadow(void *shadow, + struct zone **zone, + unsigned long *distance) +{ + unsigned long entry = (unsigned long)shadow; + unsigned long eviction; + unsigned long refault; + unsigned long mask; + int zid, nid; + + entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + zid = entry & ((1UL << ZONES_SHIFT) - 1); + entry >>= ZONES_SHIFT; + nid = entry & ((1UL << NODES_SHIFT) - 1); + entry >>= NODES_SHIFT; + eviction = entry; + + *zone = NODE_DATA(nid)->node_zones + zid; + + refault = atomic_long_read(&(*zone)->inactive_age); + mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + + RADIX_TREE_EXCEPTIONAL_SHIFT); + /* + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases.
+ * + * There is a special case: usually, shadow entries have a + * short lifetime and are either refaulted or reclaimed along + * with the inode before they get too old. But it is not + * impossible for the inactive_age to lap a shadow entry in + * the field, which can then result in a false small + * refault distance, leading to a false activation should this + * old entry actually refault again. However, earlier kernels + * used to deactivate unconditionally with *every* reclaim + * invocation for the longest time, so the occasional + * inappropriate activation leading to pressure on the active + * list is not a problem. + */ + *distance = (refault - eviction) & mask; +} + +/** + * workingset_eviction - note the eviction of a page from memory + * @mapping: address space the page was backing + * @page: the page being evicted + * + * Returns a shadow entry to be stored in @mapping->page_tree in place + * of the evicted @page so that a later refault can be detected. + */ +void *workingset_eviction(struct address_space *mapping, struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long eviction; + + eviction = atomic_long_inc_return(&zone->inactive_age); + return pack_shadow(eviction, zone); +} + +/** + * workingset_refault - evaluate the refault of a previously evicted page + * @shadow: shadow entry of the evicted page + * + * Calculates and evaluates the refault distance of the previously + * evicted page in the context of the zone it was allocated in. + * + * Returns %true if the page should be activated, %false otherwise.
+ */ +bool workingset_refault(void *shadow) +{ + unsigned long refault_distance; + struct zone *zone; + + unpack_shadow(shadow, &zone, &refault_distance); + inc_zone_state(zone, WORKINGSET_REFAULT); + + if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { + inc_zone_state(zone, WORKINGSET_ACTIVATE); + return true; + } + return false; +} + +/** + * workingset_activation - note a page activation + * @page: page that is being activated + */ +void workingset_activation(struct page *page) +{ + atomic_long_inc(&page_zone(page)->inactive_age); +} + +/* + * Shadow entries reflect the share of the working set that does not + * fit into memory, so their number depends on the access pattern of + * the workload. In most cases, they will refault or get reclaimed + * along with the inode, but a (malicious) workload that streams + * through files with a total size several times that of available + * memory, while preventing the inodes from being reclaimed, can + * create excessive amounts of shadow nodes. To keep a lid on this, + * track shadow nodes and reclaim them when they grow way past the + * point where they would still be useful. + */ + +struct list_lru __workingset_shadow_nodes; +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock); + +static unsigned long count_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long shadow_nodes; + unsigned long max_nodes; + unsigned long pages; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_lock_irq(workingset_shadow_lock); + shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc); + local_unlock_irq(workingset_shadow_lock); + + pages = node_present_pages(sc->nid); + /* + * Active cache pages are limited to 50% of memory, and shadow + * entries that represent a refault distance bigger than that + * do not have any effect. 
Limit the number of shadow nodes + * such that shadow entries do not exceed the number of active + * cache pages, assuming a worst-case node population density + * of 1/8th on average. + * + * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * each, this will reclaim shadow entries when they consume + * ~2% of available memory: + * + * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + */ + max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + + if (shadow_nodes <= max_nodes) + return 0; + + return shadow_nodes - max_nodes; +} + +static enum lru_status shadow_lru_isolate(struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lru_lock, + void *arg) +{ + struct address_space *mapping; + struct radix_tree_node *node; + unsigned int i; + int ret; + + /* + * Page cache insertions and deletions synchroneously maintain + * the shadow node LRU under the mapping->tree_lock and the + * lru_lock. Because the page cache tree is emptied before + * the inode can be destroyed, holding the lru_lock pins any + * address_space that has radix tree nodes on the LRU. + * + * We can then safely transition to the mapping->tree_lock to + * pin only the address_space of the particular node we want + * to reclaim, take the node off-LRU, and drop the lru_lock. + */ + + node = container_of(item, struct radix_tree_node, private_list); + mapping = node->private_data; + + /* Coming from the list, invert the lock order */ + if (!spin_trylock(&mapping->tree_lock)) { + spin_unlock(lru_lock); + ret = LRU_RETRY; + goto out; + } + + list_lru_isolate(lru, item); + spin_unlock(lru_lock); + + /* + * The nodes should only contain one or more shadow entries, + * no pages, so we expect to be able to remove them all and + * delete and free the empty node afterwards. 
+ */ + + BUG_ON(!node->count); + BUG_ON(node->count & RADIX_TREE_COUNT_MASK); + + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[i]) { + BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + node->slots[i] = NULL; + BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; + BUG_ON(!mapping->nrshadows); + mapping->nrshadows--; + } + } + BUG_ON(node->count); + inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); + if (!__radix_tree_delete_node(&mapping->page_tree, node)) + BUG(); + + spin_unlock(&mapping->tree_lock); + ret = LRU_REMOVED_RETRY; +out: + local_unlock_irq(workingset_shadow_lock); + cond_resched(); + local_lock_irq(workingset_shadow_lock); + spin_lock(lru_lock); + return ret; +} + +static unsigned long scan_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long ret; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_lock_irq(workingset_shadow_lock); + ret = list_lru_shrink_walk(&__workingset_shadow_nodes, sc, + shadow_lru_isolate, NULL); + local_unlock_irq(workingset_shadow_lock); + return ret; +} + +static struct shrinker workingset_shadow_shrinker = { + .count_objects = count_shadow_nodes, + .scan_objects = scan_shadow_nodes, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + +/* + * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe + * mapping->tree_lock. 
+ */ +static struct lock_class_key shadow_nodes_key; + +static int __init workingset_init(void) +{ + int ret; + + ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key); + if (ret) + goto err; + ret = register_shrinker(&workingset_shadow_shrinker); + if (ret) + goto err_list_lru; + return 0; +err_list_lru: + list_lru_destroy(&__workingset_shadow_nodes); +err: + return ret; +} +module_init(workingset_init); diff --git a/kernel/mm/zbud.c b/kernel/mm/zbud.c new file mode 100644 index 000000000..2ee4e4520 --- /dev/null +++ b/kernel/mm/zbud.c @@ -0,0 +1,624 @@ +/* + * zbud.c + * + * Copyright (C) 2013, Seth Jennings, IBM + * + * Concepts based on zcache internal zbud allocator by Dan Magenheimer. + * + * zbud is a special purpose allocator for storing compressed pages. Contrary + * to what its name may suggest, zbud is not a buddy allocator, but rather an + * allocator that "buddies" two compressed pages together in a single memory + * page. + * + * While this design limits storage density, it has simple and deterministic + * reclaim properties that make it preferable to a higher density approach when + * reclaim will be used. + * + * zbud works by storing compressed pages, or "zpages", together in pairs in a + * single memory page called a "zbud page". The first buddy is "left + * justified" at the beginning of the zbud page, and the last buddy is "right + * justified" at the end of the zbud page. The benefit is that if either + * buddy is freed, the freed buddy space, coalesced with whatever slack space + * that existed between the buddies, results in the largest possible free region + * within the zbud page. + * + * zbud also provides an attractive lower bound on density. The ratio of zpages + * to zbud pages can not be less than 1. This ensures that zbud can never "do + * harm" by using more pages to store zpages than the uncompressed zpages would + * have used on their own. + * + * zbud pages are divided into "chunks".
The size of the chunks is fixed at + * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages + * into chunks allows organizing unbuddied zbud pages into a manageable number + * of unbuddied lists according to the number of free chunks available in the + * zbud page. + * + * The zbud API differs from that of conventional allocators in that the + * allocation function, zbud_alloc(), returns an opaque handle to the user, + * not a dereferenceable pointer. The user must map the handle using + * zbud_map() in order to get a usable pointer by which to access the + * allocation data and unmap the handle with zbud_unmap() when operations + * on the allocation data are complete. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/***************** + * Structures +*****************/ +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk + * in allocated page is occupied by zbud header, NCHUNKS will be calculated to + * 63 which shows the max number of free chunks in zbud page, also there will be + * 63 freelists per pool. + */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED CHUNK_SIZE +#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) + +/** + * struct zbud_pool - stores metadata for each zbud pool + * @lock: protects all pool fields and first|last_chunk fields of any + * zbud page in the pool + * @unbuddied: array of lists tracking zbud pages that only contain one buddy; + * the lists each zbud page is added to depends on the size of + * its free region. 
+ * @buddied: list tracking the zbud pages that contain two buddies; + * these zbud pages are full + * @lru: list tracking the zbud pages in LRU order by most recently + * added buddy. + * @pages_nr: number of zbud pages in the pool. + * @ops: pointer to a structure of user defined operations specified at + * pool creation time. + * + * This structure is allocated at pool creation time and maintains metadata + * pertaining to a particular zbud pool. + */ +struct zbud_pool { + spinlock_t lock; + struct list_head unbuddied[NCHUNKS]; + struct list_head buddied; + struct list_head lru; + u64 pages_nr; + struct zbud_ops *ops; +}; + +/* + * struct zbud_header - zbud page metadata occupying the first chunk of each + * zbud page. + * @buddy: links the zbud page into the unbuddied/buddied lists in the pool + * @lru: links the zbud page into the lru list in the pool + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + */ +struct zbud_header { + struct list_head buddy; + struct list_head lru; + unsigned int first_chunks; + unsigned int last_chunks; + bool under_reclaim; +}; + +/***************** + * zpool + ****************/ + +#ifdef CONFIG_ZPOOL + +static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) +{ + return zpool_evict(pool, handle); +} + +static struct zbud_ops zbud_zpool_ops = { + .evict = zbud_zpool_evict +}; + +static void *zbud_zpool_create(char *name, gfp_t gfp, + struct zpool_ops *zpool_ops) +{ + return zbud_create_pool(gfp, zpool_ops ? 
&zbud_zpool_ops : NULL); +} + +static void zbud_zpool_destroy(void *pool) +{ + zbud_destroy_pool(pool); +} + +static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zbud_alloc(pool, size, gfp, handle); +} +static void zbud_zpool_free(void *pool, unsigned long handle) +{ + zbud_free(pool, handle); +} + +static int zbud_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zbud_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *zbud_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return zbud_map(pool, handle); +} +static void zbud_zpool_unmap(void *pool, unsigned long handle) +{ + zbud_unmap(pool, handle); +} + +static u64 zbud_zpool_total_size(void *pool) +{ + return zbud_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, + .malloc = zbud_zpool_malloc, + .free = zbud_zpool_free, + .shrink = zbud_zpool_shrink, + .map = zbud_zpool_map, + .unmap = zbud_zpool_unmap, + .total_size = zbud_zpool_total_size, +}; + +MODULE_ALIAS("zpool-zbud"); +#endif /* CONFIG_ZPOOL */ + +/***************** + * Helpers +*****************/ +/* Just to make the code easier to read */ +enum buddy { + FIRST, + LAST +}; + +/* Converts an allocation size in bytes to size in zbud chunks */ +static int size_to_chunks(size_t size) +{ + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +#define for_each_unbuddied_list(_iter, _begin) \ + for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) + +/* Initializes the zbud header of a newly allocated zbud page */ +static struct zbud_header *init_zbud_page(struct page *page) +{ + struct zbud_header *zhdr = page_address(page); + zhdr->first_chunks = 0; + 
zhdr->last_chunks = 0; + INIT_LIST_HEAD(&zhdr->buddy); + INIT_LIST_HEAD(&zhdr->lru); + zhdr->under_reclaim = 0; + return zhdr; +} + +/* Resets the struct page fields and frees the page */ +static void free_zbud_page(struct zbud_header *zhdr) +{ + __free_page(virt_to_page(zhdr)); +} + +/* + * Encodes the handle of a particular buddy within a zbud page + * Pool lock should be held as this function accesses first|last_chunks + */ +static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud) +{ + unsigned long handle; + + /* + * For now, the encoded handle is actually just the pointer to the data + * but this might not always be the case. A little information hiding. + * Add CHUNK_SIZE to the handle if it is the first allocation to jump + * over the zbud header in the first chunk. + */ + handle = (unsigned long)zhdr; + if (bud == FIRST) + /* skip over zbud header */ + handle += ZHDR_SIZE_ALIGNED; + else /* bud == LAST */ + handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + return handle; +} + +/* Returns the zbud page where a given handle is stored */ +static struct zbud_header *handle_to_zbud_header(unsigned long handle) +{ + return (struct zbud_header *)(handle & PAGE_MASK); +} + +/* Returns the number of free chunks in a zbud page */ +static int num_free_chunks(struct zbud_header *zhdr) +{ + /* + * Rather than branch for different situations, just use the fact that + * free buddies have a length of zero to simplify everything. + */ + return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; +} + +/***************** + * API Functions +*****************/ +/** + * zbud_create_pool() - create a new zbud pool + * @gfp: gfp flags when allocating the zbud pool structure + * @ops: user-defined operations for the zbud pool + * + * Return: pointer to the new zbud pool or NULL if the metadata allocation + * failed. 
+ */ +struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) +{ + struct zbud_pool *pool; + int i; + + pool = kmalloc(sizeof(struct zbud_pool), gfp); + if (!pool) + return NULL; + spin_lock_init(&pool->lock); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&pool->unbuddied[i]); + INIT_LIST_HEAD(&pool->buddied); + INIT_LIST_HEAD(&pool->lru); + pool->pages_nr = 0; + pool->ops = ops; + return pool; +} + +/** + * zbud_destroy_pool() - destroys an existing zbud pool + * @pool: the zbud pool to be destroyed + * + * The pool should be emptied before this function is called. + */ +void zbud_destroy_pool(struct zbud_pool *pool) +{ + kfree(pool); +} + +/** + * zbud_alloc() - allocates a region of a given size + * @pool: zbud pool from which to allocate + * @size: size in bytes of the desired allocation + * @gfp: gfp flags used if the pool needs to grow + * @handle: handle of the new allocation + * + * This function will attempt to find a free region in the pool large enough to + * satisfy the allocation request. A search of the unbuddied lists is + * performed first. If no suitable free region is found, then a new page is + * allocated and added to the pool to satisfy the request. + * + * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used + * as zbud pool pages. + * + * Return: 0 if success and handle is set, otherwise -EINVAL if the size or + * gfp arguments are invalid, -ENOSPC if the size is too large for a zbud + * page, or -ENOMEM if the pool was unable to allocate a new page. + */ +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + int chunks, i, freechunks; + struct zbud_header *zhdr = NULL; + enum buddy bud; + struct page *page; + + if (!size || (gfp & __GFP_HIGHMEM)) + return -EINVAL; + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) + return -ENOSPC; + chunks = size_to_chunks(size); + spin_lock(&pool->lock); + + /* First, try to find an unbuddied zbud page.
*/ + zhdr = NULL; + for_each_unbuddied_list(i, chunks) { + if (!list_empty(&pool->unbuddied[i])) { + zhdr = list_first_entry(&pool->unbuddied[i], + struct zbud_header, buddy); + list_del(&zhdr->buddy); + if (zhdr->first_chunks == 0) + bud = FIRST; + else + bud = LAST; + goto found; + } + } + + /* Couldn't find unbuddied zbud page, create new one */ + spin_unlock(&pool->lock); + page = alloc_page(gfp); + if (!page) + return -ENOMEM; + spin_lock(&pool->lock); + pool->pages_nr++; + zhdr = init_zbud_page(page); + bud = FIRST; + +found: + if (bud == FIRST) + zhdr->first_chunks = chunks; + else + zhdr->last_chunks = chunks; + + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* Add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* Add/move zbud page to beginning of LRU */ + if (!list_empty(&zhdr->lru)) + list_del(&zhdr->lru); + list_add(&zhdr->lru, &pool->lru); + + *handle = encode_handle(zhdr, bud); + spin_unlock(&pool->lock); + + return 0; +} + +/** + * zbud_free() - frees the allocation associated with the given handle + * @pool: pool in which the allocation resided + * @handle: handle associated with the allocation returned by zbud_alloc() + * + * In the case that the zbud page in which the allocation resides is under + * reclaim, as indicated by zhdr->under_reclaim being set, this function + * only sets the first|last_chunks to 0. The page is actually freed + * once both buddies are evicted (see zbud_reclaim_page() below).
+ */ +void zbud_free(struct zbud_pool *pool, unsigned long handle) +{ + struct zbud_header *zhdr; + int freechunks; + + spin_lock(&pool->lock); + zhdr = handle_to_zbud_header(handle); + + /* If first buddy, handle minus the zbud header size is page aligned */ + if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) + zhdr->last_chunks = 0; + else + zhdr->first_chunks = 0; + + if (zhdr->under_reclaim) { + /* zbud page is under reclaim, reclaim will free */ + spin_unlock(&pool->lock); + return; + } + + /* Remove from existing buddy list */ + list_del(&zhdr->buddy); + + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* zbud page is empty, free */ + list_del(&zhdr->lru); + free_zbud_page(zhdr); + pool->pages_nr--; + } else { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } + + spin_unlock(&pool->lock); +} + +#define list_tail_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +/** + * zbud_reclaim_page() - evicts allocations from a pool page and frees it + * @pool: pool from which a page will attempt to be evicted + * @retries: number of pages on the LRU list for which eviction will + * be attempted before failing + * + * zbud reclaim is different from normal system reclaim in that the reclaim is + * done from the bottom, up. This is because only the bottom layer, zbud, has + * information on how the allocations are organized within each zbud page. This + * has the potential to create interesting locking situations between zbud and + * the user, however. + * + * To avoid these, this is how zbud_reclaim_page() should be called: + * + * The user detects a page should be reclaimed and calls zbud_reclaim_page(). + * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call + * the user-defined eviction handler with the pool and handle as arguments. + * + * If the handle can not be evicted, the eviction handler should return + * non-zero.
zbud_reclaim_page() will add the zbud page back to the + * appropriate list and try the next zbud page on the LRU up to + * a user defined number of retries. + * + * If the handle is successfully evicted, the eviction handler should + * return 0 _and_ should have called zbud_free() on the handle. zbud_free() + * contains logic to delay freeing the page if the page is under reclaim, + * as indicated by the under_reclaim flag in the zbud header. + * + * If all buddies in the zbud page are successfully evicted, then the + * zbud page can be freed. + * + * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are + * no pages to evict, no eviction handler is registered or @retries is 0, + * -EAGAIN if the retry limit was hit. + */ +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) +{ + int i, ret, freechunks; + struct zbud_header *zhdr; + unsigned long first_handle = 0, last_handle = 0; + + spin_lock(&pool->lock); + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || + retries == 0) { + spin_unlock(&pool->lock); + return -EINVAL; + } + for (i = 0; i < retries; i++) { + zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru); + list_del(&zhdr->lru); + list_del(&zhdr->buddy); + /* Protect zbud page against free */ + zhdr->under_reclaim = true; + /* + * We need to encode the handles before unlocking, since we can + * race with free that will set (first|last)_chunks to 0 + */ + first_handle = 0; + last_handle = 0; + if (zhdr->first_chunks) + first_handle = encode_handle(zhdr, FIRST); + if (zhdr->last_chunks) + last_handle = encode_handle(zhdr, LAST); + spin_unlock(&pool->lock); + + /* Issue the eviction callback(s) */ + if (first_handle) { + ret = pool->ops->evict(pool, first_handle); + if (ret) + goto next; + } + if (last_handle) { + ret = pool->ops->evict(pool, last_handle); + if (ret) + goto next; + } +next: + spin_lock(&pool->lock); + zhdr->under_reclaim = false; + if (zhdr->first_chunks == 0 && zhdr->last_chunks ==
0) { + /* + * Both buddies are now free, free the zbud page and + * return success. + */ + free_zbud_page(zhdr); + pool->pages_nr--; + spin_unlock(&pool->lock); + return 0; + } else if (zhdr->first_chunks == 0 || + zhdr->last_chunks == 0) { + /* add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* add to beginning of LRU */ + list_add(&zhdr->lru, &pool->lru); + } + spin_unlock(&pool->lock); + return -EAGAIN; +} + +/** + * zbud_map() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be mapped + * + * While trivial for zbud, the mapping functions for other allocators + * implementing this allocation API could have more complex information encoded + * in the handle and could create temporary mappings to make the data + * accessible to the user. + * + * Returns: a pointer to the mapped allocation + */ +void *zbud_map(struct zbud_pool *pool, unsigned long handle) +{ + return (void *)(handle); +} + +/** + * zbud_unmap() - unmaps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be unmapped + */ +void zbud_unmap(struct zbud_pool *pool, unsigned long handle) +{ +} + +/** + * zbud_get_pool_size() - gets the zbud pool size in pages + * @pool: pool whose size is being queried + * + * Returns: size in pages of the given pool. The pool lock need not be + * taken to access pages_nr.
+ */ +u64 zbud_get_pool_size(struct zbud_pool *pool) +{ + return pool->pages_nr; +} + +static int __init init_zbud(void) +{ + /* Make sure the zbud header will fit in one chunk */ + BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); + pr_info("loaded\n"); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zbud_zpool_driver); +#endif + + return 0; +} + +static void __exit exit_zbud(void) +{ +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zbud_zpool_driver); +#endif + + pr_info("unloaded\n"); +} + +module_init(init_zbud); +module_exit(exit_zbud); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Seth Jennings "); +MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages"); diff --git a/kernel/mm/zpool.c b/kernel/mm/zpool.c new file mode 100644 index 000000000..bacdab6e4 --- /dev/null +++ b/kernel/mm/zpool.c @@ -0,0 +1,366 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for memory storage pool implementations. + * Typically, this is used to store compressed memory. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +struct zpool { + char *type; + + struct zpool_driver *driver; + void *pool; + struct zpool_ops *ops; + + struct list_head list; +}; + +static LIST_HEAD(drivers_head); +static DEFINE_SPINLOCK(drivers_lock); + +static LIST_HEAD(pools_head); +static DEFINE_SPINLOCK(pools_lock); + +/** + * zpool_register_driver() - register a zpool implementation. + * @driver: driver to register + */ +void zpool_register_driver(struct zpool_driver *driver) +{ + spin_lock(&drivers_lock); + atomic_set(&driver->refcount, 0); + list_add(&driver->list, &drivers_head); + spin_unlock(&drivers_lock); +} +EXPORT_SYMBOL(zpool_register_driver); + +/** + * zpool_unregister_driver() - unregister a zpool implementation. + * @driver: driver to unregister. 
+ * + * Module usage counting is used to prevent using a driver + * while/after unloading, so if this is called from module + * exit function, this should never fail; if called from + * other than the module exit function, and this returns + * failure, the driver is in use and must remain available. + */ +int zpool_unregister_driver(struct zpool_driver *driver) +{ + int ret = 0, refcount; + + spin_lock(&drivers_lock); + refcount = atomic_read(&driver->refcount); + WARN_ON(refcount < 0); + if (refcount > 0) + ret = -EBUSY; + else + list_del(&driver->list); + spin_unlock(&drivers_lock); + + return ret; +} +EXPORT_SYMBOL(zpool_unregister_driver); + +/** + * zpool_evict() - evict callback from a zpool implementation. + * @pool: pool to evict from. + * @handle: handle to evict. + * + * This can be used by zpool implementations to call the + * user's evict zpool_ops struct evict callback. + */ +int zpool_evict(void *pool, unsigned long handle) +{ + struct zpool *zpool; + + spin_lock(&pools_lock); + list_for_each_entry(zpool, &pools_head, list) { + if (zpool->pool == pool) { + spin_unlock(&pools_lock); + if (!zpool->ops || !zpool->ops->evict) + return -EINVAL; + return zpool->ops->evict(zpool, handle); + } + } + spin_unlock(&pools_lock); + + return -ENOENT; +} +EXPORT_SYMBOL(zpool_evict); + +static struct zpool_driver *zpool_get_driver(char *type) +{ + struct zpool_driver *driver; + + spin_lock(&drivers_lock); + list_for_each_entry(driver, &drivers_head, list) { + if (!strcmp(driver->type, type)) { + bool got = try_module_get(driver->owner); + + if (got) + atomic_inc(&driver->refcount); + spin_unlock(&drivers_lock); + return got ? driver : NULL; + } + } + + spin_unlock(&drivers_lock); + return NULL; +} + +static void zpool_put_driver(struct zpool_driver *driver) +{ + atomic_dec(&driver->refcount); + module_put(driver->owner); +} + +/** + * zpool_create_pool() - Create a new zpool + * @type The type of the zpool to create (e.g. 
zbud, zsmalloc) + * @name The name of the zpool (e.g. zram0, zswap) + * @gfp The GFP flags to use when allocating the pool. + * @ops The optional ops callback. + * + * This creates a new zpool of the specified type. The gfp flags will be + * used when allocating memory, if the implementation supports it. If the + * ops param is NULL, then the created zpool will not be shrinkable. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: New zpool on success, NULL on failure. + */ +struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, + struct zpool_ops *ops) +{ + struct zpool_driver *driver; + struct zpool *zpool; + + pr_info("creating pool type %s\n", type); + + driver = zpool_get_driver(type); + + if (!driver) { + request_module("zpool-%s", type); + driver = zpool_get_driver(type); + } + + if (!driver) { + pr_err("no driver for type %s\n", type); + return NULL; + } + + zpool = kmalloc(sizeof(*zpool), gfp); + if (!zpool) { + pr_err("couldn't create zpool - out of memory\n"); + zpool_put_driver(driver); + return NULL; + } + + zpool->type = driver->type; + zpool->driver = driver; + zpool->pool = driver->create(name, gfp, ops); + zpool->ops = ops; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); + zpool_put_driver(driver); + kfree(zpool); + return NULL; + } + + pr_info("created %s pool\n", type); + + spin_lock(&pools_lock); + list_add(&zpool->list, &pools_head); + spin_unlock(&pools_lock); + + return zpool; +} + +/** + * zpool_destroy_pool() - Destroy a zpool + * @pool The zpool to destroy. + * + * Implementations must guarantee this to be thread-safe, + * however only when destroying different pools. The same + * pool should only be destroyed once, and should not be used + * after it is destroyed. + * + * This destroys an existing zpool. The zpool should not be in use. 
+ */ +void zpool_destroy_pool(struct zpool *zpool) +{ + pr_info("destroying pool type %s\n", zpool->type); + + spin_lock(&pools_lock); + list_del(&zpool->list); + spin_unlock(&pools_lock); + zpool->driver->destroy(zpool->pool); + zpool_put_driver(zpool->driver); + kfree(zpool); +} + +/** + * zpool_get_type() - Get the type of the zpool + * @pool The zpool to check + * + * This returns the type of the pool. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: The type of zpool. + */ +char *zpool_get_type(struct zpool *zpool) +{ + return zpool->type; +} + +/** + * zpool_malloc() - Allocate memory + * @pool The zpool to allocate from. + * @size The amount of memory to allocate. + * @gfp The GFP flags to use when allocating memory. + * @handle Pointer to the handle to set + * + * This allocates the requested amount of memory from the pool. + * The gfp flags will be used when allocating memory, if the + * implementation supports it. The provided @handle will be + * set to the allocated object handle. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error. + */ +int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zpool->driver->malloc(zpool->pool, size, gfp, handle); +} + +/** + * zpool_free() - Free previously allocated memory + * @pool The zpool that allocated the memory. + * @handle The handle to the memory to free. + * + * This frees previously allocated memory. This does not guarantee + * that the pool will actually free memory, only that the memory + * in the pool will become available for use by the pool. + * + * Implementations must guarantee this to be thread-safe, + * however only when freeing different handles. The same + * handle should only be freed once, and should not be used + * after freeing. 
+ */ +void zpool_free(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->free(zpool->pool, handle); +} + +/** + * zpool_shrink() - Shrink the pool size + * @pool The zpool to shrink. + * @pages The number of pages to shrink the pool. + * @reclaimed The number of pages successfully evicted. + * + * This attempts to shrink the actual memory size of the pool + * by evicting currently used handle(s). If the pool was + * created with no zpool_ops, or the evict call fails for any + * of the handles, this will fail. If non-NULL, the @reclaimed + * parameter will be set to the number of pages reclaimed, + * which may be more than the number of pages requested. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error/failure. + */ +int zpool_shrink(struct zpool *zpool, unsigned int pages, + unsigned int *reclaimed) +{ + return zpool->driver->shrink(zpool->pool, pages, reclaimed); +} + +/** + * zpool_map_handle() - Map a previously allocated handle into memory + * @pool The zpool that the handle was allocated from + * @handle The handle to map + * @mapmode How the memory should be mapped + * + * This maps a previously allocated handle into memory. The @mapmode + * param indicates to the implementation how the memory will be + * used, i.e. read-only, write-only, read-write. If the + * implementation does not support it, the memory will be treated + * as read-write. + * + * This may hold locks, disable interrupts, and/or preemption, + * and the zpool_unmap_handle() must be called to undo those + * actions. The code that uses the mapped handle should complete + * its operations on the mapped handle memory quickly and unmap + * as soon as possible. As the implementation may use per-cpu + * data, multiple handles should not be mapped concurrently on + * any cpu. + * + * Returns: A pointer to the handle's mapped memory area.
+ */ +void *zpool_map_handle(struct zpool *zpool, unsigned long handle, + enum zpool_mapmode mapmode) +{ + return zpool->driver->map(zpool->pool, handle, mapmode); +} + +/** + * zpool_unmap_handle() - Unmap a previously mapped handle + * @pool The zpool that the handle was allocated from + * @handle The handle to unmap + * + * This unmaps a previously mapped handle. Any locks or other + * actions that the implementation took in zpool_map_handle() + * will be undone here. The memory area returned from + * zpool_map_handle() should no longer be used after this. + */ +void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->unmap(zpool->pool, handle); +} + +/** + * zpool_get_total_size() - The total size of the pool + * @pool The zpool to check + * + * This returns the total size in bytes of the pool. + * + * Returns: Total size of the zpool in bytes. + */ +u64 zpool_get_total_size(struct zpool *zpool) +{ + return zpool->driver->total_size(zpool->pool); +} + +static int __init init_zpool(void) +{ + pr_info("loaded\n"); + return 0; +} + +static void __exit exit_zpool(void) +{ + pr_info("unloaded\n"); +} + +module_init(init_zpool); +module_exit(exit_zpool); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Streetman "); +MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/kernel/mm/zsmalloc.c b/kernel/mm/zsmalloc.c new file mode 100644 index 000000000..a8b5e749e --- /dev/null +++ b/kernel/mm/zsmalloc.c @@ -0,0 +1,1947 @@ +/* + * zsmalloc memory allocator + * + * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim + * + * This code is released using a dual license strategy: BSD/GPL + * You can choose the license that better fits your requirements. + * + * Released under the terms of 3-clause BSD License + * Released under the terms of GNU General Public License Version 2.0 + */ + +/* + * Following is how we use various fields and flags of underlying + * struct page(s) to form a zspage.
+ * + * Usage of struct page fields: + * page->first_page: points to the first component (0-order) page + * page->index (union with page->freelist): offset of the first object + * starting in this page. For the first page, this is + * always 0, so we use this field (aka freelist) to point + * to the first free object in zspage. + * page->lru: links together all component pages (except the first page) + * of a zspage + * + * For _first_ page only: + * + * page->private (union with page->first_page): refers to the + * component page after the first page + * If the page is first_page for huge object, it stores handle. + * Look at size_class->huge. + * page->freelist: points to the first free object in zspage. + * Free objects are linked together using in-place + * metadata. + * page->objects: maximum number of objects we can store in this + * zspage (class->zspage_order * PAGE_SIZE / class->size) + * page->lru: links together first pages of various zspages. + * Basically forming list of zspages in a fullness group. + * page->mapping: class index and fullness group of the zspage + * + * Usage of struct page flags: + * PG_private: identifies the first component page + * PG_private2: identifies the last component page + * + */ + +#ifdef CONFIG_ZSMALLOC_DEBUG +#define DEBUG +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This must be power of 2 and greater than or equal to sizeof(link_free). + * These two conditions ensure that any 'struct link_free' itself doesn't + * span more than 1 page which avoids complex case of mapping 2 pages simply + * to restore link_free pointer values. + */ +#define ZS_ALIGN 8 + +/* + * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) + * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
+ */ +#define ZS_MAX_ZSPAGE_ORDER 2 +#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) + +#define ZS_HANDLE_SIZE (sizeof(unsigned long)) + +/* + * Object location (<PFN>, <obj_idx>) is encoded as + * a single (unsigned long) handle value. + * + * Note that object index is relative to system + * page it is stored in, so for each sub-page belonging + * to a zspage, obj_idx starts with 0. + * + * This is made more complicated by various memory models and PAE. + */ + +#ifndef MAX_PHYSMEM_BITS +#ifdef CONFIG_HIGHMEM64G +#define MAX_PHYSMEM_BITS 36 +#else /* !CONFIG_HIGHMEM64G */ +/* + * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just + * be PAGE_SHIFT + */ +#define MAX_PHYSMEM_BITS BITS_PER_LONG +#endif +#endif +#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) + +/* + * Memory allocated for a handle keeps the object position by + * encoding <page, obj_idx>, and the encoded value has room + * in its least bit (i.e., look at obj_to_location). + * We use the bit to synchronize between object access by + * user and migration. + */ +#define HANDLE_PIN_BIT 0 + +/* + * Head in allocated object should have OBJ_ALLOCATED_TAG + * to identify the object was allocated or not. + * It's okay to add the status bit in the least bit because + * header keeps handle which is 4byte-aligned address so we + * have room for two bits at least. + */ +#define OBJ_ALLOCATED_TAG 1 +#define OBJ_TAG_BITS 1 +#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) +#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) + +#define MAX(a, b) ((a) >= (b) ? (a) : (b)) +/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ +#define ZS_MIN_ALLOC_SIZE \ + MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) +/* each chunk includes extra space to keep handle */ +#define ZS_MAX_ALLOC_SIZE PAGE_SIZE + +/* + * On systems with 4K page size, this gives 255 size classes!
There is a + * trade-off here: + * - Large number of size classes is potentially wasteful as free pages are + * spread across these classes + * - Small number of size classes causes large internal fragmentation + * - Probably it's better to use specific size classes (empirically + * determined). NOTE: all those class sizes must be set as multiple of + * ZS_ALIGN to make sure link_free itself never has to span 2 pages. + * + * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN + * (reason above) + */ +#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) + +/* + * We do not maintain any list for completely empty or full pages + */ +enum fullness_group { + ZS_ALMOST_FULL, + ZS_ALMOST_EMPTY, + _ZS_NR_FULLNESS_GROUPS, + + ZS_EMPTY, + ZS_FULL +}; + +enum zs_stat_type { + OBJ_ALLOCATED, + OBJ_USED, + CLASS_ALMOST_FULL, + CLASS_ALMOST_EMPTY, + NR_ZS_STAT_TYPE, +}; + +#ifdef CONFIG_ZSMALLOC_STAT + +static struct dentry *zs_stat_root; + +struct zs_size_stat { + unsigned long objs[NR_ZS_STAT_TYPE]; +}; + +#endif + +/* + * number of size_classes + */ +static int zs_size_classes; + +/* + * We assign a page to ZS_ALMOST_EMPTY fullness group when: + * n <= 3 * N / f, where + * n = number of allocated objects + * N = total number of objects zspage can store + * f = fullness_threshold_frac + * + * Similarly, we assign zspage to: + * ZS_ALMOST_FULL when n > 3 * N / f + * ZS_EMPTY when n == 0 + * ZS_FULL when n == N + * + * (see: fix_fullness_group()) + */ +static const int fullness_threshold_frac = 4; + +struct size_class { + /* + * Size of objects stored in this class. Must be multiple + * of ZS_ALIGN.
+ */ + int size; + unsigned int index; + + /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ + int pages_per_zspage; + /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ + bool huge; + +#ifdef CONFIG_ZSMALLOC_STAT + struct zs_size_stat stats; +#endif + + spinlock_t lock; + + struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; +}; + +/* + * Placed within free objects to form a singly linked list. + * For every zspage, first_page->freelist gives head of this list. + * + * This must be power of 2 and less than or equal to ZS_ALIGN + */ +struct link_free { + union { + /* + * Position of next free chunk (encodes <PFN, obj_idx>) + * It's valid for non-allocated object + */ + void *next; + /* + * Handle of allocated object. + */ + unsigned long handle; + }; +}; + +struct zs_pool { + char *name; + + struct size_class **size_class; + struct kmem_cache *handle_cachep; + + gfp_t flags; /* allocation flags used when growing pool */ + atomic_long_t pages_allocated; + +#ifdef CONFIG_ZSMALLOC_STAT + struct dentry *stat_dentry; +#endif +}; + +/* + * A zspage's class index and fullness group + * are encoded in its (first)page->mapping + */ +#define CLASS_IDX_BITS 28 +#define FULLNESS_BITS 4 +#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) +#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) + +struct mapping_area { +#ifdef CONFIG_PGTABLE_MAPPING + struct vm_struct *vm; /* vm area for mapping object that span pages */ +#else + char *vm_buf; /* copy buffer for objects that span pages */ +#endif + char *vm_addr; /* address of kmap_atomic()'ed pages */ + enum zs_mapmode vm_mm; /* mapping mode */ + bool huge; +}; + +static int create_handle_cache(struct zs_pool *pool) +{ + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, + 0, 0, NULL); + return pool->handle_cachep ?
0 : 1; +} + +static void destroy_handle_cache(struct zs_pool *pool) +{ + if (pool->handle_cachep) + kmem_cache_destroy(pool->handle_cachep); +} + +static unsigned long alloc_handle(struct zs_pool *pool) +{ + return (unsigned long)kmem_cache_alloc(pool->handle_cachep, + pool->flags & ~__GFP_HIGHMEM); +} + +static void free_handle(struct zs_pool *pool, unsigned long handle) +{ + kmem_cache_free(pool->handle_cachep, (void *)handle); +} + +static void record_obj(unsigned long handle, unsigned long obj) +{ + *(unsigned long *)handle = obj; +} + +/* zpool driver */ + +#ifdef CONFIG_ZPOOL + +static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zs_create_pool(name, gfp); +} + +static void zs_zpool_destroy(void *pool) +{ + zs_destroy_pool(pool); +} + +static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + *handle = zs_malloc(pool, size); + return *handle ? 0 : -1; +} +static void zs_zpool_free(void *pool, unsigned long handle) +{ + zs_free(pool, handle); +} + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + return -EINVAL; +} + +static void *zs_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + enum zs_mapmode zs_mm; + + switch (mm) { + case ZPOOL_MM_RO: + zs_mm = ZS_MM_RO; + break; + case ZPOOL_MM_WO: + zs_mm = ZS_MM_WO; + break; + case ZPOOL_MM_RW: /* fallthru */ + default: + zs_mm = ZS_MM_RW; + break; + } + + return zs_map_object(pool, handle, zs_mm); +} +static void zs_zpool_unmap(void *pool, unsigned long handle) +{ + zs_unmap_object(pool, handle); +} + +static u64 zs_zpool_total_size(void *pool) +{ + return zs_get_total_pages(pool) << PAGE_SHIFT; +} + +static struct zpool_driver zs_zpool_driver = { + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .shrink = zs_zpool_shrink, + .map = zs_zpool_map, + .unmap = 
zs_zpool_unmap, + .total_size = zs_zpool_total_size, +}; + +MODULE_ALIAS("zpool-zsmalloc"); +#endif /* CONFIG_ZPOOL */ + +static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) +{ + return pages_per_zspage * PAGE_SIZE / size; +} + +/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ +static DEFINE_PER_CPU(struct mapping_area, zs_map_area); + +static int is_first_page(struct page *page) +{ + return PagePrivate(page); +} + +static int is_last_page(struct page *page) +{ + return PagePrivate2(page); +} + +static void get_zspage_mapping(struct page *page, unsigned int *class_idx, + enum fullness_group *fullness) +{ + unsigned long m; + BUG_ON(!is_first_page(page)); + + m = (unsigned long)page->mapping; + *fullness = m & FULLNESS_MASK; + *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; +} + +static void set_zspage_mapping(struct page *page, unsigned int class_idx, + enum fullness_group fullness) +{ + unsigned long m; + BUG_ON(!is_first_page(page)); + + m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | + (fullness & FULLNESS_MASK); + page->mapping = (struct address_space *)m; +} + +/* + * zsmalloc divides the pool into various size classes where each + * class maintains a list of zspages where each zspage is divided + * into equal sized chunks. Each allocation falls into one of these + * classes depending on its size. This function returns index of the + * size class which has chunk size big enough to hold the given size.
+ */ +static int get_size_class_index(int size) +{ + int idx = 0; + + if (likely(size > ZS_MIN_ALLOC_SIZE)) + idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, + ZS_SIZE_CLASS_DELTA); + + return min(zs_size_classes - 1, idx); +} + +#ifdef CONFIG_ZSMALLOC_STAT + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + class->stats.objs[type] += cnt; +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + class->stats.objs[type] -= cnt; +} + +static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + return class->stats.objs[type]; +} + +static int __init zs_stat_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zs_stat_root = debugfs_create_dir("zsmalloc", NULL); + if (!zs_stat_root) + return -ENOMEM; + + return 0; +} + +static void __exit zs_stat_exit(void) +{ + debugfs_remove_recursive(zs_stat_root); +} + +static int zs_stats_size_show(struct seq_file *s, void *v) +{ + int i; + struct zs_pool *pool = s->private; + struct size_class *class; + int objs_per_zspage; + unsigned long class_almost_full, class_almost_empty; + unsigned long obj_allocated, obj_used, pages_used; + unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; + unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + + seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", + "class", "size", "almost_full", "almost_empty", + "obj_allocated", "obj_used", "pages_used", + "pages_per_zspage"); + + for (i = 0; i < zs_size_classes; i++) { + class = pool->size_class[i]; + + if (class->index != i) + continue; + + spin_lock(&class->lock); + class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); + class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); + obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + obj_used = zs_stat_get(class, OBJ_USED); + spin_unlock(&class->lock); + + objs_per_zspage =
get_maxobj_per_zspage(class->size, + class->pages_per_zspage); + pages_used = obj_allocated / objs_per_zspage * + class->pages_per_zspage; + + seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", + i, class->size, class_almost_full, class_almost_empty, + obj_allocated, obj_used, pages_used, + class->pages_per_zspage); + + total_class_almost_full += class_almost_full; + total_class_almost_empty += class_almost_empty; + total_objs += obj_allocated; + total_used_objs += obj_used; + total_pages += pages_used; + } + + seq_puts(s, "\n"); + seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", + "Total", "", total_class_almost_full, + total_class_almost_empty, total_objs, + total_used_objs, total_pages); + + return 0; +} + +static int zs_stats_size_open(struct inode *inode, struct file *file) +{ + return single_open(file, zs_stats_size_show, inode->i_private); +} + +static const struct file_operations zs_stat_size_ops = { + .open = zs_stats_size_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int zs_pool_stat_create(char *name, struct zs_pool *pool) +{ + struct dentry *entry; + + if (!zs_stat_root) + return -ENODEV; + + entry = debugfs_create_dir(name, zs_stat_root); + if (!entry) { + pr_warn("debugfs dir <%s> creation failed\n", name); + return -ENOMEM; + } + pool->stat_dentry = entry; + + entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, + pool->stat_dentry, pool, &zs_stat_size_ops); + if (!entry) { + pr_warn("%s: debugfs file entry <%s> creation failed\n", + name, "classes"); + return -ENOMEM; + } + + return 0; +} + +static void zs_pool_stat_destroy(struct zs_pool *pool) +{ + debugfs_remove_recursive(pool->stat_dentry); +} + +#else /* CONFIG_ZSMALLOC_STAT */ + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ +} + +static inline unsigned 
long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + return 0; +} + +static int __init zs_stat_init(void) +{ + return 0; +} + +static void __exit zs_stat_exit(void) +{ +} + +static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) +{ + return 0; +} + +static inline void zs_pool_stat_destroy(struct zs_pool *pool) +{ +} + +#endif + + +/* + * For each size class, zspages are divided into different groups + * depending on how "full" they are. This was done so that we could + * easily find empty or nearly empty zspages when we try to shrink + * the pool (not yet implemented). This function returns fullness + * status of the given page. + */ +static enum fullness_group get_fullness_group(struct page *page) +{ + int inuse, max_objects; + enum fullness_group fg; + BUG_ON(!is_first_page(page)); + + inuse = page->inuse; + max_objects = page->objects; + + if (inuse == 0) + fg = ZS_EMPTY; + else if (inuse == max_objects) + fg = ZS_FULL; + else if (inuse <= 3 * max_objects / fullness_threshold_frac) + fg = ZS_ALMOST_EMPTY; + else + fg = ZS_ALMOST_FULL; + + return fg; +} + +/* + * Each size class maintains various freelists and zspages are assigned + * to one of these freelists based on the number of live objects they + * have. This functions inserts the given zspage into the freelist + * identified by . + */ +static void insert_zspage(struct page *page, struct size_class *class, + enum fullness_group fullness) +{ + struct page **head; + + BUG_ON(!is_first_page(page)); + + if (fullness >= _ZS_NR_FULLNESS_GROUPS) + return; + + head = &class->fullness_list[fullness]; + if (*head) + list_add_tail(&page->lru, &(*head)->lru); + + *head = page; + zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? + CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); +} + +/* + * This function removes the given zspage from the freelist identified + * by . 
+ */ +static void remove_zspage(struct page *page, struct size_class *class, + enum fullness_group fullness) +{ + struct page **head; + + BUG_ON(!is_first_page(page)); + + if (fullness >= _ZS_NR_FULLNESS_GROUPS) + return; + + head = &class->fullness_list[fullness]; + BUG_ON(!*head); + /* head is the only entry: the list becomes empty */ + if (list_empty(&(*head)->lru)) + *head = NULL; + /* removing the current head: its successor takes over */ + else if (*head == page) + *head = (struct page *)list_entry((*head)->lru.next, + struct page, lru); + + list_del_init(&page->lru); + zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? + CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); +} + +/* + * Each size class maintains zspages in different fullness groups depending + * on the number of live objects they contain. When allocating or freeing + * objects, the fullness status of the page can change, say, from ALMOST_FULL + * to ALMOST_EMPTY when freeing an object. This function checks if such + * a status change has occurred for the given page and accordingly moves the + * page from the freelist of the old fullness group to that of the new + * fullness group. It returns the (possibly unchanged) new fullness group. + */ +static enum fullness_group fix_fullness_group(struct size_class *class, + struct page *page) +{ + int class_idx; + enum fullness_group currfg, newfg; + + BUG_ON(!is_first_page(page)); + + get_zspage_mapping(page, &class_idx, &currfg); + newfg = get_fullness_group(page); + if (newfg == currfg) + goto out; + + remove_zspage(page, class, currfg); + insert_zspage(page, class, newfg); + set_zspage_mapping(page, class_idx, newfg); + +out: + return newfg; +} + +/* + * We have to decide on how many pages to link together + * to form a zspage for each size class. This is important + * to reduce wastage due to unusable space left at end of + * each zspage which is given as: + * wastage = Zp % class_size + * usage = Zp - wastage + * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... 
+ * + * For example, for size class of 3/8 * PAGE_SIZE, we should + * link together 3 PAGE_SIZE sized pages to form a zspage + * since then we can perfectly fit in 8 such objects. + */ +static int get_pages_per_zspage(int class_size) +{ + int i, max_usedpc = 0; + /* number of pages per zspage that gives the highest used percentage */ + int max_usedpc_order = 1; + + for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { + int zspage_size; + int waste, usedpc; + + zspage_size = i * PAGE_SIZE; + waste = zspage_size % class_size; + usedpc = (zspage_size - waste) * 100 / zspage_size; + + /* strict '>' keeps the smallest page count among equal percentages */ + if (usedpc > max_usedpc) { + max_usedpc = usedpc; + max_usedpc_order = i; + } + } + + return max_usedpc_order; +} + +/* + * A single 'zspage' is composed of many system pages which are + * linked together using fields in struct page. This function finds + * the first/head page, given any component page of a zspage. + */ +static struct page *get_first_page(struct page *page) +{ + if (is_first_page(page)) + return page; + else + return page->first_page; +} + +/* + * Return the page that follows @page inside its zspage, or NULL when + * @page is the last one. The first page keeps its successor in + * page_private(); the remaining pages are chained through ->lru. + */ +static struct page *get_next_page(struct page *page) +{ + struct page *next; + + if (is_last_page(page)) + next = NULL; + else if (is_first_page(page)) + next = (struct page *)page_private(page); + else + next = list_entry(page->lru.next, struct page, lru); + + return next; +} + +/* + * Encode <page, obj_idx> as a single handle value. + * We use the least bit of handle for tagging. + * A NULL @page (only valid together with obj_idx 0) encodes as NULL. + */ +static void *location_to_obj(struct page *page, unsigned long obj_idx) +{ + unsigned long obj; + + if (!page) { + BUG_ON(obj_idx); + return NULL; + } + + obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj |= ((obj_idx) & OBJ_INDEX_MASK); + obj <<= OBJ_TAG_BITS; + + return (void *)obj; +} + +/* + * Decode the <page, obj_idx> pair from the given object value: the tag + * bits are shifted out first, then the remaining bits are split into + * the page frame number and the object index, mirroring the encoding in + * location_to_obj(). 
+ */ +static void obj_to_location(unsigned long obj, struct page **page, + unsigned long *obj_idx) +{ + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *obj_idx = (obj & OBJ_INDEX_MASK); +} + +/* Read the object value currently stored in the word that @handle points to. */ +static unsigned long handle_to_obj(unsigned long handle) +{ + return *(unsigned long *)handle; +} + +/* + * Return the header word of the chunk at @obj, i.e. the owning handle with + * OBJ_ALLOCATED_TAG set while the chunk is in use. For huge classes + * obj_malloc() keeps that word in the first page's ->private via + * set_page_private() instead of inside the chunk, so it is returned as a + * value here; it must not be dereferenced as a pointer (it carries the tag + * bit and is 0 once the object has been freed). + */ +static unsigned long obj_to_head(struct size_class *class, struct page *page, + void *obj) +{ + if (class->huge) { + VM_BUG_ON(!is_first_page(page)); + return page_private(page); + } else + return *(unsigned long *)obj; +} + +/* + * Byte offset of object @obj_idx inside @page. Pages other than the first + * one start their first object at page->index. + */ +static unsigned long obj_idx_to_offset(struct page *page, + unsigned long obj_idx, int class_size) +{ + unsigned long off = 0; + + if (!is_first_page(page)) + off = page->index; + + return off + obj_idx * class_size; +} + +/* Try to take the HANDLE_PIN_BIT bit lock in the handle's word; non-zero on success. */ +static inline int trypin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); +} + +/* Busy-wait until the handle's pin bit has been acquired. */ +static void pin_tag(unsigned long handle) +{ + while (!trypin_tag(handle)); +} + +/* Release the pin bit taken by pin_tag()/trypin_tag(). */ +static void unpin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + clear_bit_unlock(HANDLE_PIN_BIT, ptr); +} + +/* Clear the zsmalloc specific state kept in struct page before the page is freed. */ +static void reset_page(struct page *page) +{ + clear_bit(PG_private, &page->flags); + clear_bit(PG_private_2, &page->flags); + set_page_private(page, 0); + page->mapping = NULL; + page->freelist = NULL; + page_mapcount_reset(page); +} + +/* Free every system page of a zspage that holds no live objects. */ +static void free_zspage(struct page *first_page) +{ + struct page *nextp, *tmp, *head_extra; + + BUG_ON(!is_first_page(first_page)); + BUG_ON(first_page->inuse); + + /* fetch the second page before reset_page() wipes ->private */ + head_extra = (struct page *)page_private(first_page); + + reset_page(first_page); + __free_page(first_page); + + /* zspage with only 1 system page */ + if (!head_extra) + return; + + list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { + list_del(&nextp->lru); + reset_page(nextp); + __free_page(nextp); + } + reset_page(head_extra); + __free_page(head_extra); +} + +/* Initialize a newly allocated zspage */ 
+static void init_zspage(struct page *first_page, struct size_class *class) +{ + unsigned long off = 0; + struct page *page = first_page; + + BUG_ON(!is_first_page(first_page)); + while (page) { + struct page *next_page; + struct link_free *link; + unsigned int i = 1; + void *vaddr; + + /* + * page->index stores offset of first object starting + * in the page. For the first page, this is always 0, + * so we use first_page->index (aka ->freelist) to store + * head of corresponding zspage's freelist. + */ + if (page != first_page) + page->index = off; + + vaddr = kmap_atomic(page); + link = (struct link_free *)vaddr + off / sizeof(*link); + + /* chain every object that starts in this page to the next one */ + while ((off += class->size) < PAGE_SIZE) { + link->next = location_to_obj(page, i++); + link += class->size / sizeof(*link); + } + + /* + * We now come to the last (full or partial) object on this + * page, which must point to the first object on the next + * page (if present) + */ + next_page = get_next_page(page); + link->next = location_to_obj(next_page, 0); + kunmap_atomic(vaddr); + page = next_page; + off %= PAGE_SIZE; + } +} + +/* + * Allocate a zspage for the given size class. + * Returns the first page of the new zspage, or NULL when one of the + * page allocations failed (pages allocated so far are freed again). + */ +static struct page *alloc_zspage(struct size_class *class, gfp_t flags) +{ + int i, error; + struct page *first_page = NULL, *uninitialized_var(prev_page); + + /* + * Allocate individual pages and link them together as: + * 1. first page->private = first sub-page + * 2. all sub-pages are linked together using page->lru + * 3. each sub-page is linked to the first page using page->first_page + * + * For each size class, First/Head pages are linked together using + * page->lru. Also, we set PG_private to identify the first page + * (i.e. no other sub-page has this flag set) and PG_private_2 to + * identify the last page. 
+ */ + error = -ENOMEM; + for (i = 0; i < class->pages_per_zspage; i++) { + struct page *page; + + page = alloc_page(flags); + if (!page) + goto cleanup; + + INIT_LIST_HEAD(&page->lru); + if (i == 0) { /* first page */ + SetPagePrivate(page); + set_page_private(page, 0); + first_page = page; + first_page->inuse = 0; + } + if (i == 1) + set_page_private(first_page, (unsigned long)page); + if (i >= 1) + page->first_page = first_page; + if (i >= 2) + list_add(&page->lru, &prev_page->lru); + if (i == class->pages_per_zspage - 1) /* last page */ + SetPagePrivate2(page); + prev_page = page; + } + + init_zspage(first_page, class); + + first_page->freelist = location_to_obj(first_page, 0); + /* Maximum number of objects we can store in this zspage */ + first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; + + error = 0; /* Success */ + +cleanup: + if (unlikely(error) && first_page) { + free_zspage(first_page); + first_page = NULL; + } + + return first_page; +} + +/* + * Return the head zspage of the first non-empty fullness list of @class, + * scanning the lists in index order, or NULL when every list is empty. + */ +static struct page *find_get_zspage(struct size_class *class) +{ + int i; + struct page *page; + + for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { + page = class->fullness_list[i]; + if (page) + break; + } + + return page; +} + +#ifdef CONFIG_PGTABLE_MAPPING +/* Page-table based mapping: each cpu owns a two page vm area. */ +static inline int __zs_cpu_up(struct mapping_area *area) +{ + /* + * Make sure we don't leak memory if a cpu UP notification + * and zs_init() race and both call zs_cpu_up() on the same cpu + */ + if (area->vm) + return 0; + area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL); + if (!area->vm) + return -ENOMEM; + return 0; +} + +static inline void __zs_cpu_down(struct mapping_area *area) +{ + if (area->vm) + free_vm_area(area->vm); + area->vm = NULL; +} + +/* Map both pages into the per-cpu vm area and return the object's address inside it. */ +static inline void *__zs_map_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); + area->vm_addr = area->vm->addr; + return area->vm_addr + off; +} + +/* Tear down the two page mapping set up by __zs_map_object(). */ +static inline void __zs_unmap_object(struct mapping_area *area, 
+ struct page *pages[2], int off, int size) +{ + unsigned long addr = (unsigned long)area->vm_addr; + + unmap_kernel_range(addr, PAGE_SIZE * 2); +} + +#else /* CONFIG_PGTABLE_MAPPING */ + +/* Copy based mapping: each cpu owns a bounce buffer of ZS_MAX_ALLOC_SIZE bytes. */ +static inline int __zs_cpu_up(struct mapping_area *area) +{ + /* + * Make sure we don't leak memory if a cpu UP notification + * and zs_init() race and both call zs_cpu_up() on the same cpu + */ + if (area->vm_buf) + return 0; + area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); + if (!area->vm_buf) + return -ENOMEM; + return 0; +} + +static inline void __zs_cpu_down(struct mapping_area *area) +{ + kfree(area->vm_buf); + area->vm_buf = NULL; +} + +/* + * Gather an object that spans pages[0] and pages[1] into the per-cpu + * buffer and return the buffer. The copy is skipped for write-only + * (ZS_MM_WO) mappings. + */ +static void *__zs_map_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + int sizes[2]; + void *addr; + char *buf = area->vm_buf; + + /* disable page faults to match kmap_atomic() return conditions */ + pagefault_disable(); + + /* no read fastpath */ + if (area->vm_mm == ZS_MM_WO) + goto out; + + /* sizes[0]: bytes in the first page, sizes[1]: remainder in the second */ + sizes[0] = PAGE_SIZE - off; + sizes[1] = size - sizes[0]; + + /* copy object to per-cpu buffer */ + addr = kmap_atomic(pages[0]); + memcpy(buf, addr + off, sizes[0]); + kunmap_atomic(addr); + addr = kmap_atomic(pages[1]); + memcpy(buf + sizes[0], addr, sizes[1]); + kunmap_atomic(addr); +out: + return area->vm_buf; +} + +/* + * Write the per-cpu buffer back to the two pages of the object. The copy + * is skipped for read-only (ZS_MM_RO) mappings. + */ +static void __zs_unmap_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + int sizes[2]; + void *addr; + char *buf; + + /* no write fastpath */ + if (area->vm_mm == ZS_MM_RO) + goto out; + + buf = area->vm_buf; + /* skip the ZS_HANDLE_SIZE bytes at the start of the chunk */ + if (!area->huge) { + buf = buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; + } + + sizes[0] = PAGE_SIZE - off; + sizes[1] = size - sizes[0]; + + /* copy per-cpu buffer to object */ + addr = kmap_atomic(pages[0]); + memcpy(addr + off, buf, sizes[0]); + kunmap_atomic(addr); + addr = kmap_atomic(pages[1]); + memcpy(addr, buf + sizes[0], sizes[1]); + kunmap_atomic(addr); + +out: + /* enable page faults to match kunmap_atomic() return 
conditions */ + pagefault_enable(); +} + +#endif /* CONFIG_PGTABLE_MAPPING */ + +/* + * CPU hotplug callback: set up the per-cpu mapping area on CPU_UP_PREPARE + * and release it on CPU_DEAD / CPU_UP_CANCELED. + */ +static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action, + void *pcpu) +{ + int ret, cpu = (long)pcpu; + struct mapping_area *area; + + switch (action) { + case CPU_UP_PREPARE: + area = &per_cpu(zs_map_area, cpu); + ret = __zs_cpu_up(area); + if (ret) + return notifier_from_errno(ret); + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + area = &per_cpu(zs_map_area, cpu); + __zs_cpu_down(area); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block zs_cpu_nb = { + .notifier_call = zs_cpu_notifier +}; + +/* + * Register the hotplug notifier and set up the mapping area of every cpu + * that is already online. Returns 0 or a negative errno. + */ +static int zs_register_cpu_notifier(void) +{ + int cpu, uninitialized_var(ret); + + cpu_notifier_register_begin(); + + __register_cpu_notifier(&zs_cpu_nb); + for_each_online_cpu(cpu) { + ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + if (notifier_to_errno(ret)) + break; + } + + cpu_notifier_register_done(); + return notifier_to_errno(ret); +} + +/* Release the mapping area of every online cpu and unregister the notifier. */ +static void zs_unregister_cpu_notifier(void) +{ + int cpu; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) + zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); + __unregister_cpu_notifier(&zs_cpu_nb); + + cpu_notifier_register_done(); +} + +/* + * Compute the number of size classes needed to cover ZS_MIN_ALLOC_SIZE up + * to ZS_MAX_ALLOC_SIZE in steps of ZS_SIZE_CLASS_DELTA (rounded up) and + * store it in zs_size_classes. + */ +static void init_zs_size_classes(void) +{ + int nr; + + nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1; + if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA) + nr += 1; + + zs_size_classes = nr; +} + +/* + * A candidate class can share @prev when both use the same number of pages + * per zspage and hold the same number of objects per zspage. + */ +static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) +{ + if (prev->pages_per_zspage != pages_per_zspage) + return false; + + if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) + != get_maxobj_per_zspage(size, pages_per_zspage)) + return false; + + return true; +} + +/* True when every object slot of the zspage headed by @page is in use. */ +static bool zspage_full(struct page *page) +{ + BUG_ON(!is_first_page(page)); + + return page->inuse == page->objects; +} + +/* Return the pool's current pages_allocated counter. */ +unsigned long zs_get_total_pages(struct zs_pool *pool) +{ + return 
atomic_long_read(&pool->pages_allocated); +} +EXPORT_SYMBOL_GPL(zs_get_total_pages); + +/** + * zs_map_object - get address of allocated object from handle. + * @pool: pool from which the object was allocated + * @handle: handle returned from zs_malloc + * @mm: mapping mode (e.g. ZS_MM_RO, ZS_MM_WO), recorded in the per-cpu mapping area + * + * Before using an object allocated from zs_malloc, it must be mapped using + * this function. When done with the object, it must be unmapped using + * zs_unmap_object. + * + * Only one object can be mapped per cpu at a time. There is no protection + * against nested mappings. + * + * This function returns with preemption and page faults disabled. + */ +void *zs_map_object(struct zs_pool *pool, unsigned long handle, + enum zs_mapmode mm) +{ + struct page *page; + unsigned long obj, obj_idx, off; + + unsigned int class_idx; + enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; + struct page *pages[2]; + void *ret; + + BUG_ON(!handle); + + /* + * Because we use per-cpu mapping areas shared among the + * pools/users, we can't allow mapping in interrupt context + * because it can corrupt another user's mappings. 
+ */ + BUG_ON(in_interrupt()); + + /* From now on, migration cannot move the object */ + pin_tag(handle); + + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); + get_zspage_mapping(get_first_page(page), &class_idx, &fg); + class = pool->size_class[class_idx]; + off = obj_idx_to_offset(page, obj_idx, class->size); + + area = &get_cpu_var(zs_map_area); + area->vm_mm = mm; + if (off + class->size <= PAGE_SIZE) { + /* this object is contained entirely within a page */ + area->vm_addr = kmap_atomic(page); + ret = area->vm_addr + off; + goto out; + } + + /* this object spans two pages */ + pages[0] = page; + pages[1] = get_next_page(page); + BUG_ON(!pages[1]); + + ret = __zs_map_object(area, pages, off, class->size); +out: + /* non-huge chunks start with the handle; skip over it */ + if (!class->huge) + ret += ZS_HANDLE_SIZE; + + return ret; +} +EXPORT_SYMBOL_GPL(zs_map_object); + +/* + * zs_unmap_object - undo zs_map_object() for @handle: drop the mapping, + * release the per-cpu mapping area and unpin the handle. + */ +void zs_unmap_object(struct zs_pool *pool, unsigned long handle) +{ + struct page *page; + unsigned long obj, obj_idx, off; + + unsigned int class_idx; + enum fullness_group fg; + struct size_class *class; + struct mapping_area *area; + + BUG_ON(!handle); + + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); + get_zspage_mapping(get_first_page(page), &class_idx, &fg); + class = pool->size_class[class_idx]; + off = obj_idx_to_offset(page, obj_idx, class->size); + + area = this_cpu_ptr(&zs_map_area); + if (off + class->size <= PAGE_SIZE) + kunmap_atomic(area->vm_addr); + else { + struct page *pages[2]; + + pages[0] = page; + pages[1] = get_next_page(page); + BUG_ON(!pages[1]); + + __zs_unmap_object(area, pages, off, class->size); + } + put_cpu_var(zs_map_area); + unpin_tag(handle); +} +EXPORT_SYMBOL_GPL(zs_unmap_object); + +/* + * Take the first free chunk off the freelist of the zspage headed by + * @first_page, record @handle (tagged with OBJ_ALLOCATED_TAG) for it and + * return the chunk's encoded object value. + */ +static unsigned long obj_malloc(struct page *first_page, + struct size_class *class, unsigned long handle) +{ + unsigned long obj; + struct link_free *link; + + struct page *m_page; + unsigned long m_objidx, m_offset; + void *vaddr; + + handle |= OBJ_ALLOCATED_TAG; + obj = (unsigned 
long)first_page->freelist; + obj_to_location(obj, &m_page, &m_objidx); + m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); + + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); + first_page->freelist = link->next; + if (!class->huge) + /* record handle in the header of allocated chunk */ + link->handle = handle; + else + /* record handle in first_page->private */ + set_page_private(first_page, handle); + kunmap_atomic(vaddr); + first_page->inuse++; + zs_stat_inc(class, OBJ_USED, 1); + + return obj; +} + + +/** + * zs_malloc - Allocate block of given size from pool. + * @pool: pool to allocate from + * @size: size of block to allocate + * + * On success, handle to the allocated object is returned, + * otherwise 0. + * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. + */ +unsigned long zs_malloc(struct zs_pool *pool, size_t size) +{ + unsigned long handle, obj; + struct size_class *class; + struct page *first_page; + + if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) + return 0; + + handle = alloc_handle(pool); + if (!handle) + return 0; + + /* extra space in chunk to keep the handle */ + size += ZS_HANDLE_SIZE; + class = pool->size_class[get_size_class_index(size)]; + + spin_lock(&class->lock); + first_page = find_get_zspage(class); + + if (!first_page) { + /* drop the class lock while allocating a new zspage */ + spin_unlock(&class->lock); + first_page = alloc_zspage(class, pool->flags); + if (unlikely(!first_page)) { + free_handle(pool, handle); + return 0; + } + + set_zspage_mapping(first_page, class->index, ZS_EMPTY); + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); + + spin_lock(&class->lock); + zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + } + + obj = obj_malloc(first_page, class, handle); + /* Now move the zspage to another fullness group, if required */ + fix_fullness_group(class, first_page); + record_obj(handle, obj); + spin_unlock(&class->lock); + + return handle; +} 
+EXPORT_SYMBOL_GPL(zs_malloc); + +/* + * Return the chunk encoded by @obj to the freelist of its zspage and + * update the in-use accounting. The callers in this file hold class->lock. + */ +static void obj_free(struct zs_pool *pool, struct size_class *class, + unsigned long obj) +{ + struct link_free *link; + struct page *first_page, *f_page; + unsigned long f_objidx, f_offset; + void *vaddr; + int class_idx; + enum fullness_group fullness; + + BUG_ON(!obj); + + obj &= ~OBJ_ALLOCATED_TAG; + obj_to_location(obj, &f_page, &f_objidx); + first_page = get_first_page(f_page); + + get_zspage_mapping(first_page, &class_idx, &fullness); + f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); + + vaddr = kmap_atomic(f_page); + + /* Insert this object in containing zspage's freelist */ + link = (struct link_free *)(vaddr + f_offset); + link->next = first_page->freelist; + /* huge classes keep the handle in ->private; clear it */ + if (class->huge) + set_page_private(first_page, 0); + kunmap_atomic(vaddr); + first_page->freelist = (void *)obj; + first_page->inuse--; + zs_stat_dec(class, OBJ_USED, 1); +} + +/* + * zs_free - release the object referenced by @handle and the handle itself. + * A zero @handle is ignored. A zspage that becomes empty is freed. + */ +void zs_free(struct zs_pool *pool, unsigned long handle) +{ + struct page *first_page, *f_page; + unsigned long obj, f_objidx; + int class_idx; + struct size_class *class; + enum fullness_group fullness; + + if (unlikely(!handle)) + return; + + /* hold the pin bit across the lookup and the free */ + pin_tag(handle); + obj = handle_to_obj(handle); + obj_to_location(obj, &f_page, &f_objidx); + first_page = get_first_page(f_page); + + get_zspage_mapping(first_page, &class_idx, &fullness); + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + obj_free(pool, class, obj); + fullness = fix_fullness_group(class, first_page); + if (fullness == ZS_EMPTY) { + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); + free_zspage(first_page); + } + spin_unlock(&class->lock); + unpin_tag(handle); + + free_handle(pool, handle); +} +EXPORT_SYMBOL_GPL(zs_free); + +/* + * Copy class->size bytes from the chunk encoded by @src to the chunk + * encoded by @dst. Either chunk may cross a page boundary, so the copy + * proceeds piecewise and remaps whichever side runs off its page. + */ +static void zs_object_copy(unsigned long src, unsigned long dst, + struct size_class *class) +{ + struct page *s_page, *d_page; + unsigned long s_objidx, d_objidx; 
+ unsigned long s_off, d_off; + void *s_addr, *d_addr; + int s_size, d_size, size; + int written = 0; + + s_size = d_size = class->size; + + obj_to_location(src, &s_page, &s_objidx); + obj_to_location(dst, &d_page, &d_objidx); + + s_off = obj_idx_to_offset(s_page, s_objidx, class->size); + d_off = obj_idx_to_offset(d_page, d_objidx, class->size); + + /* limit each side to the bytes left in its current page */ + if (s_off + class->size > PAGE_SIZE) + s_size = PAGE_SIZE - s_off; + + if (d_off + class->size > PAGE_SIZE) + d_size = PAGE_SIZE - d_off; + + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + + while (1) { + size = min(s_size, d_size); + memcpy(d_addr + d_off, s_addr + s_off, size); + written += size; + + if (written == class->size) + break; + + s_off += size; + s_size -= size; + d_off += size; + d_size -= size; + + if (s_off >= PAGE_SIZE) { + /* unmap in reverse order of mapping, then map both again */ + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); + s_page = get_next_page(s_page); + BUG_ON(!s_page); + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + s_size = class->size - written; + s_off = 0; + } + + if (d_off >= PAGE_SIZE) { + kunmap_atomic(d_addr); + d_page = get_next_page(d_page); + BUG_ON(!d_page); + d_addr = kmap_atomic(d_page); + d_size = class->size - written; + d_off = 0; + } + } + + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); +} + +/* + * Find an allocated object in the given page, scanning from object + * index @index, and return its handle with the pin bit held, or 0 if + * no allocated object could be pinned. 
+ */ +static unsigned long find_alloced_obj(struct page *page, int index, + struct size_class *class) +{ + unsigned long head; + int offset = 0; + unsigned long handle = 0; + void *addr = kmap_atomic(page); + + if (!is_first_page(page)) + offset = page->index; + offset += class->size * index; + + while (offset < PAGE_SIZE) { + head = obj_to_head(class, page, addr + offset); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + /* a handle that is pinned elsewhere is skipped */ + if (trypin_tag(handle)) + break; + handle = 0; + } + + offset += class->size; + index++; + } + + kunmap_atomic(addr); + return handle; +} + +struct zs_compact_control { + /* Source page for migration which could be a subpage of zspage. */ + struct page *s_page; + /* Destination page for migration which should be a first page + * of zspage. */ + struct page *d_page; + /* Object index within @s_page at which the search for live + * objects starts. */ + int index; + /* how many objects were migrated */ + int nr_migrated; +}; + +/* + * Move live objects from the zspage position described by @cc (s_page, + * index) into the zspage headed by cc->d_page. Returns 0 when the source + * pages were exhausted and -ENOMEM when the destination zspage filled up + * first. The position reached and the number of migrated objects are + * written back to @cc. + */ +static int migrate_zspage(struct zs_pool *pool, struct size_class *class, + struct zs_compact_control *cc) +{ + unsigned long used_obj, free_obj; + unsigned long handle; + struct page *s_page = cc->s_page; + struct page *d_page = cc->d_page; + unsigned long index = cc->index; + int nr_migrated = 0; + int ret = 0; + + while (1) { + handle = find_alloced_obj(s_page, index, class); + if (!handle) { + s_page = get_next_page(s_page); + if (!s_page) + break; + index = 0; + continue; + } + + /* Stop if there is no more space */ + if (zspage_full(d_page)) { + unpin_tag(handle); + ret = -ENOMEM; + break; + } + + used_obj = handle_to_obj(handle); + free_obj = obj_malloc(d_page, class, handle); + zs_object_copy(used_obj, free_obj, class); + index++; + /* + * record_obj() replaces the word stored behind handle, which + * would also wipe the HANDLE_PIN_BIT taken in + * find_alloced_obj(). Keep the bit set so the handle stays + * pinned until the unpin_tag() below; otherwise a concurrent + * pin_tag() (e.g. from zs_free()) could succeed in the + * middle of the migration. + */ + free_obj |= 1UL << HANDLE_PIN_BIT; + record_obj(handle, free_obj); + unpin_tag(handle); + obj_free(pool, class, used_obj); + nr_migrated++; + } + + /* Remember last position in this iteration */ + cc->s_page = s_page; + cc->index = index; + cc->nr_migrated = nr_migrated; + + return ret; +} + 
+/* + * Detach and return the head zspage of the first non-empty fullness list + * of @class to serve as migration destination, or NULL when every list is + * empty. + */ +static struct page *alloc_target_page(struct size_class *class) +{ + int i; + struct page *page; + + for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { + page = class->fullness_list[i]; + if (page) { + remove_zspage(page, class, i); + break; + } + } + + return page; +} + +/* + * Put an isolated zspage back on the fullness list matching its current + * usage. A zspage that ended up empty is freed and the pool accounting is + * adjusted accordingly. + */ +static void putback_zspage(struct zs_pool *pool, struct size_class *class, + struct page *first_page) +{ + enum fullness_group fullness; + + BUG_ON(!is_first_page(first_page)); + + fullness = get_fullness_group(first_page); + insert_zspage(first_page, class, fullness); + set_zspage_mapping(first_page, class->index, fullness); + + if (fullness == ZS_EMPTY) { + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); + + free_zspage(first_page); + } +} + +/* Detach and return the head of the ZS_ALMOST_EMPTY list of @class, or NULL if it is empty. */ +static struct page *isolate_source_page(struct size_class *class) +{ + struct page *page; + + page = class->fullness_list[ZS_ALMOST_EMPTY]; + if (page) + remove_zspage(page, class, ZS_ALMOST_EMPTY); + + return page; +} + +/* + * Compact one size class: repeatedly isolate an almost-empty zspage and + * migrate its live objects into other zspages of the class. Returns the + * number of objects migrated. class->lock is dropped around cond_resched() + * between source zspages. + */ +static unsigned long __zs_compact(struct zs_pool *pool, + struct size_class *class) +{ + int nr_to_migrate; + struct zs_compact_control cc; + struct page *src_page; + struct page *dst_page = NULL; + unsigned long nr_total_migrated = 0; + + spin_lock(&class->lock); + while ((src_page = isolate_source_page(class))) { + + BUG_ON(!is_first_page(src_page)); + + /* The goal is to migrate all live objects in source page */ + nr_to_migrate = src_page->inuse; + cc.index = 0; + cc.s_page = src_page; + + while ((dst_page = alloc_target_page(class))) { + cc.d_page = dst_page; + /* + * If there is no more space in dst_page, try to + * allocate another zspage. 
+ */ + if (!migrate_zspage(pool, class, &cc)) + break; + + putback_zspage(pool, class, dst_page); + nr_total_migrated += cc.nr_migrated; + nr_to_migrate -= cc.nr_migrated; + } + + /* Stop if we couldn't find slot */ + if (dst_page == NULL) + break; + + putback_zspage(pool, class, dst_page); + putback_zspage(pool, class, src_page); + spin_unlock(&class->lock); + nr_total_migrated += cc.nr_migrated; + cond_resched(); + spin_lock(&class->lock); + } + + /* only reached with a source page when no destination was left */ + if (src_page) + putback_zspage(pool, class, src_page); + + spin_unlock(&class->lock); + + return nr_total_migrated; +} + +/* + * zs_compact - compact every distinct size class of @pool, largest class + * first, and return the total number of objects migrated. + */ +unsigned long zs_compact(struct zs_pool *pool) +{ + int i; + unsigned long nr_migrated = 0; + struct size_class *class; + + for (i = zs_size_classes - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (!class) + continue; + /* merged slot: the class is handled under its own index */ + if (class->index != i) + continue; + nr_migrated += __zs_compact(pool, class); + } + + return nr_migrated; +} +EXPORT_SYMBOL_GPL(zs_compact); + +/** + * zs_create_pool - Creates an allocation pool to work from. + * @name: pool name; a copy is kept in the pool and the name is also passed to zs_pool_stat_create() + * @flags: allocation flags stored in the pool and later handed to alloc_zspage() by zs_malloc(); the pool metadata itself is allocated with GFP_KERNEL + * + * This function must be called before anything when using + * the zsmalloc allocator. + * + * On success, a pointer to the newly created pool is returned, + * otherwise NULL. + */ +struct zs_pool *zs_create_pool(char *name, gfp_t flags) +{ + int i; + struct zs_pool *pool; + struct size_class *prev_class = NULL; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), + GFP_KERNEL); + if (!pool->size_class) { + kfree(pool); + return NULL; + } + + pool->name = kstrdup(name, GFP_KERNEL); + if (!pool->name) + goto err; + + if (create_handle_cache(pool)) + goto err; + + /* + * Iterate in reverse order because the size of the size_class that + * we want to use for merging should be larger or equal to current size. 
+ */ + for (i = zs_size_classes - 1; i >= 0; i--) { + int size; + int pages_per_zspage; + struct size_class *class; + + size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; + if (size > ZS_MAX_ALLOC_SIZE) + size = ZS_MAX_ALLOC_SIZE; + pages_per_zspage = get_pages_per_zspage(size); + + /* + * size_class is used for normal zsmalloc operation such + * as alloc/free for that size. Although it is natural that we + * have one size_class for each size, there is a chance that we + * can get more memory utilization if we use one size_class for + * many different sizes whose size_class have same + * characteristics. So, we make size_class point to + * previous size_class if possible. + */ + if (prev_class) { + if (can_merge(prev_class, size, pages_per_zspage)) { + pool->size_class[i] = prev_class; + continue; + } + } + + class = kzalloc(sizeof(struct size_class), GFP_KERNEL); + if (!class) + goto err; + + class->size = size; + class->index = i; + class->pages_per_zspage = pages_per_zspage; + /* a class whose zspage is one page holding one object is "huge" */ + if (pages_per_zspage == 1 && + get_maxobj_per_zspage(size, pages_per_zspage) == 1) + class->huge = true; + spin_lock_init(&class->lock); + pool->size_class[i] = class; + + prev_class = class; + } + + pool->flags = flags; + + if (zs_pool_stat_create(name, pool)) + goto err; + + return pool; + +err: + zs_destroy_pool(pool); + return NULL; +} +EXPORT_SYMBOL_GPL(zs_create_pool); + +/* + * zs_destroy_pool - tear down @pool: remove its statistics entries, free + * every distinct size class (logging classes that still have zspages on + * a fullness list), then the handle cache, the class table, the name and + * the pool itself. Also used by zs_create_pool() to unwind a partially + * built pool, hence the NULL class check. + */ +void zs_destroy_pool(struct zs_pool *pool) +{ + int i; + + zs_pool_stat_destroy(pool); + + for (i = 0; i < zs_size_classes; i++) { + int fg; + struct size_class *class = pool->size_class[i]; + + if (!class) + continue; + + /* merged slot: the class is freed under its own index */ + if (class->index != i) + continue; + + for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { + if (class->fullness_list[fg]) { + pr_info("Freeing non-empty class with size %db, fullness group %d\n", + class->size, fg); + } + } + kfree(class); + } + + destroy_handle_cache(pool); + kfree(pool->size_class); + kfree(pool->name); + kfree(pool); +} +EXPORT_SYMBOL_GPL(zs_destroy_pool); + +/* + * Module init: register the cpu notifier, compute the number of size + * classes, register the zpool driver (CONFIG_ZPOOL only) and initialize + * the statistics support; unwinds in reverse order on failure. + */ +static int __init 
zs_init(void) +{ + int ret = zs_register_cpu_notifier(); + + if (ret) + goto notifier_fail; + + init_zs_size_classes(); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + + ret = zs_stat_init(); + if (ret) { + pr_err("zs stat initialization failed\n"); + goto stat_fail; + } + return 0; + +stat_fail: +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif +notifier_fail: + zs_unregister_cpu_notifier(); + + return ret; +} + +/* Module exit: undo what zs_init() set up. */ +static void __exit zs_exit(void) +{ +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + zs_unregister_cpu_notifier(); + + zs_stat_exit(); +} + +module_init(zs_init); +module_exit(zs_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Nitin Gupta "); diff --git a/kernel/mm/zswap.c b/kernel/mm/zswap.c new file mode 100644 index 000000000..4249e82ff --- /dev/null +++ b/kernel/mm/zswap.c @@ -0,0 +1,955 @@ +/* + * zswap.c - zswap driver file + * + * zswap is a backend for frontswap that takes pages that are in the process + * of being swapped out and attempts to compress and store them in a + * RAM-based memory pool. This can result in a significant I/O reduction on + * the swap device and, in the case where decompressing from RAM is faster + * than reading from the swap device, can also improve workload performance. + * + * Copyright (C) 2012 Seth Jennings + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/********************************* +* statistics +**********************************/ +/* Total bytes used by the compressed storage */ +static u64 zswap_pool_total_size; +/* The number of compressed pages currently stored in zswap */ +static atomic_t zswap_stored_pages = ATOMIC_INIT(0); + +/* + * The statistics below are not protected from concurrent access for + * performance reasons so they may not be a 100% accurate. However, + * they do provide useful information on roughly how many times a + * certain event is occurring. +*/ + +/* Pool limit was hit (see zswap_max_pool_percent) */ +static u64 zswap_pool_limit_hit; +/* Pages written back when pool limit was reached */ +static u64 zswap_written_back_pages; +/* Store failed due to a reclaim failure after pool limit was reached */ +static u64 zswap_reject_reclaim_fail; +/* Compressed page was too big for the allocator to (optimally) store */ +static u64 zswap_reject_compress_poor; +/* Store failed because underlying allocator could not get memory */ +static u64 zswap_reject_alloc_fail; +/* Store failed because the entry metadata could not be allocated (rare) */ +static u64 zswap_reject_kmemcache_fail; +/* Duplicate store was encountered (rare) */ +static u64 zswap_duplicate_entry; + +/********************************* +* tunables +**********************************/ +/* Enable/disable zswap (disabled by default, fixed at boot for now) */ +static bool zswap_enabled __read_mostly; +module_param_named(enabled, zswap_enabled, bool, 0444); + +/* Compressor to be used by zswap (fixed at boot for now) */ +#define ZSWAP_COMPRESSOR_DEFAULT "lzo" +static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; +module_param_named(compressor, zswap_compressor, charp, 0444); + +/* The maximum 
percentage of memory that the compressed pool can occupy */ +static unsigned int zswap_max_pool_percent = 20; +module_param_named(max_pool_percent, + zswap_max_pool_percent, uint, 0644); + +/* Compressed storage to use */ +#define ZSWAP_ZPOOL_DEFAULT "zbud" +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; +module_param_named(zpool, zswap_zpool_type, charp, 0444); + +/* zpool is shared by all of zswap backend */ +static struct zpool *zswap_pool; + +/********************************* +* compression functions +**********************************/ +/* per-cpu compression transforms */ +static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms; + +enum comp_op { + ZSWAP_COMPOP_COMPRESS, + ZSWAP_COMPOP_DECOMPRESS +}; + +static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + struct crypto_comp *tfm; + int ret; + + tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu()); + switch (op) { + case ZSWAP_COMPOP_COMPRESS: + ret = crypto_comp_compress(tfm, src, slen, dst, dlen); + break; + case ZSWAP_COMPOP_DECOMPRESS: + ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); + break; + default: + ret = -EINVAL; + } + + put_cpu(); + return ret; +} + +static int __init zswap_comp_init(void) +{ + if (!crypto_has_comp(zswap_compressor, 0, 0)) { + pr_info("%s compressor not available\n", zswap_compressor); + /* fall back to default compressor */ + zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; + if (!crypto_has_comp(zswap_compressor, 0, 0)) + /* can't even load the default compressor */ + return -ENODEV; + } + pr_info("using %s compressor\n", zswap_compressor); + + /* alloc percpu transforms */ + zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); + if (!zswap_comp_pcpu_tfms) + return -ENOMEM; + return 0; +} + +static void __init zswap_comp_exit(void) +{ + /* free percpu transforms */ + free_percpu(zswap_comp_pcpu_tfms); +} + +/********************************* +* data structures +**********************************/ +/* 
+ * struct zswap_entry + * + * This structure contains the metadata for tracking a single compressed + * page within zswap. + * + * rbnode - links the entry into red-black tree for the appropriate swap type + * refcount - the number of outstanding reference to the entry. This is needed + * to protect against premature freeing of the entry by code + * concurrent calls to load, invalidate, and writeback. The lock + * for the zswap_tree structure that contains the entry must + * be held while changing the refcount. Since the lock must + * be held, there is no reason to also make refcount atomic. + * offset - the swap offset for the entry. Index into the red-black tree. + * handle - zpool allocation handle that stores the compressed page data + * length - the length in bytes of the compressed page data. Needed during + * decompression + */ +struct zswap_entry { + struct rb_node rbnode; + pgoff_t offset; + int refcount; + unsigned int length; + unsigned long handle; +}; + +struct zswap_header { + swp_entry_t swpentry; +}; + +/* + * The tree lock in the zswap_tree struct protects a few things: + * - the rbtree + * - the refcount field of each entry in the tree + */ +struct zswap_tree { + struct rb_root rbroot; + spinlock_t lock; +}; + +static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; + +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static int __init zswap_entry_cache_create(void) +{ + zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); + return zswap_entry_cache == NULL; +} + +static void __init zswap_entry_cache_destroy(void) +{ + kmem_cache_destroy(zswap_entry_cache); +} + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc(zswap_entry_cache, gfp); + if (!entry) + return NULL; + entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static void zswap_entry_cache_free(struct 
zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + +/********************************* +* rbtree functions +**********************************/ +static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) +{ + struct rb_node *node = root->rb_node; + struct zswap_entry *entry; + + while (node) { + entry = rb_entry(node, struct zswap_entry, rbnode); + if (entry->offset > offset) + node = node->rb_left; + else if (entry->offset < offset) + node = node->rb_right; + else + return entry; + } + return NULL; +} + +/* + * In the case that a entry with the same offset is found, a pointer to + * the existing entry is stored in dupentry and the function returns -EEXIST + */ +static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, + struct zswap_entry **dupentry) +{ + struct rb_node **link = &root->rb_node, *parent = NULL; + struct zswap_entry *myentry; + + while (*link) { + parent = *link; + myentry = rb_entry(parent, struct zswap_entry, rbnode); + if (myentry->offset > entry->offset) + link = &(*link)->rb_left; + else if (myentry->offset < entry->offset) + link = &(*link)->rb_right; + else { + *dupentry = myentry; + return -EEXIST; + } + } + rb_link_node(&entry->rbnode, parent, link); + rb_insert_color(&entry->rbnode, root); + return 0; +} + +static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +{ + if (!RB_EMPTY_NODE(&entry->rbnode)) { + rb_erase(&entry->rbnode, root); + RB_CLEAR_NODE(&entry->rbnode); + } +} + +/* + * Carries out the common pattern of freeing and entry's zpool allocation, + * freeing the entry itself, and decrementing the number of stored pages. 
+ */ +static void zswap_free_entry(struct zswap_entry *entry) +{ + zpool_free(zswap_pool, entry->handle); + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); +} + +/* caller must hold the tree lock */ +static void zswap_entry_get(struct zswap_entry *entry) +{ + entry->refcount++; +} + +/* caller must hold the tree lock +* remove from the tree and free it, if nobody reference the entry +*/ +static void zswap_entry_put(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + int refcount = --entry->refcount; + + BUG_ON(refcount < 0); + if (refcount == 0) { + zswap_rb_erase(&tree->rbroot, entry); + zswap_free_entry(entry); + } +} + +/* caller must hold the tree lock */ +static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, + pgoff_t offset) +{ + struct zswap_entry *entry = NULL; + + entry = zswap_rb_search(root, offset); + if (entry) + zswap_entry_get(entry); + + return entry; +} + +/********************************* +* per-cpu code +**********************************/ +static DEFINE_PER_CPU(u8 *, zswap_dstmem); + +static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) +{ + struct crypto_comp *tfm; + u8 *dst; + + switch (action) { + case CPU_UP_PREPARE: + tfm = crypto_alloc_comp(zswap_compressor, 0, 0); + if (IS_ERR(tfm)) { + pr_err("can't allocate compressor transform\n"); + return NOTIFY_BAD; + } + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; + dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!dst) { + pr_err("can't allocate compressor buffer\n"); + crypto_free_comp(tfm); + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; + return NOTIFY_BAD; + } + per_cpu(zswap_dstmem, cpu) = dst; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu); + if (tfm) { + crypto_free_comp(tfm); + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; + } + dst = per_cpu(zswap_dstmem, cpu); + kfree(dst); + 
per_cpu(zswap_dstmem, cpu) = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} + +static int zswap_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + return __zswap_cpu_notifier(action, cpu); +} + +static struct notifier_block zswap_cpu_notifier_block = { + .notifier_call = zswap_cpu_notifier +}; + +static int __init zswap_cpu_init(void) +{ + unsigned long cpu; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) + goto cleanup; + __register_cpu_notifier(&zswap_cpu_notifier_block); + cpu_notifier_register_done(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); + cpu_notifier_register_done(); + return -ENOMEM; +} + +/********************************* +* helpers +**********************************/ +static bool zswap_is_full(void) +{ + return totalram_pages * zswap_max_pool_percent / 100 < + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); +} + +/********************************* +* writeback code +**********************************/ +/* return enum for zswap_get_swap_cache_page */ +enum zswap_get_swap_ret { + ZSWAP_SWAPCACHE_NEW, + ZSWAP_SWAPCACHE_EXIST, + ZSWAP_SWAPCACHE_FAIL, +}; + +/* + * zswap_get_swap_cache_page + * + * This is an adaption of read_swap_cache_async() + * + * This function tries to find a page with the given swap entry + * in the swapper_space address space (the swap cache). If the page + * is found, it is returned in retpage. Otherwise, a page is allocated, + * added to the swap cache, and returned in retpage. 
+ * + * If success, the swap cache page is returned in retpage + * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache + * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, + * the new page is added to swapcache and locked + * Returns ZSWAP_SWAPCACHE_FAIL on error + */ +static int zswap_get_swap_cache_page(swp_entry_t entry, + struct page **retpage) +{ + struct page *found_page, *new_page = NULL; + struct address_space *swapper_space = swap_address_space(entry); + int err; + + *retpage = NULL; + do { + /* + * First check the swap cache. Since this is normally + * called after lookup_swap_cache() failed, re-calling + * that would confuse statistics. + */ + found_page = find_get_page(swapper_space, entry.val); + if (found_page) + break; + + /* + * Get a new page to read into from swap. + */ + if (!new_page) { + new_page = alloc_page(GFP_KERNEL); + if (!new_page) + break; /* Out of memory */ + } + + /* + * call radix_tree_preload() while we can wait. + */ + err = radix_tree_preload(GFP_KERNEL); + if (err) + break; + + /* + * Swap entry may have been freed since our caller observed it. + */ + err = swapcache_prepare(entry); + if (err == -EEXIST) { /* seems racy */ + radix_tree_preload_end(); + continue; + } + if (err) { /* swp entry is obsolete ? */ + radix_tree_preload_end(); + break; + } + + /* May fail (-ENOMEM) if radix-tree node allocation failed. */ + __set_page_locked(new_page); + SetPageSwapBacked(new_page); + err = __add_to_swap_cache(new_page, entry); + if (likely(!err)) { + radix_tree_preload_end(); + lru_cache_add_anon(new_page); + *retpage = new_page; + return ZSWAP_SWAPCACHE_NEW; + } + radix_tree_preload_end(); + ClearPageSwapBacked(new_page); + __clear_page_locked(new_page); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. 
+ */ + swapcache_free(entry); + } while (err != -ENOMEM); + + if (new_page) + page_cache_release(new_page); + if (!found_page) + return ZSWAP_SWAPCACHE_FAIL; + *retpage = found_page; + return ZSWAP_SWAPCACHE_EXIST; +} + +/* + * Attempts to free an entry by adding a page to the swap cache, + * decompressing the entry data into the page, and issuing a + * bio write to write the page back to the swap device. + * + * This can be thought of as a "resumed writeback" of the page + * to the swap device. We are basically resuming the same swap + * writeback path that was intercepted with the frontswap_store() + * in the first place. After the page has been decompressed into + * the swap cache, the compressed version stored by zswap can be + * freed. + */ +static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) +{ + struct zswap_header *zhdr; + swp_entry_t swpentry; + struct zswap_tree *tree; + pgoff_t offset; + struct zswap_entry *entry; + struct page *page; + u8 *src, *dst; + unsigned int dlen; + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + /* extract swpentry from data */ + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); + swpentry = zhdr->swpentry; /* here */ + zpool_unmap_handle(pool, handle); + tree = zswap_trees[swp_type(swpentry)]; + offset = swp_offset(swpentry); + + /* find and ref zswap entry */ + spin_lock(&tree->lock); + entry = zswap_entry_find_get(&tree->rbroot, offset); + if (!entry) { + /* entry was invalidated */ + spin_unlock(&tree->lock); + return 0; + } + spin_unlock(&tree->lock); + BUG_ON(offset != entry->offset); + + /* try to allocate swap cache page */ + switch (zswap_get_swap_cache_page(swpentry, &page)) { + case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ + ret = -ENOMEM; + goto fail; + + case ZSWAP_SWAPCACHE_EXIST: + /* page is already in the swap cache, ignore for now */ + page_cache_release(page); + ret = -EEXIST; + goto fail; + + case ZSWAP_SWAPCACHE_NEW: /* page is 
locked */ + /* decompress */ + dlen = PAGE_SIZE; + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); + dst = kmap_atomic(page); + ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, + entry->length, dst, &dlen); + kunmap_atomic(dst); + zpool_unmap_handle(zswap_pool, entry->handle); + BUG_ON(ret); + BUG_ON(dlen != PAGE_SIZE); + + /* page is up to date */ + SetPageUptodate(page); + } + + /* move it to the tail of the inactive list after end_writeback */ + SetPageReclaim(page); + + /* start writeback */ + __swap_writepage(page, &wbc, end_swap_bio_write); + page_cache_release(page); + zswap_written_back_pages++; + + spin_lock(&tree->lock); + /* drop local reference */ + zswap_entry_put(tree, entry); + + /* + * There are two possible situations for entry here: + * (1) refcount is 1(normal case), entry is valid and on the tree + * (2) refcount is 0, entry is freed and not on the tree + * because invalidate happened during writeback + * search the tree and free the entry if find entry + */ + if (entry == zswap_rb_search(&tree->rbroot, offset)) + zswap_entry_put(tree, entry); + spin_unlock(&tree->lock); + + goto end; + + /* + * if we get here due to ZSWAP_SWAPCACHE_EXIST + * a load may happening concurrently + * it is safe and okay to not free the entry + * if we free the entry in the following put + * it it either okay to return !0 + */ +fail: + spin_lock(&tree->lock); + zswap_entry_put(tree, entry); + spin_unlock(&tree->lock); + +end: + return ret; +} + +/********************************* +* frontswap hooks +**********************************/ +/* attempts to compress and store an single page */ +static int zswap_frontswap_store(unsigned type, pgoff_t offset, + struct page *page) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry, *dupentry; + int ret; + unsigned int dlen = PAGE_SIZE, len; + unsigned long handle; + char *buf; + u8 *src, *dst; + struct zswap_header *zhdr; + + if (!tree) { + ret = 
-ENODEV; + goto reject; + } + + /* reclaim space if needed */ + if (zswap_is_full()) { + zswap_pool_limit_hit++; + if (zpool_shrink(zswap_pool, 1, NULL)) { + zswap_reject_reclaim_fail++; + ret = -ENOMEM; + goto reject; + } + } + + /* allocate entry */ + entry = zswap_entry_cache_alloc(GFP_KERNEL); + if (!entry) { + zswap_reject_kmemcache_fail++; + ret = -ENOMEM; + goto reject; + } + + /* compress */ + dst = get_cpu_var(zswap_dstmem); + src = kmap_atomic(page); + ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen); + kunmap_atomic(src); + if (ret) { + ret = -EINVAL; + goto freepage; + } + + /* store */ + len = dlen + sizeof(struct zswap_header); + ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, + &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto freepage; + } + if (ret) { + zswap_reject_alloc_fail++; + goto freepage; + } + zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); + zhdr->swpentry = swp_entry(type, offset); + buf = (u8 *)(zhdr + 1); + memcpy(buf, dst, dlen); + zpool_unmap_handle(zswap_pool, handle); + put_cpu_var(zswap_dstmem); + + /* populate entry */ + entry->offset = offset; + entry->handle = handle; + entry->length = dlen; + + /* map */ + spin_lock(&tree->lock); + do { + ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); + if (ret == -EEXIST) { + zswap_duplicate_entry++; + /* remove from rbtree */ + zswap_rb_erase(&tree->rbroot, dupentry); + zswap_entry_put(tree, dupentry); + } + } while (ret == -EEXIST); + spin_unlock(&tree->lock); + + /* update stats */ + atomic_inc(&zswap_stored_pages); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); + + return 0; + +freepage: + put_cpu_var(zswap_dstmem); + zswap_entry_cache_free(entry); +reject: + return ret; +} + +/* + * returns 0 if the page was successfully decompressed + * return -1 on entry not found or error +*/ +static int zswap_frontswap_load(unsigned type, pgoff_t offset, + struct page *page) +{ + struct zswap_tree *tree = 
zswap_trees[type]; + struct zswap_entry *entry; + u8 *src, *dst; + unsigned int dlen; + int ret; + + /* find */ + spin_lock(&tree->lock); + entry = zswap_entry_find_get(&tree->rbroot, offset); + if (!entry) { + /* entry was written back */ + spin_unlock(&tree->lock); + return -1; + } + spin_unlock(&tree->lock); + + /* decompress */ + dlen = PAGE_SIZE; + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); + dst = kmap_atomic(page); + ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, + dst, &dlen); + kunmap_atomic(dst); + zpool_unmap_handle(zswap_pool, entry->handle); + BUG_ON(ret); + + spin_lock(&tree->lock); + zswap_entry_put(tree, entry); + spin_unlock(&tree->lock); + + return 0; +} + +/* frees an entry in zswap */ +static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry; + + /* find */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (!entry) { + /* entry was written back */ + spin_unlock(&tree->lock); + return; + } + + /* remove from rbtree */ + zswap_rb_erase(&tree->rbroot, entry); + + /* drop the initial reference from entry creation */ + zswap_entry_put(tree, entry); + + spin_unlock(&tree->lock); +} + +/* frees all zswap entries for the given swap type */ +static void zswap_frontswap_invalidate_area(unsigned type) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry, *n; + + if (!tree) + return; + + /* walk the tree and free everything */ + spin_lock(&tree->lock); + rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) + zswap_free_entry(entry); + tree->rbroot = RB_ROOT; + spin_unlock(&tree->lock); + kfree(tree); + zswap_trees[type] = NULL; +} + +static struct zpool_ops zswap_zpool_ops = { + .evict = zswap_writeback_entry +}; + +static void zswap_frontswap_init(unsigned type) +{ + struct zswap_tree *tree; + + tree = 
kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); + if (!tree) { + pr_err("alloc failed, zswap disabled for swap type %d\n", type); + return; + } + + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); + zswap_trees[type] = tree; +} + +static struct frontswap_ops zswap_frontswap_ops = { + .store = zswap_frontswap_store, + .load = zswap_frontswap_load, + .invalidate_page = zswap_frontswap_invalidate_page, + .invalidate_area = zswap_frontswap_invalidate_area, + .init = zswap_frontswap_init +}; + +/********************************* +* debugfs functions +**********************************/ +#ifdef CONFIG_DEBUG_FS +#include + +static struct dentry *zswap_debugfs_root; + +static int __init zswap_debugfs_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zswap_debugfs_root = debugfs_create_dir("zswap", NULL); + if (!zswap_debugfs_root) + return -ENOMEM; + + debugfs_create_u64("pool_limit_hit", S_IRUGO, + zswap_debugfs_root, &zswap_pool_limit_hit); + debugfs_create_u64("reject_reclaim_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_reclaim_fail); + debugfs_create_u64("reject_alloc_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_alloc_fail); + debugfs_create_u64("reject_kmemcache_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_kmemcache_fail); + debugfs_create_u64("reject_compress_poor", S_IRUGO, + zswap_debugfs_root, &zswap_reject_compress_poor); + debugfs_create_u64("written_back_pages", S_IRUGO, + zswap_debugfs_root, &zswap_written_back_pages); + debugfs_create_u64("duplicate_entry", S_IRUGO, + zswap_debugfs_root, &zswap_duplicate_entry); + debugfs_create_u64("pool_total_size", S_IRUGO, + zswap_debugfs_root, &zswap_pool_total_size); + debugfs_create_atomic_t("stored_pages", S_IRUGO, + zswap_debugfs_root, &zswap_stored_pages); + + return 0; +} + +static void __exit zswap_debugfs_exit(void) +{ + debugfs_remove_recursive(zswap_debugfs_root); +} +#else +static int __init zswap_debugfs_init(void) +{ + return 0; +} + +static void __exit 
zswap_debugfs_exit(void) { } +#endif + +/********************************* +* module init and exit +**********************************/ +static int __init init_zswap(void) +{ + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + + if (!zswap_enabled) + return 0; + + pr_info("loading zswap\n"); + + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, + &zswap_zpool_ops); + if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_info("%s zpool not available\n", zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, + &zswap_zpool_ops); + } + if (!zswap_pool) { + pr_err("%s zpool not available\n", zswap_zpool_type); + pr_err("zpool creation failed\n"); + goto error; + } + pr_info("using %s pool\n", zswap_zpool_type); + + if (zswap_entry_cache_create()) { + pr_err("entry cache creation failed\n"); + goto cachefail; + } + if (zswap_comp_init()) { + pr_err("compressor initialization failed\n"); + goto compfail; + } + if (zswap_cpu_init()) { + pr_err("per-cpu initialization failed\n"); + goto pcpufail; + } + + frontswap_register_ops(&zswap_frontswap_ops); + if (zswap_debugfs_init()) + pr_warn("debugfs initialization failed\n"); + return 0; +pcpufail: + zswap_comp_exit(); +compfail: + zswap_entry_cache_destroy(); +cachefail: + zpool_destroy_pool(zswap_pool); +error: + return -ENOMEM; +} +/* must be late so crypto has time to come up */ +late_initcall(init_zswap); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Seth Jennings "); +MODULE_DESCRIPTION("Compressed cache for swap pages"); -- cgit 1.2.3-korg