From e09b41010ba33a20a87472ee821fa407a5b8da36 Mon Sep 17 00:00:00 2001 From: José Pekkarinen Date: Mon, 11 Apr 2016 10:41:07 +0300 Subject: These changes are the raw update to linux-4.4.6-rt14. Kernel sources are taken from kernel.org, and rt patch from the rt wiki download page. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During the rebasing, the following patch collided: Force tick interrupt and get rid of softirq magic(I70131fb85). Collisions have been removed because its logic was found on the source already. Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769 Signed-off-by: José Pekkarinen --- kernel/arch/powerpc/mm/Makefile | 1 + kernel/arch/powerpc/mm/copro_fault.c | 9 +- kernel/arch/powerpc/mm/fault.c | 4 + kernel/arch/powerpc/mm/fsl_booke_mmu.c | 28 ++- kernel/arch/powerpc/mm/hash_low_64.S | 4 +- kernel/arch/powerpc/mm/hash_native_64.c | 25 ++- kernel/arch/powerpc/mm/hash_utils_64.c | 23 +- kernel/arch/powerpc/mm/hugetlbpage.c | 59 ++++-- kernel/arch/powerpc/mm/mem.c | 20 +- kernel/arch/powerpc/mm/mmu_context_hash64.c | 6 + kernel/arch/powerpc/mm/mmu_context_iommu.c | 316 ++++++++++++++++++++++++++++ kernel/arch/powerpc/mm/mmu_decl.h | 4 +- kernel/arch/powerpc/mm/numa.c | 23 +- kernel/arch/powerpc/mm/pgtable_64.c | 83 +++----- kernel/arch/powerpc/mm/slb.c | 69 +++--- kernel/arch/powerpc/mm/tlb_hash64.c | 9 +- kernel/arch/powerpc/mm/tlb_low_64e.S | 86 ++++++-- kernel/arch/powerpc/mm/tlb_nohash.c | 43 +++- kernel/arch/powerpc/mm/tlb_nohash_low.S | 63 ++++++ 19 files changed, 694 insertions(+), 181 deletions(-) create mode 100644 kernel/arch/powerpc/mm/mmu_context_iommu.c (limited to 'kernel/arch/powerpc/mm') diff --git a/kernel/arch/powerpc/mm/Makefile b/kernel/arch/powerpc/mm/Makefile index 9c8770b5f..3eb73a382 100644 --- a/kernel/arch/powerpc/mm/Makefile +++ b/kernel/arch/powerpc/mm/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o +obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o diff --git a/kernel/arch/powerpc/mm/copro_fault.c b/kernel/arch/powerpc/mm/copro_fault.c index f031a47d7..6527882ce 100644 --- a/kernel/arch/powerpc/mm/copro_fault.c +++ b/kernel/arch/powerpc/mm/copro_fault.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include /* * This ought to be kept in sync with the powerpc specific do_page_fault @@ -100,7 +100,7 @@ EXPORT_SYMBOL_GPL(copro_handle_mm_fault); int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) { - u64 vsid; + u64 vsid, vsidkey; int psize, ssize; switch (REGION_ID(ea)) { @@ -109,6 +109,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) psize = get_slice_psize(mm, ea); ssize = user_segment_size(ea); vsid = get_vsid(mm->context.id, ea, ssize); + vsidkey = SLB_VSID_USER; break; case VMALLOC_REGION_ID: pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea); @@ -118,19 +119,21 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) psize = mmu_io_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + vsidkey = SLB_VSID_KERNEL; break; case KERNEL_REGION_ID: pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea); psize = mmu_linear_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + vsidkey = SLB_VSID_KERNEL; break; default: pr_debug("%s: invalid region access at %016llx\n", __func__, ea); return 1; } - vsid = (vsid << slb_vsid_shift(ssize)) | SLB_VSID_USER; + vsid = (vsid << slb_vsid_shift(ssize)) | vsidkey; vsid |= mmu_psize_defs[psize].sllp | ((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0); diff --git a/kernel/arch/powerpc/mm/fault.c b/kernel/arch/powerpc/mm/fault.c index 6d535973b..a67c6d781 100644 --- a/kernel/arch/powerpc/mm/fault.c +++ b/kernel/arch/powerpc/mm/fault.c @@ -529,6 +529,10 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) printk(KERN_ALERT "Unable to handle kernel paging request for " "instruction fetch\n"); break; + case 0x600: + printk(KERN_ALERT "Unable to handle kernel paging request for " + "unaligned access at address 0x%08lx\n", regs->dar); + break; default: printk(KERN_ALERT "Unable to handle kernel paging request for " "unknown fault\n"); diff --git a/kernel/arch/powerpc/mm/fsl_booke_mmu.c b/kernel/arch/powerpc/mm/fsl_booke_mmu.c index 9c90e66cf..f3afe3d97 100644 --- a/kernel/arch/powerpc/mm/fsl_booke_mmu.c +++ b/kernel/arch/powerpc/mm/fsl_booke_mmu.c @@ -112,7 +112,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, tsize = __ilog2(size) - 10; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) if ((flags & _PAGE_NO_CACHE) == 0) flags |= _PAGE_COHERENT; #endif @@ -141,8 +141,6 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, tlbcam_addrs[index].start = virt; tlbcam_addrs[index].limit = virt + size - 1; tlbcam_addrs[index].phys = phys; - - loadcam_entry(index); } unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, @@ -171,7 +169,8 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, } static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, - unsigned long ram, int max_cam_idx) + unsigned long ram, int max_cam_idx, + bool dryrun) { int i; unsigned long amount_mapped = 0; @@ -181,13 +180,20 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, unsigned long cam_sz; cam_sz = calc_cam_sz(ram, virt, phys); - settlbcam(i, virt, phys, cam_sz, pgprot_val(PAGE_KERNEL_X), 0); + if (!dryrun) + settlbcam(i, virt, phys, cam_sz, + pgprot_val(PAGE_KERNEL_X), 0); ram -= cam_sz; amount_mapped += cam_sz; virt += cam_sz; phys += cam_sz; } + + if (dryrun) + return amount_mapped; + + loadcam_multi(0, i, max_cam_idx); tlbcam_index = i; #ifdef CONFIG_PPC64 @@ -199,12 +205,12 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, return amount_mapped; } -unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx) +unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun) { unsigned long virt = PAGE_OFFSET; phys_addr_t phys = memstart_addr; - return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx); + return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun); } #ifdef CONFIG_PPC32 @@ -235,7 +241,7 @@ void __init adjust_total_lowmem(void) ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); i = switch_to_as1(); - __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM); + __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false); restore_to_as0(i, 0, 0, 1); pr_info("Memory CAM mapping: "); @@ -303,10 +309,12 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) n = switch_to_as1(); /* map a 64M area for the second relocation */ if (memstart_addr > start) - map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM); + map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM, + false); else map_mem_in_cams_addr(start, PAGE_OFFSET + offset, - 0x4000000, CONFIG_LOWMEM_CAM_NUM); + 0x4000000, CONFIG_LOWMEM_CAM_NUM, + false); restore_to_as0(n, offset, __va(dt_ptr), 1); /* We should never reach here */ panic("Relocation error"); diff --git a/kernel/arch/powerpc/mm/hash_low_64.S b/kernel/arch/powerpc/mm/hash_low_64.S index 463174a4a..3b49e3295 100644 --- a/kernel/arch/powerpc/mm/hash_low_64.S +++ b/kernel/arch/powerpc/mm/hash_low_64.S @@ -701,7 +701,7 @@ htab_pte_insert_failure: #endif /* CONFIG_PPC_64K_PAGES */ -#ifdef CONFIG_PPC_HAS_HASH_64K +#ifdef CONFIG_PPC_64K_PAGES /***************************************************************************** * * @@ -993,7 +993,7 @@ ht64_pte_insert_failure: b ht64_bail -#endif /* CONFIG_PPC_HAS_HASH_64K */ +#endif /* CONFIG_PPC_64K_PAGES */ /***************************************************************************** diff --git a/kernel/arch/powerpc/mm/hash_native_64.c b/kernel/arch/powerpc/mm/hash_native_64.c index 9c4880dde..c8822af10 100644 --- a/kernel/arch/powerpc/mm/hash_native_64.c +++ b/kernel/arch/powerpc/mm/hash_native_64.c @@ -29,7 +29,7 @@ #include #include -#include +#include #ifdef DEBUG_LOW #define DBG_LOW(fmt...) udbg_printf(fmt) @@ -582,13 +582,21 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, * be when they isi), and we are the only one left. We rely on our kernel * mapping being 0xC0's and the hardware ignoring those two real bits. * + * This must be called with interrupts disabled. + * + * Taking the native_tlbie_lock is unsafe here due to the possibility of + * lockdep being on. On pre POWER5 hardware, not taking the lock could + * cause deadlock. POWER5 and newer not taking the lock is fine. This only + * gets called during boot before secondary CPUs have come up and during + * crashdump and all bets are off anyway. + * * TODO: add batching support when enabled. remember, no dynamic memory here, * athough there is the control page available... */ static void native_hpte_clear(void) { unsigned long vpn = 0; - unsigned long slot, slots, flags; + unsigned long slot, slots; struct hash_pte *hptep = htab_address; unsigned long hpte_v; unsigned long pteg_count; @@ -596,13 +604,6 @@ static void native_hpte_clear(void) pteg_count = htab_hash_mask + 1; - local_irq_save(flags); - - /* we take the tlbie lock and hold it. Some hardware will - * deadlock if we try to tlbie from two processors at once. - */ - raw_spin_lock(&native_tlbie_lock); - slots = pteg_count * HPTES_PER_GROUP; for (slot = 0; slot < slots; slot++, hptep++) { @@ -614,8 +615,8 @@ static void native_hpte_clear(void) hpte_v = be64_to_cpu(hptep->v); /* - * Call __tlbie() here rather than tlbie() since we - * already hold the native_tlbie_lock. + * Call __tlbie() here rather than tlbie() since we can't take the + * native_tlbie_lock. */ if (hpte_v & HPTE_V_VALID) { hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn); @@ -625,8 +626,6 @@ static void native_hpte_clear(void) } asm volatile("eieio; tlbsync; ptesync":::"memory"); - raw_spin_unlock(&native_tlbie_lock); - local_irq_restore(flags); } /* diff --git a/kernel/arch/powerpc/mm/hash_utils_64.c b/kernel/arch/powerpc/mm/hash_utils_64.c index fda236f90..7f9616f7c 100644 --- a/kernel/arch/powerpc/mm/hash_utils_64.c +++ b/kernel/arch/powerpc/mm/hash_utils_64.c @@ -57,6 +57,7 @@ #include #include #include +#include #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -639,7 +640,7 @@ extern u32 ht64_call_hpte_updatepp[]; static void __init htab_finish_init(void) { -#ifdef CONFIG_PPC_HAS_HASH_64K +#ifdef CONFIG_PPC_64K_PAGES patch_branch(ht64_call_hpte_insert1, ppc_function_entry(ppc_md.hpte_insert), BRANCH_SET_LINK); @@ -652,7 +653,7 @@ static void __init htab_finish_init(void) patch_branch(ht64_call_hpte_updatepp, ppc_function_entry(ppc_md.hpte_updatepp), BRANCH_SET_LINK); -#endif /* CONFIG_PPC_HAS_HASH_64K */ +#endif /* CONFIG_PPC_64K_PAGES */ patch_branch(htab_call_hpte_insert1, ppc_function_entry(ppc_md.hpte_insert), @@ -993,6 +994,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, unsigned long access, unsigned long trap, unsigned long flags) { + bool is_thp; enum ctx_state prev_state = exception_enter(); pgd_t *pgdir; unsigned long vsid; @@ -1004,6 +1006,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", ea, access, trap); + trace_hash_fault(ea, access, trap); /* Get region & vsid */ switch (REGION_ID(ea)) { @@ -1066,7 +1069,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, #endif /* CONFIG_PPC_64K_PAGES */ /* Get PTE and page size from page tables */ - ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); + ptep = __find_linux_pte_or_hugepte(pgdir, ea, &is_thp, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); rc = 1; @@ -1086,7 +1089,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, } if (hugeshift) { - if (pmd_trans_huge(*(pmd_t *)ptep)) + if (is_thp) rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, trap, flags, ssize, psize); #ifdef CONFIG_HUGETLB_PAGE @@ -1149,12 +1152,12 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, check_paca_psize(ea, mm, psize, user_region); #endif /* CONFIG_PPC_64K_PAGES */ -#ifdef CONFIG_PPC_HAS_HASH_64K +#ifdef CONFIG_PPC_64K_PAGES if (psize == MMU_PAGE_64K) rc = __hash_page_64K(ea, access, vsid, ptep, trap, flags, ssize); else -#endif /* CONFIG_PPC_HAS_HASH_64K */ +#endif /* CONFIG_PPC_64K_PAGES */ { int spp = subpage_protection(mm, ea); if (access & spp) @@ -1241,7 +1244,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, * THP pages use update_mmu_cache_pmd. We don't do * hash preload there. Hence can ignore THP here */ - ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift); + ptep = find_linux_pte_or_hugepte(pgdir, ea, NULL, &hugepage_shift); if (!ptep) goto out_exit; @@ -1262,12 +1265,12 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, update_flags |= HPTE_LOCAL_UPDATE; /* Hash it in */ -#ifdef CONFIG_PPC_HAS_HASH_64K +#ifdef CONFIG_PPC_64K_PAGES if (mm->context.user_psize == MMU_PAGE_64K) rc = __hash_page_64K(ea, access, vsid, ptep, trap, update_flags, ssize); else -#endif /* CONFIG_PPC_HAS_HASH_64K */ +#endif /* CONFIG_PPC_64K_PAGES */ rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags, ssize, subpage_protection(mm, ea)); @@ -1475,7 +1478,7 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long hash; unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL); + unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); long ret; hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); diff --git a/kernel/arch/powerpc/mm/hugetlbpage.c b/kernel/arch/powerpc/mm/hugetlbpage.c index 3385e3d05..9833fee49 100644 --- a/kernel/arch/powerpc/mm/hugetlbpage.c +++ b/kernel/arch/powerpc/mm/hugetlbpage.c @@ -89,6 +89,25 @@ int pgd_huge(pgd_t pgd) */ return ((pgd_val(pgd) & 0x3) != 0x0); } + +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_DEBUG_VM) +/* + * This enables us to catch the wrong page directory format + * Moved here so that we can use WARN() in the call. + */ +int hugepd_ok(hugepd_t hpd) +{ + bool is_hugepd; + + /* + * We should not find this format in page directory, warn otherwise. + */ + is_hugepd = (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0)); + WARN(is_hugepd, "Found wrong page directory format\n"); + return 0; +} +#endif + #else int pmd_huge(pmd_t pmd) { @@ -109,7 +128,7 @@ int pgd_huge(pgd_t pgd) pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { /* Only called for hugetlbfs pages, hence can ignore THP */ - return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL); + return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, @@ -336,7 +355,7 @@ int alloc_bootmem_huge_page(struct hstate *hstate) unsigned long gpage_npages[MMU_PAGE_COUNT]; static int __init do_gpage_early_setup(char *param, char *val, - const char *unused) + const char *unused, void *arg) { static phys_addr_t size; unsigned long npages; @@ -385,7 +404,7 @@ void __init reserve_hugetlb_gpages(void) strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0, - &do_gpage_early_setup); + NULL, &do_gpage_early_setup); /* * Walk gpage list in reverse, allocating larger page sizes first. @@ -439,11 +458,6 @@ int alloc_bootmem_huge_page(struct hstate *hstate) } #endif -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) -{ - return 0; -} - #ifdef CONFIG_PPC_FSL_BOOK3E #define HUGEPD_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) @@ -689,13 +703,14 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { + bool is_thp; pte_t *ptep, pte; unsigned shift; unsigned long mask, flags; struct page *page = ERR_PTR(-EINVAL); local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); + ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift); if (!ptep) goto no_page; pte = READ_ONCE(*ptep); @@ -704,7 +719,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) * Transparent hugepages are handled by generic code. We can skip them * here. */ - if (!shift || pmd_trans_huge(__pmd(pte_val(pte)))) + if (!shift || is_thp) goto no_page; if (!pte_present(pte)) { @@ -813,14 +828,6 @@ static int __init add_huge_page_size(unsigned long long size) if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) return -EINVAL; -#ifdef CONFIG_SPU_FS_64K_LS - /* Disable support for 64K huge pages when 64K SPU local store - * support is enabled as the current implementation conflicts. - */ - if (shift == PAGE_SHIFT_64K) - return -EINVAL; -#endif /* CONFIG_SPU_FS_64K_LS */ - BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); /* Return if huge page size has already been setup */ @@ -933,7 +940,7 @@ static int __init hugetlbpage_init(void) return 0; } #endif -module_init(hugetlbpage_init); +arch_initcall(hugetlbpage_init); void flush_dcache_icache_hugepage(struct page *page) { @@ -969,7 +976,7 @@ void flush_dcache_icache_hugepage(struct page *page) */ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - unsigned *shift) + bool *is_thp, unsigned *shift) { pgd_t pgd, *pgdp; pud_t pud, *pudp; @@ -981,6 +988,9 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, if (shift) *shift = 0; + if (is_thp) + *is_thp = false; + pgdp = pgdir + pgd_index(ea); pgd = READ_ONCE(*pgdp); /* @@ -1028,7 +1038,14 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, if (pmd_none(pmd)) return NULL; - if (pmd_huge(pmd) || pmd_large(pmd)) { + if (pmd_trans_huge(pmd)) { + if (is_thp) + *is_thp = true; + ret_pte = (pte_t *) pmdp; + goto out; + } + + if (pmd_huge(pmd)) { ret_pte = (pte_t *) pmdp; goto out; } else if (is_hugepd(__hugepd(pmd_val(pmd)))) diff --git a/kernel/arch/powerpc/mm/mem.c b/kernel/arch/powerpc/mm/mem.c index 45fda71fe..22d94c3e6 100644 --- a/kernel/arch/powerpc/mm/mem.c +++ b/kernel/arch/powerpc/mm/mem.c @@ -113,7 +113,7 @@ int memory_add_physaddr_to_nid(u64 start) } #endif -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdata; struct zone *zone; @@ -128,7 +128,7 @@ int arch_add_memory(int nid, u64 start, u64 size) /* this should work for most non-highmem platforms */ zone = pgdata->node_zones + - zone_for_memory(nid, start, size, 0); + zone_for_memory(nid, start, size, 0, for_device); return __add_pages(nid, zone, start_pfn, nr_pages); } @@ -414,17 +414,17 @@ void flush_dcache_icache_page(struct page *page) return; } #endif -#ifdef CONFIG_BOOKE - { +#if defined(CONFIG_8xx) || defined(CONFIG_PPC64) + /* On 8xx there is no need to kmap since highmem is not supported */ + __flush_dcache_icache(page_address(page)); +#else + if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { void *start = kmap_atomic(page); __flush_dcache_icache(start); kunmap_atomic(start); + } else { + __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); } -#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) - /* On 8xx there is no need to kmap since highmem is not supported */ - __flush_dcache_icache(page_address(page)); -#else - __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); #endif } EXPORT_SYMBOL(flush_dcache_icache_page); @@ -560,7 +560,7 @@ subsys_initcall(add_system_ram_resources); */ int devmem_is_allowed(unsigned long pfn) { - if (iomem_is_exclusive(pfn << PAGE_SHIFT)) + if (iomem_is_exclusive(PFN_PHYS(pfn))) return 0; if (!page_is_ram(pfn)) return 1; diff --git a/kernel/arch/powerpc/mm/mmu_context_hash64.c b/kernel/arch/powerpc/mm/mmu_context_hash64.c index 178876aef..4e4efbc26 100644 --- a/kernel/arch/powerpc/mm/mmu_context_hash64.c +++ b/kernel/arch/powerpc/mm/mmu_context_hash64.c @@ -88,6 +88,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) #ifdef CONFIG_PPC_64K_PAGES mm->context.pte_frag = NULL; +#endif +#ifdef CONFIG_SPAPR_TCE_IOMMU + mm_iommu_init(&mm->context); #endif return 0; } @@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm) void destroy_context(struct mm_struct *mm) { +#ifdef CONFIG_SPAPR_TCE_IOMMU + mm_iommu_cleanup(&mm->context); +#endif #ifdef CONFIG_PPC_ICSWX drop_cop(mm->context.acop, mm); diff --git a/kernel/arch/powerpc/mm/mmu_context_iommu.c b/kernel/arch/powerpc/mm/mmu_context_iommu.c new file mode 100644 index 000000000..da6a2168a --- /dev/null +++ b/kernel/arch/powerpc/mm/mmu_context_iommu.c @@ -0,0 +1,316 @@ +/* + * IOMMU helpers in MMU context. + * + * Copyright (C) 2015 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(mem_list_mutex); + +struct mm_iommu_table_group_mem_t { + struct list_head next; + struct rcu_head rcu; + unsigned long used; + atomic64_t mapped; + u64 ua; /* userspace address */ + u64 entries; /* number of entries in hpas[] */ + u64 *hpas; /* vmalloc'ed */ +}; + +static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, + unsigned long npages, bool incr) +{ + long ret = 0, locked, lock_limit; + + if (!npages) + return 0; + + down_write(&mm->mmap_sem); + + if (incr) { + locked = mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + ret = -ENOMEM; + else + mm->locked_vm += npages; + } else { + if (WARN_ON_ONCE(npages > mm->locked_vm)) + npages = mm->locked_vm; + mm->locked_vm -= npages; + } + + pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", + current->pid, + incr ? '+' : '-', + npages << PAGE_SHIFT, + mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK)); + up_write(&mm->mmap_sem); + + return ret; +} + +bool mm_iommu_preregistered(void) +{ + if (!current || !current->mm) + return false; + + return !list_empty(¤t->mm->context.iommu_group_mem_list); +} +EXPORT_SYMBOL_GPL(mm_iommu_preregistered); + +long mm_iommu_get(unsigned long ua, unsigned long entries, + struct mm_iommu_table_group_mem_t **pmem) +{ + struct mm_iommu_table_group_mem_t *mem; + long i, j, ret = 0, locked_entries = 0; + struct page *page = NULL; + + if (!current || !current->mm) + return -ESRCH; /* process exited */ + + mutex_lock(&mem_list_mutex); + + list_for_each_entry_rcu(mem, ¤t->mm->context.iommu_group_mem_list, + next) { + if ((mem->ua == ua) && (mem->entries == entries)) { + ++mem->used; + *pmem = mem; + goto unlock_exit; + } + + /* Overlap? */ + if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && + (ua < (mem->ua + + (mem->entries << PAGE_SHIFT)))) { + ret = -EINVAL; + goto unlock_exit; + } + + } + + ret = mm_iommu_adjust_locked_vm(current->mm, entries, true); + if (ret) + goto unlock_exit; + + locked_entries = entries; + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) { + ret = -ENOMEM; + goto unlock_exit; + } + + mem->hpas = vzalloc(entries * sizeof(mem->hpas[0])); + if (!mem->hpas) { + kfree(mem); + ret = -ENOMEM; + goto unlock_exit; + } + + for (i = 0; i < entries; ++i) { + if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT), + 1/* pages */, 1/* iswrite */, &page)) { + for (j = 0; j < i; ++j) + put_page(pfn_to_page( + mem->hpas[j] >> PAGE_SHIFT)); + vfree(mem->hpas); + kfree(mem); + ret = -EFAULT; + goto unlock_exit; + } + + mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; + } + + atomic64_set(&mem->mapped, 1); + mem->used = 1; + mem->ua = ua; + mem->entries = entries; + *pmem = mem; + + list_add_rcu(&mem->next, ¤t->mm->context.iommu_group_mem_list); + +unlock_exit: + if (locked_entries && ret) + mm_iommu_adjust_locked_vm(current->mm, locked_entries, false); + + mutex_unlock(&mem_list_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_get); + +static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) +{ + long i; + struct page *page = NULL; + + for (i = 0; i < mem->entries; ++i) { + if (!mem->hpas[i]) + continue; + + page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT); + if (!page) + continue; + + put_page(page); + mem->hpas[i] = 0; + } +} + +static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem) +{ + + mm_iommu_unpin(mem); + vfree(mem->hpas); + kfree(mem); +} + +static void mm_iommu_free(struct rcu_head *head) +{ + struct mm_iommu_table_group_mem_t *mem = container_of(head, + struct mm_iommu_table_group_mem_t, rcu); + + mm_iommu_do_free(mem); +} + +static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) +{ + list_del_rcu(&mem->next); + mm_iommu_adjust_locked_vm(current->mm, mem->entries, false); + call_rcu(&mem->rcu, mm_iommu_free); +} + +long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem) +{ + long ret = 0; + + if (!current || !current->mm) + return -ESRCH; /* process exited */ + + mutex_lock(&mem_list_mutex); + + if (mem->used == 0) { + ret = -ENOENT; + goto unlock_exit; + } + + --mem->used; + /* There are still users, exit */ + if (mem->used) + goto unlock_exit; + + /* Are there still mappings? */ + if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) { + ++mem->used; + ret = -EBUSY; + goto unlock_exit; + } + + /* @mapped became 0 so now mappings are disabled, release the region */ + mm_iommu_release(mem); + +unlock_exit: + mutex_unlock(&mem_list_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_put); + +struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua, + unsigned long size) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_rcu(mem, + ¤t->mm->context.iommu_group_mem_list, + next) { + if ((mem->ua <= ua) && + (ua + size <= mem->ua + + (mem->entries << PAGE_SHIFT))) { + ret = mem; + break; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_lookup); + +struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua, + unsigned long entries) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_rcu(mem, + ¤t->mm->context.iommu_group_mem_list, + next) { + if ((mem->ua == ua) && (mem->entries == entries)) { + ret = mem; + break; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_find); + +long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned long *hpa) +{ + const long entry = (ua - mem->ua) >> PAGE_SHIFT; + u64 *va = &mem->hpas[entry]; + + if (entry >= mem->entries) + return -EFAULT; + + *hpa = *va | (ua & ~PAGE_MASK); + + return 0; +} +EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); + +long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) +{ + if (atomic64_inc_not_zero(&mem->mapped)) + return 0; + + /* Last mm_iommu_put() has been called, no more mappings allowed() */ + return -ENXIO; +} +EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc); + +void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem) +{ + atomic64_add_unless(&mem->mapped, -1, 1); +} +EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec); + +void mm_iommu_init(mm_context_t *ctx) +{ + INIT_LIST_HEAD_RCU(&ctx->iommu_group_mem_list); +} + +void mm_iommu_cleanup(mm_context_t *ctx) +{ + struct mm_iommu_table_group_mem_t *mem, *tmp; + + list_for_each_entry_safe(mem, tmp, &ctx->iommu_group_mem_list, next) { + list_del_rcu(&mem->next); + mm_iommu_do_free(mem); + } +} diff --git a/kernel/arch/powerpc/mm/mmu_decl.h b/kernel/arch/powerpc/mm/mmu_decl.h index 085b66b10..9f58ff44a 100644 --- a/kernel/arch/powerpc/mm/mmu_decl.h +++ b/kernel/arch/powerpc/mm/mmu_decl.h @@ -141,7 +141,8 @@ extern void MMU_init_hw(void); extern unsigned long mmu_mapin_ram(unsigned long top); #elif defined(CONFIG_PPC_FSL_BOOK3E) -extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx); +extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, + bool dryrun); extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, phys_addr_t phys); #ifdef CONFIG_PPC32 @@ -152,6 +153,7 @@ extern int switch_to_as1(void); extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu); #endif extern void loadcam_entry(unsigned int index); +extern void loadcam_multi(int first_idx, int num, int tmp_idx); struct tlbcam { u32 MAS0; diff --git a/kernel/arch/powerpc/mm/numa.c b/kernel/arch/powerpc/mm/numa.c index 5e80621d9..669a15e7f 100644 --- a/kernel/arch/powerpc/mm/numa.c +++ b/kernel/arch/powerpc/mm/numa.c @@ -80,7 +80,7 @@ static void __init setup_node_to_cpumask_map(void) setup_nr_node_ids(); /* allocate the map */ - for (node = 0; node < nr_node_ids; node++) + for_each_node(node) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ @@ -225,7 +225,7 @@ static void initialize_distance_lookup_table(int nid, for (i = 0; i < distance_ref_points_depth; i++) { const __be32 *entry; - entry = &associativity[be32_to_cpu(distance_ref_points[i])]; + entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1]; distance_lookup_table[nid][i] = of_read_number(entry, 1); } } @@ -248,8 +248,12 @@ static int associativity_to_nid(const __be32 *associativity) nid = -1; if (nid > 0 && - of_read_number(associativity, 1) >= distance_ref_points_depth) - initialize_distance_lookup_table(nid, associativity); + of_read_number(associativity, 1) >= distance_ref_points_depth) { + /* + * Skip the length field and send start of associativity array + */ + initialize_distance_lookup_table(nid, associativity + 1); + } out: return nid; @@ -272,7 +276,6 @@ static int of_node_to_nid_single(struct device_node *device) /* Walk the device tree upwards, looking for an associativity id */ int of_node_to_nid(struct device_node *device) { - struct device_node *tmp; int nid = -1; of_node_get(device); @@ -281,9 +284,7 @@ int of_node_to_nid(struct device_node *device) if (nid != -1) break; - tmp = device; - device = of_get_parent(tmp); - of_node_put(tmp); + device = of_get_next_parent(device); } of_node_put(device); @@ -507,6 +508,12 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem, if (nid == 0xffff || nid >= MAX_NUMNODES) nid = default_nid; + + if (nid > 0) { + index = drmem->aa_index * aa->array_sz; + initialize_distance_lookup_table(nid, + &aa->arrays[index]); + } } return nid; diff --git a/kernel/arch/powerpc/mm/pgtable_64.c b/kernel/arch/powerpc/mm/pgtable_64.c index 6bfadf1aa..e92cb2146 100644 --- a/kernel/arch/powerpc/mm/pgtable_64.c +++ b/kernel/arch/powerpc/mm/pgtable_64.c @@ -149,17 +149,7 @@ int map_kernel_page(unsigned long ea, unsigned long pa, int flags) #endif /* !CONFIG_PPC_MMU_NOHASH */ } -#ifdef CONFIG_PPC_BOOK3E_64 - /* - * With hardware tablewalk, a sync is needed to ensure that - * subsequent accesses see the PTE we just wrote. Unlike userspace - * mappings, we can't tolerate spurious faults, so make sure - * the new PTE will be seen the first time. - */ - mb(); -#else smp_wmb(); -#endif return 0; } @@ -554,47 +544,42 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, return old; } -pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - if (pmd_trans_huge(*pmdp)) { - pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); - } else { - /* - * khugepaged calls this for normal pmd - */ - pmd = *pmdp; - pmd_clear(pmdp); - /* - * Wait for all pending hash_page to finish. This is needed - * in case of subpage collapse. When we collapse normal pages - * to hugepage, we first clear the pmd, then invalidate all - * the PTE entries. The assumption here is that any low level - * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a - * hash_page with local ptep pointer value. Such a hash page - * can result in adding new HPTE entries for normal subpages. - * That means we could be modifying the page content as we - * copy them to a huge page. So wait for parallel hash_page - * to finish before invalidating HPTE entries. We can do this - * by sending an IPI to all the cpus and executing a dummy - * function there. - */ - kick_all_cpus_sync(); - /* - * Now invalidate the hpte entries in the range - * covered by pmd. This make sure we take a - * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and - * hence wait for collapse to complete. Without this - * the __collapse_huge_page_copy can result in copying - * the old content. - */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); - } + VM_BUG_ON(pmd_trans_huge(*pmdp)); + + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. Such a hash page + * can result in adding new HPTE entries for normal subpages. + * That means we could be modifying the page content as we + * copy them to a huge page. So wait for parallel hash_page + * to finish before invalidating HPTE entries. We can do this + * by sending an IPI to all the cpus and executing a dummy + * function there. + */ + kick_all_cpus_sync(); + /* + * Now invalidate the hpte entries in the range + * covered by pmd. This make sure we take a + * fault and will find the pmd as none, which will + * result in a major fault which takes mmap_sem and + * hence wait for collapse to complete. Without this + * the __collapse_huge_page_copy can result in copying + * the old content. + */ + flush_tlb_pmd_range(vma->vm_mm, &pmd, address); return pmd; } @@ -817,8 +802,8 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, return; } -pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) +pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) { pmd_t old_pmd; pgtable_t pgtable; diff --git a/kernel/arch/powerpc/mm/slb.c b/kernel/arch/powerpc/mm/slb.c index 6e450ca66..515730e49 100644 --- a/kernel/arch/powerpc/mm/slb.c +++ b/kernel/arch/powerpc/mm/slb.c @@ -25,6 +25,11 @@ #include #include +enum slb_index { + LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */ + VMALLOC_INDEX = 1, /* Kernel virtual map (0xd000000000000000) */ + KSTACK_INDEX = 2, /* Kernel stack map */ +}; extern void slb_allocate_realmode(unsigned long ea); extern void slb_allocate_user(unsigned long ea); @@ -41,9 +46,9 @@ static void slb_allocate(unsigned long ea) (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) static inline unsigned long mk_esid_data(unsigned long ea, int ssize, - unsigned long slot) + enum slb_index index) { - return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot; + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index; } static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, @@ -55,39 +60,39 @@ static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, static inline void slb_shadow_update(unsigned long ea, int ssize, unsigned long flags, - unsigned long entry) + enum slb_index index) { + struct slb_shadow *p = get_slb_shadow(); + /* * Clear the ESID first so the entry is not valid while we are * updating it. No write barriers are needed here, provided * we only update the current CPU's SLB shadow buffer. */ - get_slb_shadow()->save_area[entry].esid = 0; - get_slb_shadow()->save_area[entry].vsid = - cpu_to_be64(mk_vsid_data(ea, ssize, flags)); - get_slb_shadow()->save_area[entry].esid = - cpu_to_be64(mk_esid_data(ea, ssize, entry)); + p->save_area[index].esid = 0; + p->save_area[index].vsid = cpu_to_be64(mk_vsid_data(ea, ssize, flags)); + p->save_area[index].esid = cpu_to_be64(mk_esid_data(ea, ssize, index)); } -static inline void slb_shadow_clear(unsigned long entry) +static inline void slb_shadow_clear(enum slb_index index) { - get_slb_shadow()->save_area[entry].esid = 0; + get_slb_shadow()->save_area[index].esid = 0; } static inline void create_shadowed_slbe(unsigned long ea, int ssize, unsigned long flags, - unsigned long entry) + enum slb_index index) { /* * Updating the shadow buffer before writing the SLB ensures * we don't get a stale entry here if we get preempted by PHYP * between these two statements. */ - slb_shadow_update(ea, ssize, flags, entry); + slb_shadow_update(ea, ssize, flags, index); asm volatile("slbmte %0,%1" : : "r" (mk_vsid_data(ea, ssize, flags)), - "r" (mk_esid_data(ea, ssize, entry)) + "r" (mk_esid_data(ea, ssize, index)) : "memory" ); } @@ -103,16 +108,16 @@ static void __slb_flush_and_rebolt(void) lflags = SLB_VSID_KERNEL | linear_llp; vflags = SLB_VSID_KERNEL | vmalloc_llp; - ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, 2); + ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, KSTACK_INDEX); if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) { ksp_esid_data &= ~SLB_ESID_V; ksp_vsid_data = 0; - slb_shadow_clear(2); + slb_shadow_clear(KSTACK_INDEX); } else { /* Update stack entry; others don't change */ - slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2); + slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, KSTACK_INDEX); ksp_vsid_data = - be64_to_cpu(get_slb_shadow()->save_area[2].vsid); + be64_to_cpu(get_slb_shadow()->save_area[KSTACK_INDEX].vsid); } /* We need to do this all in asm, so we're sure we don't touch @@ -151,7 +156,7 @@ void slb_vmalloc_update(void) unsigned long vflags; vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp; - slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, 1); + slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX); slb_flush_and_rebolt(); } @@ -249,11 +254,24 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) static inline void patch_slb_encoding(unsigned int *insn_addr, unsigned int immed) { - int insn = (*insn_addr & 0xffff0000) | immed; + + /* + * This function patches either an li or a cmpldi instruction with + * a new immediate value. This relies on the fact that both li + * (which is actually addi) and cmpldi both take a 16-bit immediate + * value, and it is situated in the same location in the instruction, + * ie. bits 16-31 (Big endian bit order) or the lower 16 bits. + * The signedness of the immediate operand differs between the two + * instructions however this code is only ever patching a small value, + * much less than 1 << 15, so we can get away with it. + * To patch the value we read the existing instruction, clear the + * immediate value, and or in our new value, then write the instruction + * back. + */ + unsigned int insn = (*insn_addr & 0xffff0000) | immed; patch_instruction(insn_addr, insn); } -extern u32 slb_compare_rr_to_size[]; extern u32 slb_miss_kernel_load_linear[]; extern u32 slb_miss_kernel_load_io[]; extern u32 slb_compare_rr_to_size[]; @@ -309,24 +327,23 @@ void slb_initialize(void) lflags = SLB_VSID_KERNEL | linear_llp; vflags = SLB_VSID_KERNEL | vmalloc_llp; - /* Invalidate the entire SLB (even slot 0) & all the ERATS */ + /* Invalidate the entire SLB (even entry 0) & all the ERATS */ asm volatile("isync":::"memory"); asm volatile("slbmte %0,%0"::"r" (0) : "memory"); asm volatile("isync; slbia; isync":::"memory"); - create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, 0); - - create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, 1); + create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX); + create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX); /* For the boot cpu, we're running on the stack in init_thread_union, * which is in the first segment of the linear mapping, and also * get_paca()->kstack hasn't been initialized yet. * For secondary cpus, we need to bolt the kernel stack entry now. */ - slb_shadow_clear(2); + slb_shadow_clear(KSTACK_INDEX); if (raw_smp_processor_id() != boot_cpuid && (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET) create_shadowed_slbe(get_paca()->kstack, - mmu_kernel_ssize, lflags, 2); + mmu_kernel_ssize, lflags, KSTACK_INDEX); asm volatile("isync":::"memory"); } diff --git a/kernel/arch/powerpc/mm/tlb_hash64.c b/kernel/arch/powerpc/mm/tlb_hash64.c index c522969f0..f7b80391b 100644 --- a/kernel/arch/powerpc/mm/tlb_hash64.c +++ b/kernel/arch/powerpc/mm/tlb_hash64.c @@ -190,6 +190,7 @@ void tlb_flush(struct mmu_gather *tlb) void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, unsigned long end) { + bool is_thp; int hugepage_shift; unsigned long flags; @@ -208,21 +209,21 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, + pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, &is_thp, &hugepage_shift); unsigned long pte; if (ptep == NULL) continue; pte = pte_val(*ptep); - if (hugepage_shift) + if (is_thp) trace_hugepage_invalidate(start, pte); if (!(pte & _PAGE_HASHPTE)) continue; - if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte))) + if (unlikely(is_thp)) hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); else - hpte_need_flush(mm, start, ptep, pte, 0); + hpte_need_flush(mm, start, ptep, pte, hugepage_shift); } arch_leave_lazy_mmu_mode(); local_irq_restore(flags); diff --git a/kernel/arch/powerpc/mm/tlb_low_64e.S b/kernel/arch/powerpc/mm/tlb_low_64e.S index 89bf95bd6..29d6987c3 100644 --- a/kernel/arch/powerpc/mm/tlb_low_64e.S +++ b/kernel/arch/powerpc/mm/tlb_low_64e.S @@ -68,11 +68,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) ld r14,PACAPGD(r13) std r15,EX_TLB_R15(r12) std r10,EX_TLB_CR(r12) +#ifdef CONFIG_PPC_FSL_BOOK3E + std r7,EX_TLB_R7(r12) +#endif TLB_MISS_PROLOG_STATS .endm .macro tlb_epilog_bolted ld r14,EX_TLB_CR(r12) +#ifdef CONFIG_PPC_FSL_BOOK3E + ld r7,EX_TLB_R7(r12) +#endif ld r10,EX_TLB_R10(r12) ld r11,EX_TLB_R11(r12) ld r13,EX_TLB_R13(r12) @@ -297,6 +303,7 @@ itlb_miss_fault_bolted: * r13 = PACA * r11 = tlb_per_core ptr * r10 = crap (free to use) + * r7 = esel_next */ tlb_miss_common_e6500: crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */ @@ -308,11 +315,11 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ * * MAS6:IND should be already set based on MAS4 */ -1: lbarx r15,0,r11 lhz r10,PACAPACAINDEX(r13) - cmpdi r15,0 - cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */ addi r10,r10,1 + crclr cr1*4+eq /* set cr1.eq = 0 for non-recursive */ +1: lbarx r15,0,r11 + cmpdi r15,0 bne 2f stbcx. r10,0,r11 bne 1b @@ -320,12 +327,16 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ .subsection 1 2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? */ beq cr1,3b /* unlock will happen if cr1.eq = 0 */ - lbz r15,0(r11) +10: lbz r15,0(r11) cmpdi r15,0 - bne 2b + bne 10b b 1b .previous +END_FTR_SECTION_IFSET(CPU_FTR_SMT) + + lbz r7,TCD_ESEL_NEXT(r11) +BEGIN_FTR_SECTION /* CPU_FTR_SMT */ /* * Erratum A-008139 says that we can't use tlbwe to change * an indirect entry in any way (including replacing or @@ -334,8 +345,7 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */ * with tlbilx before overwriting. */ - lbz r15,TCD_ESEL_NEXT(r11) - rlwinm r10,r15,16,0xff0000 + rlwinm r10,r7,16,0xff0000 oris r10,r10,MAS0_TLBSEL(1)@h mtspr SPRN_MAS0,r10 isync @@ -398,18 +408,18 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT) rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 clrrdi r15,r15,3 cmpdi cr0,r14,0 - bge tlb_miss_fault_e6500 /* Bad pgd entry or hugepage; bail */ + bge tlb_miss_huge_e6500 /* Bad pgd entry or hugepage; bail */ ldx r14,r14,r15 /* grab pud entry */ rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 clrrdi r15,r15,3 cmpdi cr0,r14,0 - bge tlb_miss_fault_e6500 + bge tlb_miss_huge_e6500 ldx r14,r14,r15 /* Grab pmd entry */ mfspr r10,SPRN_MAS0 cmpdi cr0,r14,0 - bge tlb_miss_fault_e6500 + bge tlb_miss_huge_e6500 /* Now we build the MAS for a 2M indirect page: * @@ -428,15 +438,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT) clrrdi r15,r16,21 /* make EA 2M-aligned */ mtspr SPRN_MAS2,r15 - lbz r15,TCD_ESEL_NEXT(r11) +tlb_miss_huge_done_e6500: lbz r16,TCD_ESEL_MAX(r11) lbz r14,TCD_ESEL_FIRST(r11) - rlwimi r10,r15,16,0x00ff0000 /* insert esel_next into MAS0 */ - addi r15,r15,1 /* increment esel_next */ + rlwimi r10,r7,16,0x00ff0000 /* insert esel_next into MAS0 */ + addi r7,r7,1 /* increment esel_next */ mtspr SPRN_MAS0,r10 - cmpw r15,r16 - iseleq r15,r14,r15 /* if next == last use first */ - stb r15,TCD_ESEL_NEXT(r11) + cmpw r7,r16 + iseleq r7,r14,r7 /* if next == last use first */ + stb r7,TCD_ESEL_NEXT(r11) tlbwe @@ -456,6 +466,50 @@ END_FTR_SECTION_IFSET(CPU_FTR_SMT) tlb_epilog_bolted rfi +tlb_miss_huge_e6500: + beq tlb_miss_fault_e6500 + li r10,1 + andi. r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */ + rldimi r14,r10,63,0 /* Set PD_HUGE */ + xor r14,r14,r15 /* Clear size bits */ + ldx r14,0,r14 + + /* + * Now we build the MAS for a huge page. + * + * MAS 0 : ESEL needs to be filled by software round-robin + * - can be handled by indirect code + * MAS 1 : Need to clear IND and set TSIZE + * MAS 2,3+7: Needs to be redone similar to non-tablewalk handler + */ + + subi r15,r15,10 /* Convert psize to tsize */ + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,~MAS1_IND + rlwimi r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK + mtspr SPRN_MAS1,r10 + + li r10,-0x400 + sld r15,r10,r15 /* Generate mask based on size */ + and r10,r16,r15 + rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + rlwimi r10,r14,32-19,27,31 /* Insert WIMGE */ + clrldi r15,r15,PAGE_SHIFT /* Clear crap at the top */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + mtspr SPRN_MAS2,r10 + andi. r10,r14,_PAGE_DIRTY + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + bne 1f + li r10,MAS3_SW|MAS3_UW + andc r15,r15,r10 +1: + mtspr SPRN_MAS7_MAS3,r15 + + mfspr r10,SPRN_MAS0 + b tlb_miss_huge_done_e6500 + tlb_miss_kernel_e6500: ld r14,PACA_KERNELPGD(r13) cmpldi cr1,r15,8 /* Check for vmalloc region */ diff --git a/kernel/arch/powerpc/mm/tlb_nohash.c b/kernel/arch/powerpc/mm/tlb_nohash.c index cbd3d0698..bb04e4df3 100644 --- a/kernel/arch/powerpc/mm/tlb_nohash.c +++ b/kernel/arch/powerpc/mm/tlb_nohash.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -217,7 +218,7 @@ static DEFINE_RAW_SPINLOCK(tlbivax_lock); static int mm_is_core_local(struct mm_struct *mm) { return cpumask_subset(mm_cpumask(mm), - topology_thread_cpumask(smp_processor_id())); + topology_sibling_cpumask(smp_processor_id())); } struct tlb_flush_param { @@ -628,10 +629,26 @@ static void early_init_this_mmu(void) #ifdef CONFIG_PPC_FSL_BOOK3E if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned int num_cams; + int __maybe_unused cpu = smp_processor_id(); + bool map = true; /* use a quarter of the TLBCAM for bolted linear map */ num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; - linear_map_top = map_mem_in_cams(linear_map_top, num_cams); + + /* + * Only do the mapping once per core, or else the + * transient mapping would cause problems. + */ +#ifdef CONFIG_SMP + if (cpu != boot_cpuid && + (cpu != cpu_first_thread_sibling(cpu) || + cpu == cpu_first_thread_sibling(boot_cpuid))) + map = false; +#endif + + if (map) + linear_map_top = map_mem_in_cams(linear_map_top, + num_cams, false); } #endif @@ -729,10 +746,14 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, * entries are supported though that may eventually * change. * - * on FSL Embedded 64-bit, we adjust the RMA size to match the - * first bolted TLB entry size. We still limit max to 1G even if - * the TLB could cover more. This is due to what the early init - * code is setup to do. + * on FSL Embedded 64-bit, usually all RAM is bolted, but with + * unusual memory sizes it's possible for some RAM to not be mapped + * (such RAM is not used at all by Linux, since we don't support + * highmem on 64-bit). We limit ppc64_rma_size to what would be + * mappable if this memblock is the only one. Additional memblocks + * can only increase, not decrease, the amount that ends up getting + * mapped. We still limit max to 1G even if we'll eventually map + * more. This is due to what the early init code is set up to do. * * We crop it to the size of the first MEMBLOCK to * avoid going over total available memory just in case... @@ -740,8 +761,14 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, #ifdef CONFIG_PPC_FSL_BOOK3E if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned long linear_sz; - linear_sz = calc_cam_sz(first_memblock_size, PAGE_OFFSET, - first_memblock_base); + unsigned int num_cams; + + /* use a quarter of the TLBCAM for bolted linear map */ + num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; + + linear_sz = map_mem_in_cams(first_memblock_size, num_cams, + true); + ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); } else #endif diff --git a/kernel/arch/powerpc/mm/tlb_nohash_low.S b/kernel/arch/powerpc/mm/tlb_nohash_low.S index 43ff3c797..68c477592 100644 --- a/kernel/arch/powerpc/mm/tlb_nohash_low.S +++ b/kernel/arch/powerpc/mm/tlb_nohash_low.S @@ -400,6 +400,7 @@ _GLOBAL(set_context) * extern void loadcam_entry(unsigned int index) * * Load TLBCAM[index] entry in to the L2 CAM MMU + * Must preserve r7, r8, r9, and r10 */ _GLOBAL(loadcam_entry) mflr r5 @@ -423,4 +424,66 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) tlbwe isync blr + +/* + * Load multiple TLB entries at once, using an alternate-space + * trampoline so that we don't have to care about whether the same + * TLB entry maps us before and after. + * + * r3 = first entry to write + * r4 = number of entries to write + * r5 = temporary tlb entry + */ +_GLOBAL(loadcam_multi) + mflr r8 + + /* + * Set up temporary TLB entry that is the same as what we're + * running from, but in AS=1. + */ + bl 1f +1: mflr r6 + tlbsx 0,r8 + mfspr r6,SPRN_MAS1 + ori r6,r6,MAS1_TS + mtspr SPRN_MAS1,r6 + mfspr r6,SPRN_MAS0 + rlwimi r6,r5,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK + mr r7,r5 + mtspr SPRN_MAS0,r6 + isync + tlbwe + isync + + /* Switch to AS=1 */ + mfmsr r6 + ori r6,r6,MSR_IS|MSR_DS + mtmsr r6 + isync + + mr r9,r3 + add r10,r3,r4 +2: bl loadcam_entry + addi r9,r9,1 + cmpw r9,r10 + mr r3,r9 + blt 2b + + /* Return to AS=0 and clear the temporary entry */ + mfmsr r6 + rlwinm. r6,r6,0,~(MSR_IS|MSR_DS) + mtmsr r6 + isync + + li r6,0 + mtspr SPRN_MAS1,r6 + rlwinm r6,r7,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK + oris r6,r6,MAS0_TLBSEL(1)@h + mtspr SPRN_MAS0,r6 + isync + tlbwe + isync + + mtlr r8 + blr #endif -- cgit 1.2.3-korg