19 files changed, 694 insertions, 181 deletions
diff --git a/kernel/arch/powerpc/mm/Makefile b/kernel/arch/powerpc/mm/Makefile
index 9c8770b5f..3eb73a382 100644
--- a/kernel/arch/powerpc/mm/Makefile
+++ b/kernel/arch/powerpc/mm/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
+obj-$(CONFIG_SPAPR_TCE_IOMMU)	+= mmu_context_iommu.o
diff --git a/kernel/arch/powerpc/mm/copro_fault.c b/kernel/arch/powerpc/mm/copro_fault.c
index f031a47d7..6527882ce 100644
--- a/kernel/arch/powerpc/mm/copro_fault.c
+++ b/kernel/arch/powerpc/mm/copro_fault.c
@@ -26,7 +26,7 @@
 #include <asm/reg.h>
 #include <asm/copro.h>
 #include <asm/spu.h>
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 /*
  * This ought to be kept in sync with the powerpc specific do_page_fault
@@ -100,7 +100,7 @@ EXPORT_SYMBOL_GPL(copro_handle_mm_fault);
 
 int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
 {
-	u64 vsid;
+	u64 vsid, vsidkey;
 	int psize, ssize;
 
 	switch (REGION_ID(ea)) {
@@ -109,6 +109,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
 		psize = get_slice_psize(mm, ea);
 		ssize = user_segment_size(ea);
 		vsid = get_vsid(mm->context.id, ea, ssize);
+		vsidkey = SLB_VSID_USER;
 		break;
 	case VMALLOC_REGION_ID:
 		pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea);
@@ -118,19 +119,21 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
 			psize = mmu_io_psize;
 		ssize = mmu_kernel_ssize;
 		vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+		vsidkey = SLB_VSID_KERNEL;
 		break;
 	case KERNEL_REGION_ID:
 		pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea);
 		psize = mmu_linear_psize;
 		ssize = mmu_kernel_ssize;
 		vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+		vsidkey = SLB_VSID_KERNEL;
 		break;
 	default:
 		pr_debug("%s: invalid region access at %016llx\n", __func__, ea);
 		return 1;
 	}
 
-	vsid = (vsid << slb_vsid_shift(ssize)) | SLB_VSID_USER;
+	vsid = (vsid << slb_vsid_shift(ssize)) | vsidkey;
 
 	vsid |= mmu_psize_defs[psize].sllp |
 		((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0);
diff --git a/kernel/arch/powerpc/mm/fault.c b/kernel/arch/powerpc/mm/fault.c
index 6d535973b..a67c6d781 100644
--- a/kernel/arch/powerpc/mm/fault.c
+++ b/kernel/arch/powerpc/mm/fault.c
@@ -529,6 +529,10 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 		printk(KERN_ALERT "Unable to handle kernel paging request for "
 			"instruction fetch\n");
 		break;
+	case 0x600:
+		printk(KERN_ALERT "Unable to handle kernel paging request for "
+			"unaligned access at address 0x%08lx\n", regs->dar);
+		break;
 	default:
 		printk(KERN_ALERT "Unable to handle kernel paging request for "
 			"unknown fault\n");
diff --git a/kernel/arch/powerpc/mm/fsl_booke_mmu.c b/kernel/arch/powerpc/mm/fsl_booke_mmu.c
index 9c90e66cf..f3afe3d97 100644
--- a/kernel/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/kernel/arch/powerpc/mm/fsl_booke_mmu.c
@@ -112,7 +112,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
 
 	tsize = __ilog2(size) - 10;
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC)
 	if ((flags & _PAGE_NO_CACHE) == 0)
 		flags |= _PAGE_COHERENT;
 #endif
@@ -141,8 +141,6 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
 	tlbcam_addrs[index].start = virt;
 	tlbcam_addrs[index].limit = virt + size - 1;
 	tlbcam_addrs[index].phys = phys;
-
-	loadcam_entry(index);
 }
 
 unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
@@ -171,7 +169,8 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
 }
 
 static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
-					unsigned long ram, int max_cam_idx)
+					unsigned long ram, int max_cam_idx,
+					bool dryrun)
 {
 	int i;
 	unsigned long amount_mapped = 0;
@@ -181,13 +180,20 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 		unsigned long cam_sz;
 
 		cam_sz = calc_cam_sz(ram, virt, phys);
-		settlbcam(i, virt, phys, cam_sz, pgprot_val(PAGE_KERNEL_X), 0);
+		if (!dryrun)
+			settlbcam(i, virt, phys, cam_sz,
+				  pgprot_val(PAGE_KERNEL_X), 0);
 
 		ram -= cam_sz;
 		amount_mapped += cam_sz;
 		virt += cam_sz;
 		phys += cam_sz;
 	}
+
+	if (dryrun)
+		return amount_mapped;
+
+	loadcam_multi(0, i, max_cam_idx);
 	tlbcam_index = i;
 
 #ifdef CONFIG_PPC64
@@ -199,12 +205,12 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 	return amount_mapped;
 }
 
-unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx)
+unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun)
 {
 	unsigned long virt = PAGE_OFFSET;
 	phys_addr_t phys = memstart_addr;
 
-	return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx);
+	return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun);
 }
 
 #ifdef CONFIG_PPC32
@@ -235,7 +241,7 @@ void __init adjust_total_lowmem(void)
 	ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
 
 	i = switch_to_as1();
-	__max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM);
+	__max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false);
 	restore_to_as0(i, 0, 0, 1);
 
 	pr_info("Memory CAM mapping: ");
@@ -303,10 +309,12 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
 		n = switch_to_as1();
 		/* map a 64M area for the second relocation */
 		if (memstart_addr > start)
-			map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM);
+			map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM,
+					false);
 		else
 			map_mem_in_cams_addr(start, PAGE_OFFSET + offset,
-					0x4000000, CONFIG_LOWMEM_CAM_NUM);
+					0x4000000, CONFIG_LOWMEM_CAM_NUM,
+					false);
 		restore_to_as0(n, offset, __va(dt_ptr), 1);
 		/* We should never reach here */
 		panic("Relocation error");
diff --git a/kernel/arch/powerpc/mm/hash_low_64.S b/kernel/arch/powerpc/mm/hash_low_64.S
index 463174a4a..3b49e3295 100644
--- a/kernel/arch/powerpc/mm/hash_low_64.S
+++ b/kernel/arch/powerpc/mm/hash_low_64.S
@@ -701,7 +701,7 @@ htab_pte_insert_failure:
 
 #endif /* CONFIG_PPC_64K_PAGES */
 
-#ifdef CONFIG_PPC_HAS_HASH_64K
+#ifdef CONFIG_PPC_64K_PAGES
 
 /*****************************************************************************
  *                                                                           *
@@ -993,7 +993,7 @@ ht64_pte_insert_failure:
 	b	ht64_bail
 
 
-#endif /* CONFIG_PPC_HAS_HASH_64K */
+#endif /* CONFIG_PPC_64K_PAGES */
 
 
 /*****************************************************************************
diff --git a/kernel/arch/powerpc/mm/hash_native_64.c b/kernel/arch/powerpc/mm/hash_native_64.c
index 9c4880dde..c8822af10 100644
--- a/kernel/arch/powerpc/mm/hash_native_64.c
+++ b/kernel/arch/powerpc/mm/hash_native_64.c
@@ -29,7 +29,7 @@
 #include <asm/kexec.h>
 #include <asm/ppc-opcode.h>
 
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 #ifdef DEBUG_LOW
 #define DBG_LOW(fmt...) udbg_printf(fmt)
@@ -582,13 +582,21 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
  * be when they isi), and we are the only one left.  We rely on our kernel
  * mapping being 0xC0's and the hardware ignoring those two real bits.
  *
+ * This must be called with interrupts disabled.
+ *
+ * Taking the native_tlbie_lock is unsafe here due to the possibility of
+ * lockdep being on. On pre-POWER5 hardware, not taking the lock could
+ * cause a deadlock; on POWER5 and newer, not taking it is fine. This only
+ * gets called during boot, before secondary CPUs have come up, and during
+ * crashdump, when all bets are off anyway.
+ *
  * TODO: add batching support when enabled.  remember, no dynamic memory here,
  * athough there is the control page available...
  */
 static void native_hpte_clear(void)
 {
 	unsigned long vpn = 0;
-	unsigned long slot, slots, flags;
+	unsigned long slot, slots;
 	struct hash_pte *hptep = htab_address;
 	unsigned long hpte_v;
 	unsigned long pteg_count;
@@ -596,13 +604,6 @@ static void native_hpte_clear(void)
 
 	pteg_count = htab_hash_mask + 1;
 
-	local_irq_save(flags);
-
-	/* we take the tlbie lock and hold it.  Some hardware will
-	 * deadlock if we try to tlbie from two processors at once.
-	 */
-	raw_spin_lock(&native_tlbie_lock);
-
 	slots = pteg_count * HPTES_PER_GROUP;
 
 	for (slot = 0; slot < slots; slot++, hptep++) {
@@ -614,8 +615,8 @@ static void native_hpte_clear(void)
 		hpte_v = be64_to_cpu(hptep->v);
 
 		/*
-		 * Call __tlbie() here rather than tlbie() since we
-		 * already hold the native_tlbie_lock.
+		 * Call __tlbie() here rather than tlbie() since we can't take the
+		 * native_tlbie_lock.
 		 */
 		if (hpte_v & HPTE_V_VALID) {
 			hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn);
@@ -625,8 +626,6 @@ static void native_hpte_clear(void)
 	}
 
 	asm volatile("eieio; tlbsync; ptesync":::"memory");
-	raw_spin_unlock(&native_tlbie_lock);
-	local_irq_restore(flags);
 }
 
 /*
diff --git a/kernel/arch/powerpc/mm/hash_utils_64.c b/kernel/arch/powerpc/mm/hash_utils_64.c
index fda236f90..7f9616f7c 100644
--- a/kernel/arch/powerpc/mm/hash_utils_64.c
+++ b/kernel/arch/powerpc/mm/hash_utils_64.c
@@ -57,6 +57,7 @@
 #include <asm/fadump.h>
 #include <asm/firmware.h>
 #include <asm/tm.h>
+#include <asm/trace.h>
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
@@ -639,7 +640,7 @@ extern u32 ht64_call_hpte_updatepp[];
 
 static void __init htab_finish_init(void)
 {
-#ifdef CONFIG_PPC_HAS_HASH_64K
+#ifdef CONFIG_PPC_64K_PAGES
 	patch_branch(ht64_call_hpte_insert1,
 		ppc_function_entry(ppc_md.hpte_insert),
 		BRANCH_SET_LINK);
@@ -652,7 +653,7 @@ static void __init htab_finish_init(void)
 	patch_branch(ht64_call_hpte_updatepp,
 		ppc_function_entry(ppc_md.hpte_updatepp),
 		BRANCH_SET_LINK);
-#endif /* CONFIG_PPC_HAS_HASH_64K */
+#endif /* CONFIG_PPC_64K_PAGES */
 
 	patch_branch(htab_call_hpte_insert1,
 		ppc_function_entry(ppc_md.hpte_insert),
@@ -993,6 +994,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 		 unsigned long access, unsigned long trap,
 		 unsigned long flags)
 {
+	bool is_thp;
 	enum ctx_state prev_state = exception_enter();
 	pgd_t *pgdir;
 	unsigned long vsid;
@@ -1004,6 +1006,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 
 	DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
 		ea, access, trap);
+	trace_hash_fault(ea, access, trap);
 
 	/* Get region & vsid */
  	switch (REGION_ID(ea)) {
@@ -1066,7 +1069,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/* Get PTE and page size from page tables */
-	ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
+	ptep = __find_linux_pte_or_hugepte(pgdir, ea, &is_thp, &hugeshift);
 	if (ptep == NULL || !pte_present(*ptep)) {
 		DBG_LOW(" no PTE !\n");
 		rc = 1;
@@ -1086,7 +1089,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 	}
 
 	if (hugeshift) {
-		if (pmd_trans_huge(*(pmd_t *)ptep))
+		if (is_thp)
 			rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
 					     trap, flags, ssize, psize);
 #ifdef CONFIG_HUGETLB_PAGE
@@ -1149,12 +1152,12 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 		check_paca_psize(ea, mm, psize, user_region);
 #endif /* CONFIG_PPC_64K_PAGES */
 
-#ifdef CONFIG_PPC_HAS_HASH_64K
+#ifdef CONFIG_PPC_64K_PAGES
 	if (psize == MMU_PAGE_64K)
 		rc = __hash_page_64K(ea, access, vsid, ptep, trap,
 				     flags, ssize);
 	else
-#endif /* CONFIG_PPC_HAS_HASH_64K */
+#endif /* CONFIG_PPC_64K_PAGES */
 	{
 		int spp = subpage_protection(mm, ea);
 		if (access & spp)
@@ -1241,7 +1244,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 	 * THP pages use update_mmu_cache_pmd. We don't do
 	 * hash preload there. Hence can ignore THP here
 	 */
-	ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift);
+	ptep = find_linux_pte_or_hugepte(pgdir, ea, NULL, &hugepage_shift);
 	if (!ptep)
 		goto out_exit;
 
@@ -1262,12 +1265,12 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 		update_flags |= HPTE_LOCAL_UPDATE;
 
 	/* Hash it in */
-#ifdef CONFIG_PPC_HAS_HASH_64K
+#ifdef CONFIG_PPC_64K_PAGES
 	if (mm->context.user_psize == MMU_PAGE_64K)
 		rc = __hash_page_64K(ea, access, vsid, ptep, trap,
 				     update_flags, ssize);
 	else
-#endif /* CONFIG_PPC_HAS_HASH_64K */
+#endif /* CONFIG_PPC_64K_PAGES */
 		rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags,
 				    ssize, subpage_protection(mm, ea));
 
@@ -1475,7 +1478,7 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
 	unsigned long hash;
 	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
 	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
-	unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
+	unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
 	long ret;
 
 	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
diff --git a/kernel/arch/powerpc/mm/hugetlbpage.c b/kernel/arch/powerpc/mm/hugetlbpage.c
index 3385e3d05..9833fee49 100644
--- a/kernel/arch/powerpc/mm/hugetlbpage.c
+++ b/kernel/arch/powerpc/mm/hugetlbpage.c
@@ -89,6 +89,25 @@ int pgd_huge(pgd_t pgd)
 	 */
 	return ((pgd_val(pgd) & 0x3) != 0x0);
 }
+
+#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_DEBUG_VM)
+/*
+ * This enables us to catch the wrong page directory format
+ * Moved here so that we can use WARN() in the call.
+ */
+int hugepd_ok(hugepd_t hpd)
+{
+	bool is_hugepd;
+
+	/*
+	 * We should not find this format in page directory, warn otherwise.
+	 */
+	is_hugepd = (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0));
+	WARN(is_hugepd, "Found wrong page directory format\n");
+	return 0;
+}
+#endif
+
 #else
 int pmd_huge(pmd_t pmd)
 {
@@ -109,7 +128,7 @@ int pgd_huge(pgd_t pgd)
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	/* Only called for hugetlbfs pages, hence can ignore THP */
-	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
 }
 
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@ -336,7 +355,7 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 unsigned long gpage_npages[MMU_PAGE_COUNT];
 
 static int __init do_gpage_early_setup(char *param, char *val,
-				       const char *unused)
+				       const char *unused, void *arg)
 {
 	static phys_addr_t size;
 	unsigned long npages;
@@ -385,7 +404,7 @@ void __init reserve_hugetlb_gpages(void)
 
 	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
 	parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
-			&do_gpage_early_setup);
+			NULL, &do_gpage_early_setup);
 
 	/*
 	 * Walk gpage list in reverse, allocating larger page sizes first.
@@ -439,11 +458,6 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 }
 #endif
 
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
-{
-	return 0;
-}
-
 #ifdef CONFIG_PPC_FSL_BOOK3E
 #define HUGEPD_FREELIST_SIZE \
 	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
@@ -689,13 +703,14 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
+	bool is_thp;
 	pte_t *ptep, pte;
 	unsigned shift;
 	unsigned long mask, flags;
 	struct page *page = ERR_PTR(-EINVAL);
 
 	local_irq_save(flags);
-	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
+	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift);
 	if (!ptep)
 		goto no_page;
 	pte = READ_ONCE(*ptep);
@@ -704,7 +719,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 	 * Transparent hugepages are handled by generic code. We can skip them
 	 * here.
 	 */
-	if (!shift || pmd_trans_huge(__pmd(pte_val(pte))))
+	if (!shift || is_thp)
 		goto no_page;
 
 	if (!pte_present(pte)) {
@@ -813,14 +828,6 @@ static int __init add_huge_page_size(unsigned long long size)
 	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 		return -EINVAL;
 
-#ifdef CONFIG_SPU_FS_64K_LS
-	/* Disable support for 64K huge pages when 64K SPU local store
-	 * support is enabled as the current implementation conflicts.
-	 */
-	if (shift == PAGE_SHIFT_64K)
-		return -EINVAL;
-#endif /* CONFIG_SPU_FS_64K_LS */
-
 	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 
 	/* Return if huge page size has already been setup */
@@ -933,7 +940,7 @@ static int __init hugetlbpage_init(void)
 	return 0;
 }
 #endif
-module_init(hugetlbpage_init);
+arch_initcall(hugetlbpage_init);
 
 void flush_dcache_icache_hugepage(struct page *page)
 {
@@ -969,7 +976,7 @@ void flush_dcache_icache_hugepage(struct page *page)
  */
 
 pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-				   unsigned *shift)
+				   bool *is_thp, unsigned *shift)
 {
 	pgd_t pgd, *pgdp;
 	pud_t pud, *pudp;
@@ -981,6 +988,9 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 	if (shift)
 		*shift = 0;
 
+	if (is_thp)
+		*is_thp = false;
+
 	pgdp = pgdir + pgd_index(ea);
 	pgd  = READ_ONCE(*pgdp);
 	/*
@@ -1028,7 +1038,14 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 			if (pmd_none(pmd))
 				return NULL;
 
-			if (pmd_huge(pmd) || pmd_large(pmd)) {
+			if (pmd_trans_huge(pmd)) {
+				if (is_thp)
+					*is_thp = true;
+				ret_pte = (pte_t *) pmdp;
+				goto out;
+			}
+
+			if (pmd_huge(pmd)) {
 				ret_pte = (pte_t *) pmdp;
 				goto out;
 			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
diff --git a/kernel/arch/powerpc/mm/mem.c b/kernel/arch/powerpc/mm/mem.c
index 45fda71fe..22d94c3e6 100644
--- a/kernel/arch/powerpc/mm/mem.c
+++ b/kernel/arch/powerpc/mm/mem.c
@@ -113,7 +113,7 @@ int memory_add_physaddr_to_nid(u64 start)
 }
 #endif
 
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
 	struct pglist_data *pgdata;
 	struct zone *zone;
@@ -128,7 +128,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	/* this should work for most non-highmem platforms */
 	zone = pgdata->node_zones +
-		zone_for_memory(nid, start, size, 0);
+		zone_for_memory(nid, start, size, 0, for_device);
 
 	return __add_pages(nid, zone, start_pfn, nr_pages);
 }
@@ -414,17 +414,17 @@ void flush_dcache_icache_page(struct page *page)
 		return;
 	}
 #endif
-#ifdef CONFIG_BOOKE
-	{
+#if defined(CONFIG_8xx) || defined(CONFIG_PPC64)
+	/* On 8xx there is no need to kmap since highmem is not supported */
+	__flush_dcache_icache(page_address(page));
+#else
+	if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) {
 		void *start = kmap_atomic(page);
 		__flush_dcache_icache(start);
 		kunmap_atomic(start);
+	} else {
+		__flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
 	}
-#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
-	/* On 8xx there is no need to kmap since highmem is not supported */
-	__flush_dcache_icache(page_address(page)); 
-#else
-	__flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
 #endif
 }
 EXPORT_SYMBOL(flush_dcache_icache_page);
@@ -560,7 +560,7 @@ subsys_initcall(add_system_ram_resources);
  */
 int devmem_is_allowed(unsigned long pfn)
 {
-	if (iomem_is_exclusive(pfn << PAGE_SHIFT))
+	if (iomem_is_exclusive(PFN_PHYS(pfn)))
 		return 0;
 	if (!page_is_ram(pfn))
 		return 1;
diff --git a/kernel/arch/powerpc/mm/mmu_context_hash64.c b/kernel/arch/powerpc/mm/mmu_context_hash64.c
index 178876aef..4e4efbc26 100644
--- a/kernel/arch/powerpc/mm/mmu_context_hash64.c
+++ b/kernel/arch/powerpc/mm/mmu_context_hash64.c
@@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 #ifdef CONFIG_PPC_64K_PAGES
 	mm->context.pte_frag = NULL;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	mm_iommu_init(&mm->context);
+#endif
 	return 0;
 }
 
@@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)
 
 void destroy_context(struct mm_struct *mm)
 {
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	mm_iommu_cleanup(&mm->context);
+#endif
 
 #ifdef CONFIG_PPC_ICSWX
 	drop_cop(mm->context.acop, mm);
diff --git a/kernel/arch/powerpc/mm/mmu_context_iommu.c b/kernel/arch/powerpc/mm/mmu_context_iommu.c
new file mode 100644
index 000000000..da6a2168a
--- /dev/null
+++ b/kernel/arch/powerpc/mm/mmu_context_iommu.c
@@ -0,0 +1,316 @@
+/*
+ *  IOMMU helpers in MMU context.
+ *
+ *  Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <asm/mmu_context.h>
+
+static DEFINE_MUTEX(mem_list_mutex);
+
+struct mm_iommu_table_group_mem_t {
+	struct list_head next;
+	struct rcu_head rcu;
+	unsigned long used;
+	atomic64_t mapped;
+	u64 ua;			/* userspace address */
+	u64 entries;		/* number of entries in hpas[] */
+	u64 *hpas;		/* vmalloc'ed */
+};
+
+static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
+		unsigned long npages, bool incr)
+{
+	long ret = 0, locked, lock_limit;
+
+	if (!npages)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+
+	if (incr) {
+		locked = mm->locked_vm + npages;
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			ret = -ENOMEM;
+		else
+			mm->locked_vm += npages;
+	} else {
+		if (WARN_ON_ONCE(npages > mm->locked_vm))
+			npages = mm->locked_vm;
+		mm->locked_vm -= npages;
+	}
+
+	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
+			current->pid,
+			incr ? '+' : '-',
+			npages << PAGE_SHIFT,
+			mm->locked_vm << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK));
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+
+/* True when the current task's mm has at least one preregistered region. */
+bool mm_iommu_preregistered(void)
+{
+	struct mm_struct *mm = current ? current->mm : NULL;
+
+	return mm && !list_empty(&mm->context.iommu_group_mem_list);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
+
+long mm_iommu_get(unsigned long ua, unsigned long entries,
+		struct mm_iommu_table_group_mem_t **pmem)
+{
+	struct mm_iommu_table_group_mem_t *mem;
+	long i, j, ret = 0, locked_entries = 0;
+	struct page *page = NULL;
+
+	if (!current || !current->mm)
+		return -ESRCH; /* process exited */
+
+	mutex_lock(&mem_list_mutex);
+
+	list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua == ua) && (mem->entries == entries)) {
+			++mem->used;
+			*pmem = mem;
+			goto unlock_exit;
+		}
+
+		/* Overlap? */
+		if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
+				(ua < (mem->ua +
+				       (mem->entries << PAGE_SHIFT)))) {
+			ret = -EINVAL;
+			goto unlock_exit;
+		}
+
+	}
+
+	ret = mm_iommu_adjust_locked_vm(current->mm, entries, true);
+	if (ret)
+		goto unlock_exit;
+
+	locked_entries = entries;
+
+	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+	if (!mem) {
+		ret = -ENOMEM;
+		goto unlock_exit;
+	}
+
+	mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
+	if (!mem->hpas) {
+		kfree(mem);
+		ret = -ENOMEM;
+		goto unlock_exit;
+	}
+
+	for (i = 0; i < entries; ++i) {
+		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+					1/* pages */, 1/* iswrite */, &page)) {
+			for (j = 0; j < i; ++j)
+				put_page(pfn_to_page(
+						mem->hpas[j] >> PAGE_SHIFT));
+			vfree(mem->hpas);
+			kfree(mem);
+			ret = -EFAULT;
+			goto unlock_exit;
+		}
+
+		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
+	}
+
+	atomic64_set(&mem->mapped, 1);
+	mem->used = 1;
+	mem->ua = ua;
+	mem->entries = entries;
+	*pmem = mem;
+
+	list_add_rcu(&mem->next, &current->mm->context.iommu_group_mem_list);
+
+unlock_exit:
+	if (locked_entries && ret)
+		mm_iommu_adjust_locked_vm(current->mm, locked_entries, false);
+
+	mutex_unlock(&mem_list_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_get);
+
+/* Release the page references held by @mem and clear its hpas[] slots. */
+static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
+{
+	long i;
+
+	for (i = 0; i < mem->entries; ++i) {
+		struct page *page;
+
+		if (!mem->hpas[i])
+			continue;
+		page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
+		if (page) {
+			put_page(page);
+			mem->hpas[i] = 0;
+		}
+	}
+}
+
+static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem)
+{
+
+	mm_iommu_unpin(mem);
+	vfree(mem->hpas);
+	kfree(mem);
+}
+
+static void mm_iommu_free(struct rcu_head *head)
+{
+	struct mm_iommu_table_group_mem_t *mem = container_of(head,
+			struct mm_iommu_table_group_mem_t, rcu);
+
+	mm_iommu_do_free(mem);
+}
+
+static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
+{
+	list_del_rcu(&mem->next);
+	mm_iommu_adjust_locked_vm(current->mm, mem->entries, false);
+	call_rcu(&mem->rcu, mm_iommu_free);
+}
+
+/* Drop one reference to @mem; the last put releases the region. */
+long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem)
+{
+	long ret = 0;
+
+	if (!current || !current->mm)
+		return -ESRCH; /* process exited */
+
+	mutex_lock(&mem_list_mutex);
+
+	if (mem->used == 0) {
+		ret = -ENOENT;
+		goto unlock_exit;
+	}
+
+	/* There are still users, exit */
+	if (--mem->used)
+		goto unlock_exit;
+
+	/* Still mapped? @mapped is atomic64_t, so use the 64-bit cmpxchg */
+	if (atomic64_cmpxchg(&mem->mapped, 1, 0) != 1) {
+		++mem->used;	/* undo the decrement above */
+		ret = -EBUSY;
+		goto unlock_exit;
+	}
+
+	/* @mapped became 0 so now mappings are disabled, release the region */
+	mm_iommu_release(mem);
+
+unlock_exit:
+	mutex_unlock(&mem_list_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_put);
+
+/*
+ * Return the current mm's region that fully covers [ua, ua + size), or NULL.
+ */
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+		unsigned long size)
+{
+	struct mm_iommu_table_group_mem_t *mem;
+	struct list_head *head = &current->mm->context.iommu_group_mem_list;
+
+	list_for_each_entry_rcu(mem, head, next) {
+		u64 end = mem->ua + (mem->entries << PAGE_SHIFT);
+
+		if (mem->ua <= ua && ua + size <= end)
+			return mem;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup);
+
+/*
+ * Return the current mm's region matching @ua and @entries exactly, or NULL.
+ */
+struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua,
+		unsigned long entries)
+{
+	struct mm_iommu_table_group_mem_t *mem;
+	struct list_head *head = &current->mm->context.iommu_group_mem_list;
+
+	list_for_each_entry_rcu(mem, head, next) {
+		if (mem->ua == ua && mem->entries == entries)
+			return mem;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_find);
+
+/* Translate @ua inside @mem into a host physical address stored in *@hpa. */
+long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa)
+{
+	const unsigned long entry = (ua - mem->ua) >> PAGE_SHIFT;
+
+	/* Check the index before using it; a ua below mem->ua wraps high */
+	if (entry >= mem->entries)
+		return -EFAULT;
+
+	*hpa = mem->hpas[entry] | (ua & ~PAGE_MASK);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
+
+/*
+ * Take a mapping reference on @mem. Fails with -ENXIO once @mapped has
+ * dropped to zero, i.e. the last mm_iommu_put() has already been called.
+ */
+long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
+{
+	return atomic64_inc_not_zero(&mem->mapped) ? 0 : -ENXIO;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
+
+void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
+{
+	atomic64_add_unless(&mem->mapped, -1, 1);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
+
+void mm_iommu_init(mm_context_t *ctx)
+{
+	INIT_LIST_HEAD_RCU(&ctx->iommu_group_mem_list);
+}
+
+void mm_iommu_cleanup(mm_context_t *ctx)
+{
+	struct mm_iommu_table_group_mem_t *mem, *tmp;
+
+	list_for_each_entry_safe(mem, tmp, &ctx->iommu_group_mem_list, next) {
+		list_del_rcu(&mem->next);
+		mm_iommu_do_free(mem);
+	}
+}
diff --git a/kernel/arch/powerpc/mm/mmu_decl.h b/kernel/arch/powerpc/mm/mmu_decl.h
index 085b66b10..9f58ff44a 100644
--- a/kernel/arch/powerpc/mm/mmu_decl.h
+++ b/kernel/arch/powerpc/mm/mmu_decl.h
@@ -141,7 +141,8 @@ extern void MMU_init_hw(void);
 extern unsigned long mmu_mapin_ram(unsigned long top);
 
 #elif defined(CONFIG_PPC_FSL_BOOK3E)
-extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx);
+extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx,
+				     bool dryrun);
 extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
 				 phys_addr_t phys);
 #ifdef CONFIG_PPC32
@@ -152,6 +153,7 @@ extern int switch_to_as1(void);
 extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu);
 #endif
 extern void loadcam_entry(unsigned int index);
+extern void loadcam_multi(int first_idx, int num, int tmp_idx);
 
 struct tlbcam {
 	u32	MAS0;
diff --git a/kernel/arch/powerpc/mm/numa.c b/kernel/arch/powerpc/mm/numa.c
index 5e80621d9..669a15e7f 100644
--- a/kernel/arch/powerpc/mm/numa.c
+++ b/kernel/arch/powerpc/mm/numa.c
@@ -80,7 +80,7 @@ static void __init setup_node_to_cpumask_map(void)
 		setup_nr_node_ids();
 
 	/* allocate the map */
-	for (node = 0; node < nr_node_ids; node++)
+	for_each_node(node)
 		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
 
 	/* cpumask_of_node() will now work */
@@ -225,7 +225,7 @@ static void initialize_distance_lookup_table(int nid,
 	for (i = 0; i < distance_ref_points_depth; i++) {
 		const __be32 *entry;
 
-		entry = &associativity[be32_to_cpu(distance_ref_points[i])];
+		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
 		distance_lookup_table[nid][i] = of_read_number(entry, 1);
 	}
 }
@@ -248,8 +248,12 @@ static int associativity_to_nid(const __be32 *associativity)
 		nid = -1;
 
 	if (nid > 0 &&
-	    of_read_number(associativity, 1) >= distance_ref_points_depth)
-		initialize_distance_lookup_table(nid, associativity);
+		of_read_number(associativity, 1) >= distance_ref_points_depth) {
+		/*
+		 * Skip the length field and send start of associativity array
+		 */
+		initialize_distance_lookup_table(nid, associativity + 1);
+	}
 
 out:
 	return nid;
@@ -272,7 +276,6 @@ static int of_node_to_nid_single(struct device_node *device)
 /* Walk the device tree upwards, looking for an associativity id */
 int of_node_to_nid(struct device_node *device)
 {
-	struct device_node *tmp;
 	int nid = -1;
 
 	of_node_get(device);
@@ -281,9 +284,7 @@ int of_node_to_nid(struct device_node *device)
 		if (nid != -1)
 			break;
 
-	        tmp = device;
-		device = of_get_parent(tmp);
-		of_node_put(tmp);
+		device = of_get_next_parent(device);
 	}
 	of_node_put(device);
 
@@ -507,6 +508,12 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
 
 		if (nid == 0xffff || nid >= MAX_NUMNODES)
 			nid = default_nid;
+
+		if (nid > 0) {
+			index = drmem->aa_index * aa->array_sz;
+			initialize_distance_lookup_table(nid,
+							&aa->arrays[index]);
+		}
 	}
 
 	return nid;
diff --git a/kernel/arch/powerpc/mm/pgtable_64.c b/kernel/arch/powerpc/mm/pgtable_64.c
index 6bfadf1aa..e92cb2146 100644
--- a/kernel/arch/powerpc/mm/pgtable_64.c
+++ b/kernel/arch/powerpc/mm/pgtable_64.c
@@ -149,17 +149,7 @@ int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
 #endif /* !CONFIG_PPC_MMU_NOHASH */
 	}
 
-#ifdef CONFIG_PPC_BOOK3E_64
-	/*
-	 * With hardware tablewalk, a sync is needed to ensure that
-	 * subsequent accesses see the PTE we just wrote.  Unlike userspace
-	 * mappings, we can't tolerate spurious faults, so make sure
-	 * the new PTE will be seen the first time.
-	 */
-	mb();
-#else
 	smp_wmb();
-#endif
 	return 0;
 }
 
@@ -554,47 +544,42 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 	return old;
 }
 
-pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
-		       pmd_t *pmdp)
+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+			  pmd_t *pmdp)
 {
 	pmd_t pmd;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-	if (pmd_trans_huge(*pmdp)) {
-		pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
-	} else {
-		/*
-		 * khugepaged calls this for normal pmd
-		 */
-		pmd = *pmdp;
-		pmd_clear(pmdp);
-		/*
-		 * Wait for all pending hash_page to finish. This is needed
-		 * in case of subpage collapse. When we collapse normal pages
-		 * to hugepage, we first clear the pmd, then invalidate all
-		 * the PTE entries. The assumption here is that any low level
-		 * page fault will see a none pmd and take the slow path that
-		 * will wait on mmap_sem. But we could very well be in a
-		 * hash_page with local ptep pointer value. Such a hash page
-		 * can result in adding new HPTE entries for normal subpages.
-		 * That means we could be modifying the page content as we
-		 * copy them to a huge page. So wait for parallel hash_page
-		 * to finish before invalidating HPTE entries. We can do this
-		 * by sending an IPI to all the cpus and executing a dummy
-		 * function there.
-		 */
-		kick_all_cpus_sync();
-		/*
-		 * Now invalidate the hpte entries in the range
-		 * covered by pmd. This make sure we take a
-		 * fault and will find the pmd as none, which will
-		 * result in a major fault which takes mmap_sem and
-		 * hence wait for collapse to complete. Without this
-		 * the __collapse_huge_page_copy can result in copying
-		 * the old content.
-		 */
-		flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
-	}
+	VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+	pmd = *pmdp;
+	pmd_clear(pmdp);
+	/*
+	 * Wait for all pending hash_page to finish. This is needed
+	 * in case of subpage collapse. When we collapse normal pages
+	 * to hugepage, we first clear the pmd, then invalidate all
+	 * the PTE entries. The assumption here is that any low level
+	 * page fault will see a none pmd and take the slow path that
+	 * will wait on mmap_sem. But we could very well be in a
+	 * hash_page with local ptep pointer value. Such a hash page
+	 * can result in adding new HPTE entries for normal subpages.
+	 * That means we could be modifying the page content as we
+	 * copy them to a huge page. So wait for parallel hash_page
+	 * to finish before invalidating HPTE entries. We can do this
+	 * by sending an IPI to all the cpus and executing a dummy
+	 * function there.
+	 */
+	kick_all_cpus_sync();
+	/*
+	 * Now invalidate the hpte entries in the range
+	 * covered by pmd. This makes sure we take a
+	 * fault and will find the pmd as none, which will
+	 * result in a major fault which takes mmap_sem and
+	 * hence waits for collapse to complete. Without this
+	 * the __collapse_huge_page_copy can result in copying
+	 * the old content.
+	 */
+	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
 	return pmd;
 }
 
@@ -817,8 +802,8 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return;
 }
 
-pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-			 unsigned long addr, pmd_t *pmdp)
+pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+			      unsigned long addr, pmd_t *pmdp)
 {
 	pmd_t old_pmd;
 	pgtable_t pgtable;
diff --git a/kernel/arch/powerpc/mm/slb.c b/kernel/arch/powerpc/mm/slb.c
index 6e450ca66..515730e49 100644
--- a/kernel/arch/powerpc/mm/slb.c
+++ b/kernel/arch/powerpc/mm/slb.c
@@ -25,6 +25,11 @@
 #include <asm/udbg.h>
 #include <asm/code-patching.h>
 
+enum slb_index {
+	LINEAR_INDEX	= 0, /* Kernel linear map  (0xc000000000000000) */
+	VMALLOC_INDEX	= 1, /* Kernel virtual map (0xd000000000000000) */
+	KSTACK_INDEX	= 2, /* Kernel stack map */
+};
 
 extern void slb_allocate_realmode(unsigned long ea);
 extern void slb_allocate_user(unsigned long ea);
@@ -41,9 +46,9 @@ static void slb_allocate(unsigned long ea)
 	(((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
 
 static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
-					 unsigned long slot)
+					 enum slb_index index)
 {
-	return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot;
+	return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
 }
 
 static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
@@ -55,39 +60,39 @@ static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
 
 static inline void slb_shadow_update(unsigned long ea, int ssize,
 				     unsigned long flags,
-				     unsigned long entry)
+				     enum slb_index index)
 {
+	struct slb_shadow *p = get_slb_shadow();
+
 	/*
 	 * Clear the ESID first so the entry is not valid while we are
 	 * updating it.  No write barriers are needed here, provided
 	 * we only update the current CPU's SLB shadow buffer.
 	 */
-	get_slb_shadow()->save_area[entry].esid = 0;
-	get_slb_shadow()->save_area[entry].vsid =
-				cpu_to_be64(mk_vsid_data(ea, ssize, flags));
-	get_slb_shadow()->save_area[entry].esid =
-				cpu_to_be64(mk_esid_data(ea, ssize, entry));
+	p->save_area[index].esid = 0;
+	p->save_area[index].vsid = cpu_to_be64(mk_vsid_data(ea, ssize, flags));
+	p->save_area[index].esid = cpu_to_be64(mk_esid_data(ea, ssize, index));
 }
 
-static inline void slb_shadow_clear(unsigned long entry)
+static inline void slb_shadow_clear(enum slb_index index)
 {
-	get_slb_shadow()->save_area[entry].esid = 0;
+	get_slb_shadow()->save_area[index].esid = 0;
 }
 
 static inline void create_shadowed_slbe(unsigned long ea, int ssize,
 					unsigned long flags,
-					unsigned long entry)
+					enum slb_index index)
 {
 	/*
 	 * Updating the shadow buffer before writing the SLB ensures
 	 * we don't get a stale entry here if we get preempted by PHYP
 	 * between these two statements.
 	 */
-	slb_shadow_update(ea, ssize, flags, entry);
+	slb_shadow_update(ea, ssize, flags, index);
 
 	asm volatile("slbmte  %0,%1" :
 		     : "r" (mk_vsid_data(ea, ssize, flags)),
-		       "r" (mk_esid_data(ea, ssize, entry))
+		       "r" (mk_esid_data(ea, ssize, index))
 		     : "memory" );
 }
 
@@ -103,16 +108,16 @@ static void __slb_flush_and_rebolt(void)
 	lflags = SLB_VSID_KERNEL | linear_llp;
 	vflags = SLB_VSID_KERNEL | vmalloc_llp;
 
-	ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, 2);
+	ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, KSTACK_INDEX);
 	if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) {
 		ksp_esid_data &= ~SLB_ESID_V;
 		ksp_vsid_data = 0;
-		slb_shadow_clear(2);
+		slb_shadow_clear(KSTACK_INDEX);
 	} else {
 		/* Update stack entry; others don't change */
-		slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2);
+		slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, KSTACK_INDEX);
 		ksp_vsid_data =
-			be64_to_cpu(get_slb_shadow()->save_area[2].vsid);
+			be64_to_cpu(get_slb_shadow()->save_area[KSTACK_INDEX].vsid);
 	}
 
 	/* We need to do this all in asm, so we're sure we don't touch
@@ -151,7 +156,7 @@ void slb_vmalloc_update(void)
 	unsigned long vflags;
 
 	vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp;
-	slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, 1);
+	slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX);
 	slb_flush_and_rebolt();
 }
 
@@ -249,11 +254,24 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 static inline void patch_slb_encoding(unsigned int *insn_addr,
 				      unsigned int immed)
 {
-	int insn = (*insn_addr & 0xffff0000) | immed;
+
+	/*
+	 * This function patches either an li or a cmpldi instruction with
+	 * a new immediate value. This relies on the fact that both li
+	 * (which is actually addi) and cmpldi take a 16-bit immediate
+	 * value, and it is situated in the same location in the instruction,
+	 * i.e. bits 16-31 (big-endian bit order) or the lower 16 bits.
+	 * The signedness of the immediate operand differs between the two
+	 * instructions; however, this code only ever patches a small value,
+	 * much less than 1 << 15, so we can get away with it.
+	 * To patch the value we read the existing instruction, clear the
+	 * immediate value, and or in our new value, then write the instruction
+	 * back.
+	 */
+	unsigned int insn = (*insn_addr & 0xffff0000) | immed;
 	patch_instruction(insn_addr, insn);
 }
 
-extern u32 slb_compare_rr_to_size[];
 extern u32 slb_miss_kernel_load_linear[];
 extern u32 slb_miss_kernel_load_io[];
 extern u32 slb_compare_rr_to_size[];
@@ -309,24 +327,23 @@ void slb_initialize(void)
 	lflags = SLB_VSID_KERNEL | linear_llp;
 	vflags = SLB_VSID_KERNEL | vmalloc_llp;
 
-	/* Invalidate the entire SLB (even slot 0) & all the ERATS */
+	/* Invalidate the entire SLB (even entry 0) & all the ERATS */
 	asm volatile("isync":::"memory");
 	asm volatile("slbmte  %0,%0"::"r" (0) : "memory");
 	asm volatile("isync; slbia; isync":::"memory");
-	create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, 0);
-
-	create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, 1);
+	create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
+	create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX);
 
 	/* For the boot cpu, we're running on the stack in init_thread_union,
 	 * which is in the first segment of the linear mapping, and also
 	 * get_paca()->kstack hasn't been initialized yet.
 	 * For secondary cpus, we need to bolt the kernel stack entry now.
 	 */
-	slb_shadow_clear(2);
+	slb_shadow_clear(KSTACK_INDEX);
 	if (raw_smp_processor_id() != boot_cpuid &&
 	    (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
 		create_shadowed_slbe(get_paca()->kstack,
-				     mmu_kernel_ssize, lflags, 2);
+				     mmu_kernel_ssize, lflags, KSTACK_INDEX);
 
 	asm volatile("isync":::"memory");
 }
diff --git a/kernel/arch/powerpc/mm/tlb_hash64.c b/kernel/arch/powerpc/mm/tlb_hash64.c
index c522969f0..f7b80391b 100644
--- a/kernel/arch/powerpc/mm/tlb_hash64.c
+++ b/kernel/arch/powerpc/mm/tlb_hash64.c
@@ -190,6 +190,7 @@ void tlb_flush(struct mmu_gather *tlb)
 void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 			      unsigned long end)
 {
+	bool is_thp;
 	int hugepage_shift;
 	unsigned long flags;
 
@@ -208,21 +209,21 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
 	local_irq_save(flags);
 	arch_enter_lazy_mmu_mode();
 	for (; start < end; start += PAGE_SIZE) {
-		pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start,
+		pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, &is_thp,
 							&hugepage_shift);
 		unsigned long pte;
 
 		if (ptep == NULL)
 			continue;
 		pte = pte_val(*ptep);
-		if (hugepage_shift)
+		if (is_thp)
 			trace_hugepage_invalidate(start, pte);
 		if (!(pte & _PAGE_HASHPTE))
 			continue;
-		if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
+		if (unlikely(is_thp))
 			hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
 		else
-			hpte_need_flush(mm, start, ptep, pte, 0);
+			hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
 	}
 	arch_leave_lazy_mmu_mode();
 	local_irq_restore(flags);
diff --git a/kernel/arch/powerpc/mm/tlb_low_64e.S b/kernel/arch/powerpc/mm/tlb_low_64e.S
index 89bf95bd6..29d6987c3 100644
--- a/kernel/arch/powerpc/mm/tlb_low_64e.S
+++ b/kernel/arch/powerpc/mm/tlb_low_64e.S
@@ -68,11 +68,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
 	ld	r14,PACAPGD(r13)
 	std	r15,EX_TLB_R15(r12)
 	std	r10,EX_TLB_CR(r12)
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	std	r7,EX_TLB_R7(r12)
+#endif
 	TLB_MISS_PROLOG_STATS
 .endm
 
 .macro tlb_epilog_bolted
 	ld	r14,EX_TLB_CR(r12)
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	ld	r7,EX_TLB_R7(r12)
+#endif
 	ld	r10,EX_TLB_R10(r12)
 	ld	r11,EX_TLB_R11(r12)
 	ld	r13,EX_TLB_R13(r12)
@@ -297,6 +303,7 @@ itlb_miss_fault_bolted:
  * r13 = PACA
  * r11 = tlb_per_core ptr
  * r10 = crap (free to use)
+ * r7  = esel_next
  */
 tlb_miss_common_e6500:
 	crmove	cr2*4+2,cr0*4+2		/* cr2.eq != 0 if kernel address */
@@ -308,11 +315,11 @@ BEGIN_FTR_SECTION		/* CPU_FTR_SMT */
 	 *
 	 * MAS6:IND should be already set based on MAS4
 	 */
-1:	lbarx	r15,0,r11
 	lhz	r10,PACAPACAINDEX(r13)
-	cmpdi	r15,0
-	cmpdi	cr1,r15,1	/* set cr1.eq = 0 for non-recursive */
 	addi	r10,r10,1
+	crclr	cr1*4+eq	/* set cr1.eq = 0 for non-recursive */
+1:	lbarx	r15,0,r11
+	cmpdi	r15,0
 	bne	2f
 	stbcx.	r10,0,r11
 	bne	1b
@@ -320,12 +327,16 @@ BEGIN_FTR_SECTION		/* CPU_FTR_SMT */
 	.subsection 1
 2:	cmpd	cr1,r15,r10	/* recursive lock due to mcheck/crit/etc? */
 	beq	cr1,3b		/* unlock will happen if cr1.eq = 0 */
-	lbz	r15,0(r11)
+10:	lbz	r15,0(r11)
 	cmpdi	r15,0
-	bne	2b
+	bne	10b
 	b	1b
 	.previous
+END_FTR_SECTION_IFSET(CPU_FTR_SMT)
+
+	lbz	r7,TCD_ESEL_NEXT(r11)
 
+BEGIN_FTR_SECTION		/* CPU_FTR_SMT */
 	/*
 	 * Erratum A-008139 says that we can't use tlbwe to change
 	 * an indirect entry in any way (including replacing or
@@ -334,8 +345,7 @@ BEGIN_FTR_SECTION		/* CPU_FTR_SMT */
 	 * with tlbilx before overwriting.
 	 */
 
-	lbz	r15,TCD_ESEL_NEXT(r11)
-	rlwinm	r10,r15,16,0xff0000
+	rlwinm	r10,r7,16,0xff0000
 	oris	r10,r10,MAS0_TLBSEL(1)@h
 	mtspr	SPRN_MAS0,r10
 	isync
@@ -398,18 +408,18 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT)
 	rldicl	r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
 	clrrdi	r15,r15,3
 	cmpdi	cr0,r14,0
+	bge	tlb_miss_huge_e6500	/* Bad pgd entry or hugepage */
+	bge	tlb_miss_huge_e6500	/* Bad pgd entry or hugepage; bail */
 	ldx	r14,r14,r15		/* grab pud entry */
 
 	rldicl	r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
 	clrrdi	r15,r15,3
 	cmpdi	cr0,r14,0
-	bge	tlb_miss_fault_e6500
+	bge	tlb_miss_huge_e6500
 	ldx	r14,r14,r15		/* Grab pmd entry */
 
 	mfspr	r10,SPRN_MAS0
 	cmpdi	cr0,r14,0
-	bge	tlb_miss_fault_e6500
+	bge	tlb_miss_huge_e6500
 
 	/* Now we build the MAS for a 2M indirect page:
 	 *
@@ -428,15 +438,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT)
 	clrrdi	r15,r16,21		/* make EA 2M-aligned */
 	mtspr	SPRN_MAS2,r15
 
-	lbz	r15,TCD_ESEL_NEXT(r11)
+tlb_miss_huge_done_e6500:
 	lbz	r16,TCD_ESEL_MAX(r11)
 	lbz	r14,TCD_ESEL_FIRST(r11)
-	rlwimi	r10,r15,16,0x00ff0000	/* insert esel_next into MAS0 */
-	addi	r15,r15,1		/* increment esel_next */
+	rlwimi	r10,r7,16,0x00ff0000	/* insert esel_next into MAS0 */
+	addi	r7,r7,1			/* increment esel_next */
 	mtspr	SPRN_MAS0,r10
-	cmpw	r15,r16
-	iseleq	r15,r14,r15		/* if next == last use first */
-	stb	r15,TCD_ESEL_NEXT(r11)
+	cmpw	r7,r16
+	iseleq	r7,r14,r7		/* if next == last use first */
+	stb	r7,TCD_ESEL_NEXT(r11)
 
 	tlbwe
 
@@ -456,6 +466,50 @@ END_FTR_SECTION_IFSET(CPU_FTR_SMT)
 	tlb_epilog_bolted
 	rfi
 
+tlb_miss_huge_e6500:
+	beq	tlb_miss_fault_e6500
+	li	r10,1
+	andi.	r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */
+	rldimi	r14,r10,63,0		/* Set PD_HUGE */
+	xor	r14,r14,r15		/* Clear size bits */
+	ldx	r14,0,r14
+
+	/*
+	 * Now we build the MAS for a huge page.
+	 *
+	 * MAS 0   :	ESEL needs to be filled by software round-robin
+	 *		 - can be handled by indirect code
+	 * MAS 1   :	Need to clear IND and set TSIZE
+	 * MAS 2,3+7:	Needs to be redone similar to non-tablewalk handler
+	 */
+
+	subi	r15,r15,10		/* Convert psize to tsize */
+	mfspr	r10,SPRN_MAS1
+	rlwinm	r10,r10,0,~MAS1_IND
+	rlwimi	r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK
+	mtspr	SPRN_MAS1,r10
+
+	li	r10,-0x400
+	sld	r15,r10,r15		/* Generate mask based on size */
+	and	r10,r16,r15
+	rldicr	r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
+	rlwimi	r10,r14,32-19,27,31	/* Insert WIMGE */
+	clrldi	r15,r15,PAGE_SHIFT	/* Clear crap at the top */
+	rlwimi	r15,r14,32-8,22,25	/* Move in U bits */
+	mtspr	SPRN_MAS2,r10
+	andi.	r10,r14,_PAGE_DIRTY
+	rlwimi	r15,r14,32-2,26,31	/* Move in BAP bits */
+
+	/* Mask out SW and UW if !DIRTY (XXX optimize this !) */
+	bne	1f
+	li	r10,MAS3_SW|MAS3_UW
+	andc	r15,r15,r10
+1:
+	mtspr	SPRN_MAS7_MAS3,r15
+
+	mfspr	r10,SPRN_MAS0
+	b	tlb_miss_huge_done_e6500
+
 tlb_miss_kernel_e6500:
 	ld	r14,PACA_KERNELPGD(r13)
 	cmpldi	cr1,r15,8		/* Check for vmalloc region */
diff --git a/kernel/arch/powerpc/mm/tlb_nohash.c b/kernel/arch/powerpc/mm/tlb_nohash.c
index cbd3d0698..bb04e4df3 100644
--- a/kernel/arch/powerpc/mm/tlb_nohash.c
+++ b/kernel/arch/powerpc/mm/tlb_nohash.c
@@ -42,6 +42,7 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/code-patching.h>
+#include <asm/cputhreads.h>
 #include <asm/hugetlb.h>
 #include <asm/paca.h>
 
@@ -217,7 +218,7 @@ static DEFINE_RAW_SPINLOCK(tlbivax_lock);
 static int mm_is_core_local(struct mm_struct *mm)
 {
 	return cpumask_subset(mm_cpumask(mm),
-			      topology_thread_cpumask(smp_processor_id()));
+			      topology_sibling_cpumask(smp_processor_id()));
 }
 
 struct tlb_flush_param {
@@ -628,10 +629,26 @@ static void early_init_this_mmu(void)
 #ifdef CONFIG_PPC_FSL_BOOK3E
 	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
 		unsigned int num_cams;
+		int __maybe_unused cpu = smp_processor_id();
+		bool map = true;
 
 		/* use a quarter of the TLBCAM for bolted linear map */
 		num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
-		linear_map_top = map_mem_in_cams(linear_map_top, num_cams);
+
+		/*
+		 * Only do the mapping once per core, or else the
+		 * transient mapping would cause problems.
+		 */
+#ifdef CONFIG_SMP
+		if (cpu != boot_cpuid &&
+		    (cpu != cpu_first_thread_sibling(cpu) ||
+		     cpu == cpu_first_thread_sibling(boot_cpuid)))
+			map = false;
+#endif
+
+		if (map)
+			linear_map_top = map_mem_in_cams(linear_map_top,
+							 num_cams, false);
 	}
 #endif
 
@@ -729,10 +746,14 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 	 * entries are supported though that may eventually
 	 * change.
 	 *
-	 * on FSL Embedded 64-bit, we adjust the RMA size to match the
-	 * first bolted TLB entry size.  We still limit max to 1G even if
-	 * the TLB could cover more.  This is due to what the early init
-	 * code is setup to do.
+	 * on FSL Embedded 64-bit, usually all RAM is bolted, but with
+	 * unusual memory sizes it's possible for some RAM to not be mapped
+	 * (such RAM is not used at all by Linux, since we don't support
+	 * highmem on 64-bit).  We limit ppc64_rma_size to what would be
+	 * mappable if this memblock is the only one.  Additional memblocks
+	 * can only increase, not decrease, the amount that ends up getting
+	 * mapped.  We still limit max to 1G even if we'll eventually map
+	 * more.  This is due to what the early init code is set up to do.
 	 *
 	 * We crop it to the size of the first MEMBLOCK to
 	 * avoid going over total available memory just in case...
@@ -740,8 +761,14 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 #ifdef CONFIG_PPC_FSL_BOOK3E
 	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
 		unsigned long linear_sz;
-		linear_sz = calc_cam_sz(first_memblock_size, PAGE_OFFSET,
-					first_memblock_base);
+		unsigned int num_cams;
+
+		/* use a quarter of the TLBCAM for bolted linear map */
+		num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
+
+		linear_sz = map_mem_in_cams(first_memblock_size, num_cams,
+					    true);
+
 		ppc64_rma_size = min_t(u64, linear_sz, 0x40000000);
 	} else
 #endif
diff --git a/kernel/arch/powerpc/mm/tlb_nohash_low.S b/kernel/arch/powerpc/mm/tlb_nohash_low.S
index 43ff3c797..68c477592 100644
--- a/kernel/arch/powerpc/mm/tlb_nohash_low.S
+++ b/kernel/arch/powerpc/mm/tlb_nohash_low.S
@@ -400,6 +400,7 @@ _GLOBAL(set_context)
  * extern void loadcam_entry(unsigned int index)
  *
  * Load TLBCAM[index] entry in to the L2 CAM MMU
+ * Must preserve r7, r8, r9, and r10
  */
 _GLOBAL(loadcam_entry)
 	mflr	r5
@@ -423,4 +424,66 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
 	tlbwe
 	isync
 	blr
+
+/*
+ * Load multiple TLB entries at once, using an alternate-space
+ * trampoline so that we don't have to care about whether the same
+ * TLB entry maps us before and after.
+ *
+ * r3 = first entry to write
+ * r4 = number of entries to write
+ * r5 = temporary tlb entry
+ */
+_GLOBAL(loadcam_multi)
+	mflr	r8
+
+	/*
+	 * Set up temporary TLB entry that is the same as what we're
+	 * running from, but in AS=1.
+	 */
+	bl	1f
+1:	mflr	r6
+	tlbsx	0,r8
+	mfspr	r6,SPRN_MAS1
+	ori	r6,r6,MAS1_TS
+	mtspr	SPRN_MAS1,r6
+	mfspr	r6,SPRN_MAS0
+	rlwimi	r6,r5,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK
+	mr	r7,r5
+	mtspr	SPRN_MAS0,r6
+	isync
+	tlbwe
+	isync
+
+	/* Switch to AS=1 */
+	mfmsr	r6
+	ori	r6,r6,MSR_IS|MSR_DS
+	mtmsr	r6
+	isync
+
+	mr	r9,r3
+	add	r10,r3,r4
+2:	bl	loadcam_entry
+	addi	r9,r9,1
+	cmpw	r9,r10
+	mr	r3,r9
+	blt	2b
+
+	/* Return to AS=0 and clear the temporary entry */
+	mfmsr	r6
+	rlwinm.	r6,r6,0,~(MSR_IS|MSR_DS)
+	mtmsr	r6
+	isync
+
+	li	r6,0
+	mtspr	SPRN_MAS1,r6
+	rlwinm	r6,r7,MAS0_ESEL_SHIFT,MAS0_ESEL_MASK
+	oris	r6,r6,MAS0_TLBSEL(1)@h
+	mtspr	SPRN_MAS0,r6
+	isync
+	tlbwe
+	isync
+
+	mtlr	r8
+	blr
 #endif