31 files changed, 11774 insertions, 0 deletions
diff --git a/kernel/arch/powerpc/platforms/powernv/Kconfig b/kernel/arch/powerpc/platforms/powernv/Kconfig
new file mode 100644
index 000000000..4b044d8cb
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/Kconfig
@@ -0,0 +1,21 @@
+config PPC_POWERNV
+	depends on PPC64 && PPC_BOOK3S
+	bool "IBM PowerNV (Non-Virtualized) platform support"
+	select PPC_NATIVE
+	select PPC_XICS
+	select PPC_ICP_NATIVE
+	select PPC_P7_NAP
+	select PPC_PCI_CHOICE if EMBEDDED
+	select EPAPR_BOOT
+	select PPC_INDIRECT_PIO
+	select PPC_UDBG_16550
+	select PPC_SCOM
+	select ARCH_RANDOM
+	select CPU_FREQ
+	select CPU_FREQ_GOV_PERFORMANCE
+	select CPU_FREQ_GOV_POWERSAVE
+	select CPU_FREQ_GOV_USERSPACE
+	select CPU_FREQ_GOV_ONDEMAND
+	select CPU_FREQ_GOV_CONSERVATIVE
+	select PPC_DOORBELL
+	default y
diff --git a/kernel/arch/powerpc/platforms/powernv/Makefile b/kernel/arch/powerpc/platforms/powernv/Makefile
new file mode 100644
index 000000000..33e44f372
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/Makefile
@@ -0,0 +1,11 @@
+obj-y			+= setup.o opal-wrappers.o opal.o opal-async.o
+obj-y			+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
+obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
+obj-y			+= opal-msglog.o opal-hmi.o opal-power.o
+
+obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
+obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
+obj-$(CONFIG_EEH)	+= eeh-powernv.o
+obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
+obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
+obj-$(CONFIG_TRACEPOINTS)	+= opal-tracepoints.o
diff --git a/kernel/arch/powerpc/platforms/powernv/eeh-powernv.c b/kernel/arch/powerpc/platforms/powernv/eeh-powernv.c
new file mode 100644
index 000000000..ce738ab3d
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -0,0 +1,1542 @@
+/*
+ * The file intends to implement the platform dependent EEH operations on
+ * powernv platform. Actually, the powernv was created in order to fully
+ * hypervisor support.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/atomic.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/firmware.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/opal.h>
+#include <asm/ppc-pci.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+static bool pnv_eeh_nb_init = false;
+
+/**
+ * pnv_eeh_init - EEH platform dependent initialization
+ *
+ * EEH platform dependent initialization on powernv
+ */
+static int pnv_eeh_init(void)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+
+	/* We require OPALv3 */
+	if (!firmware_has_feature(FW_FEATURE_OPALv3)) {
+		pr_warn("%s: OPALv3 is required !\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	/* Set probe mode */
+	eeh_add_flag(EEH_PROBE_MODE_DEV);
+
+	/*
+	 * P7IOC blocks PCI config access to frozen PE, but PHB3
+	 * doesn't do that. So we have to selectively enable I/O
+	 * prior to collecting error log.
+	 */
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		if (phb->model == PNV_PHB_MODEL_P7IOC)
+			eeh_add_flag(EEH_ENABLE_IO_FOR_LOG);
+
+		/*
+		 * PE#0 should be regarded as valid by EEH core
+		 * if it's not the reserved one. Currently, we
+		 * have the reserved PE#0 and PE#127 for PHB3
+		 * and P7IOC separately. So we should regard
+		 * PE#0 as valid for P7IOC.
+		 */
+		if (phb->ioda.reserved_pe != 0)
+			eeh_add_flag(EEH_VALID_PE_ZERO);
+
+		break;
+	}
+
+	return 0;
+}
+
+static int pnv_eeh_event(struct notifier_block *nb,
+			 unsigned long events, void *change)
+{
+	uint64_t changed_evts = (uint64_t)change;
+
+	/*
+	 * We simply send special EEH event if EEH has
+	 * been enabled, or clear pending events in
+	 * case that we enable EEH soon
+	 */
+	if (!(changed_evts & OPAL_EVENT_PCI_ERROR) ||
+	    !(events & OPAL_EVENT_PCI_ERROR))
+		return 0;
+
+	if (eeh_enabled())
+		eeh_send_failure_event(NULL);
+	else
+		opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
+
+	return 0;
+}
+
+static struct notifier_block pnv_eeh_nb = {
+	.notifier_call	= pnv_eeh_event,
+	.next		= NULL,
+	.priority	= 0
+};
+
+#ifdef CONFIG_DEBUG_FS
+static ssize_t pnv_eeh_ei_write(struct file *filp,
+				const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	struct pci_controller *hose = filp->private_data;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int pe_no, type, func;
+	unsigned long addr, mask;
+	char buf[50];
+	int ret;
+
+	if (!eeh_ops || !eeh_ops->err_inject)
+		return -ENXIO;
+
+	/* Copy over argument buffer */
+	ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
+	if (!ret)
+		return -EFAULT;
+
+	/* Retrieve parameters */
+	ret = sscanf(buf, "%x:%x:%x:%lx:%lx",
+		     &pe_no, &type, &func, &addr, &mask);
+	if (ret != 5)
+		return -EINVAL;
+
+	/* Retrieve PE */
+	edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+	if (!edev)
+		return -ENOMEM;
+	edev->phb = hose;
+	edev->pe_config_addr = pe_no;
+	pe = eeh_pe_get(edev);
+	kfree(edev);
+	if (!pe)
+		return -ENODEV;
+
+	/* Do error injection */
+	ret = eeh_ops->err_inject(pe, type, func, addr, mask);
+	return ret < 0 ? ret : count;
+}
+
+static const struct file_operations pnv_eeh_ei_fops = {
+	.open	= simple_open,
+	.llseek	= no_llseek,
+	.write	= pnv_eeh_ei_write,
+};
+
+static int pnv_eeh_dbgfs_set(void *data, int offset, u64 val)
+{
+	struct pci_controller *hose = data;
+	struct pnv_phb *phb = hose->private_data;
+
+	out_be64(phb->regs + offset, val);
+	return 0;
+}
+
+static int pnv_eeh_dbgfs_get(void *data, int offset, u64 *val)
+{
+	struct pci_controller *hose = data;
+	struct pnv_phb *phb = hose->private_data;
+
+	*val = in_be64(phb->regs + offset);
+	return 0;
+}
+
+static int pnv_eeh_outb_dbgfs_set(void *data, u64 val)
+{
+	return pnv_eeh_dbgfs_set(data, 0xD10, val);
+}
+
+static int pnv_eeh_outb_dbgfs_get(void *data, u64 *val)
+{
+	return pnv_eeh_dbgfs_get(data, 0xD10, val);
+}
+
+static int pnv_eeh_inbA_dbgfs_set(void *data, u64 val)
+{
+	return pnv_eeh_dbgfs_set(data, 0xD90, val);
+}
+
+static int pnv_eeh_inbA_dbgfs_get(void *data, u64 *val)
+{
+	return pnv_eeh_dbgfs_get(data, 0xD90, val);
+}
+
+static int pnv_eeh_inbB_dbgfs_set(void *data, u64 val)
+{
+	return pnv_eeh_dbgfs_set(data, 0xE10, val);
+}
+
+static int pnv_eeh_inbB_dbgfs_get(void *data, u64 *val)
+{
+	return pnv_eeh_dbgfs_get(data, 0xE10, val);
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_outb_dbgfs_ops, pnv_eeh_outb_dbgfs_get,
+			pnv_eeh_outb_dbgfs_set, "0x%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbA_dbgfs_ops, pnv_eeh_inbA_dbgfs_get,
+			pnv_eeh_inbA_dbgfs_set, "0x%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbB_dbgfs_ops, pnv_eeh_inbB_dbgfs_get,
+			pnv_eeh_inbB_dbgfs_set, "0x%llx\n");
+#endif /* CONFIG_DEBUG_FS */
+
+/**
+ * pnv_eeh_post_init - EEH platform dependent post initialization
+ *
+ * EEH platform dependent post initialization on powernv. When
+ * the function is called, the EEH PEs and devices should have
+ * been built. If the I/O cache staff has been built, EEH is
+ * ready to supply service.
+ */
+static int pnv_eeh_post_init(void)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	int ret = 0;
+
+	/* Register OPAL event notifier */
+	if (!pnv_eeh_nb_init) {
+		ret = opal_notifier_register(&pnv_eeh_nb);
+		if (ret) {
+			pr_warn("%s: Can't register OPAL event notifier (%d)\n",
+				__func__, ret);
+			return ret;
+		}
+
+		pnv_eeh_nb_init = true;
+	}
+
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		/*
+		 * If EEH is enabled, we're going to rely on that.
+		 * Otherwise, we restore to conventional mechanism
+		 * to clear frozen PE during PCI config access.
+		 */
+		if (eeh_enabled())
+			phb->flags |= PNV_PHB_FLAG_EEH;
+		else
+			phb->flags &= ~PNV_PHB_FLAG_EEH;
+
+		/* Create debugfs entries */
+#ifdef CONFIG_DEBUG_FS
+		if (phb->has_dbgfs || !phb->dbgfs)
+			continue;
+
+		phb->has_dbgfs = 1;
+		debugfs_create_file("err_injct", 0200,
+				    phb->dbgfs, hose,
+				    &pnv_eeh_ei_fops);
+
+		debugfs_create_file("err_injct_outbound", 0600,
+				    phb->dbgfs, hose,
+				    &pnv_eeh_outb_dbgfs_ops);
+		debugfs_create_file("err_injct_inboundA", 0600,
+				    phb->dbgfs, hose,
+				    &pnv_eeh_inbA_dbgfs_ops);
+		debugfs_create_file("err_injct_inboundB", 0600,
+				    phb->dbgfs, hose,
+				    &pnv_eeh_inbB_dbgfs_ops);
+#endif /* CONFIG_DEBUG_FS */
+	}
+
+
+	return ret;
+}
+
+static int pnv_eeh_cap_start(struct pci_dn *pdn)
+{
+	u32 status;
+
+	if (!pdn)
+		return 0;
+
+	pnv_pci_cfg_read(pdn, PCI_STATUS, 2, &status);
+	if (!(status & PCI_STATUS_CAP_LIST))
+		return 0;
+
+	return PCI_CAPABILITY_LIST;
+}
+
+static int pnv_eeh_find_cap(struct pci_dn *pdn, int cap)
+{
+	int pos = pnv_eeh_cap_start(pdn);
+	int cnt = 48;   /* Maximal number of capabilities */
+	u32 id;
+
+	if (!pos)
+		return 0;
+
+	while (cnt--) {
+		pnv_pci_cfg_read(pdn, pos, 1, &pos);
+		if (pos < 0x40)
+			break;
+
+		pos &= ~3;
+		pnv_pci_cfg_read(pdn, pos + PCI_CAP_LIST_ID, 1, &id);
+		if (id == 0xff)
+			break;
+
+		/* Found */
+		if (id == cap)
+			return pos;
+
+		/* Next one */
+		pos += PCI_CAP_LIST_NEXT;
+	}
+
+	return 0;
+}
+
+static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	u32 header;
+	int pos = 256, ttl = (4096 - 256) / 8;
+
+	if (!edev || !edev->pcie_cap)
+		return 0;
+	if (pnv_pci_cfg_read(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
+		return 0;
+	else if (!header)
+		return 0;
+
+	while (ttl-- > 0) {
+		if (PCI_EXT_CAP_ID(header) == cap && pos)
+			return pos;
+
+		pos = PCI_EXT_CAP_NEXT(header);
+		if (pos < 256)
+			break;
+
+		if (pnv_pci_cfg_read(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL)
+			break;
+	}
+
+	return 0;
+}
+
+/**
+ * pnv_eeh_probe - Do probe on PCI device
+ * @pdn: PCI device node
+ * @data: unused
+ *
+ * When EEH module is installed during system boot, all PCI devices
+ * are checked one by one to see if it supports EEH. The function
+ * is introduced for the purpose. By default, EEH has been enabled
+ * on all PCI devices. That's to say, we only need do necessary
+ * initialization on the corresponding eeh device and create PE
+ * accordingly.
+ *
+ * It's notable that's unsafe to retrieve the EEH device through
+ * the corresponding PCI device. During the PCI device hotplug, which
+ * was possiblly triggered by EEH core, the binding between EEH device
+ * and the PCI device isn't built yet.
+ */
+static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
+{
+	struct pci_controller *hose = pdn->phb;
+	struct pnv_phb *phb = hose->private_data;
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	uint32_t pcie_flags;
+	int ret;
+
+	/*
+	 * When probing the root bridge, which doesn't have any
+	 * subordinate PCI devices. We don't have OF node for
+	 * the root bridge. So it's not reasonable to continue
+	 * the probing.
+	 */
+	if (!edev || edev->pe)
+		return NULL;
+
+	/* Skip for PCI-ISA bridge */
+	if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
+		return NULL;
+
+	/* Initialize eeh device */
+	edev->class_code = pdn->class_code;
+	edev->mode	&= 0xFFFFFF00;
+	edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX);
+	edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
+	edev->aer_cap  = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
+	if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
+		edev->mode |= EEH_DEV_BRIDGE;
+		if (edev->pcie_cap) {
+			pnv_pci_cfg_read(pdn, edev->pcie_cap + PCI_EXP_FLAGS,
+					 2, &pcie_flags);
+			pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4;
+			if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT)
+				edev->mode |= EEH_DEV_ROOT_PORT;
+			else if (pcie_flags == PCI_EXP_TYPE_DOWNSTREAM)
+				edev->mode |= EEH_DEV_DS_PORT;
+		}
+	}
+
+	edev->config_addr    = (pdn->busno << 8) | (pdn->devfn);
+	edev->pe_config_addr = phb->ioda.pe_rmap[edev->config_addr];
+
+	/* Create PE */
+	ret = eeh_add_to_parent_pe(edev);
+	if (ret) {
+		pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%d)\n",
+			__func__, hose->global_number, pdn->busno,
+			PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret);
+		return NULL;
+	}
+
+	/*
+	 * If the PE contains any one of following adapters, the
+	 * PCI config space can't be accessed when dumping EEH log.
+	 * Otherwise, we will run into fenced PHB caused by shortage
+	 * of outbound credits in the adapter. The PCI config access
+	 * should be blocked until PE reset. MMIO access is dropped
+	 * by hardware certainly. In order to drop PCI config requests,
+	 * one more flag (EEH_PE_CFG_RESTRICTED) is introduced, which
+	 * will be checked in the backend for PE state retrival. If
+	 * the PE becomes frozen for the first time and the flag has
+	 * been set for the PE, we will set EEH_PE_CFG_BLOCKED for
+	 * that PE to block its config space.
+	 *
+	 * Broadcom Austin 4-ports NICs (14e4:1657)
+	 * Broadcom Shiner 2-ports 10G NICs (14e4:168e)
+	 */
+	if ((pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
+	     pdn->device_id == 0x1657) ||
+	    (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM &&
+	     pdn->device_id == 0x168e))
+		edev->pe->state |= EEH_PE_CFG_RESTRICTED;
+
+	/*
+	 * Cache the PE primary bus, which can't be fetched when
+	 * full hotplug is in progress. In that case, all child
+	 * PCI devices of the PE are expected to be removed prior
+	 * to PE reset.
+	 */
+	if (!edev->pe->bus)
+		edev->pe->bus = pci_find_bus(hose->global_number,
+					     pdn->busno);
+
+	/*
+	 * Enable EEH explicitly so that we will do EEH check
+	 * while accessing I/O stuff
+	 */
+	eeh_add_flag(EEH_ENABLED);
+
+	/* Save memory bars */
+	eeh_save_bars(edev);
+
+	return NULL;
+}
+
+/**
+ * pnv_eeh_set_option - Initialize EEH or MMIO/DMA reenable
+ * @pe: EEH PE
+ * @option: operation to be issued
+ *
+ * The function is used to control the EEH functionality globally.
+ * Currently, following options are support according to PAPR:
+ * Enable EEH, Disable EEH, Enable MMIO and Enable DMA
+ */
+static int pnv_eeh_set_option(struct eeh_pe *pe, int option)
+{
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+	bool freeze_pe = false;
+	int opt, ret = 0;
+	s64 rc;
+
+	/* Sanity check on option */
+	switch (option) {
+	case EEH_OPT_DISABLE:
+		return -EPERM;
+	case EEH_OPT_ENABLE:
+		return 0;
+	case EEH_OPT_THAW_MMIO:
+		opt = OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO;
+		break;
+	case EEH_OPT_THAW_DMA:
+		opt = OPAL_EEH_ACTION_CLEAR_FREEZE_DMA;
+		break;
+	case EEH_OPT_FREEZE_PE:
+		freeze_pe = true;
+		opt = OPAL_EEH_ACTION_SET_FREEZE_ALL;
+		break;
+	default:
+		pr_warn("%s: Invalid option %d\n", __func__, option);
+		return -EINVAL;
+	}
+
+	/* If PHB supports compound PE, to handle it */
+	if (freeze_pe) {
+		if (phb->freeze_pe) {
+			phb->freeze_pe(phb, pe->addr);
+		} else {
+			rc = opal_pci_eeh_freeze_set(phb->opal_id,
+						     pe->addr, opt);
+			if (rc != OPAL_SUCCESS) {
+				pr_warn("%s: Failure %lld freezing "
+					"PHB#%x-PE#%x\n",
+					__func__, rc,
+					phb->hose->global_number, pe->addr);
+				ret = -EIO;
+			}
+		}
+	} else {
+		if (phb->unfreeze_pe) {
+			ret = phb->unfreeze_pe(phb, pe->addr, opt);
+		} else {
+			rc = opal_pci_eeh_freeze_clear(phb->opal_id,
+						       pe->addr, opt);
+			if (rc != OPAL_SUCCESS) {
+				pr_warn("%s: Failure %lld enable %d "
+					"for PHB#%x-PE#%x\n",
+					__func__, rc, option,
+					phb->hose->global_number, pe->addr);
+				ret = -EIO;
+			}
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * pnv_eeh_get_pe_addr - Retrieve PE address
+ * @pe: EEH PE
+ *
+ * Retrieve the PE address according to the given tranditional
+ * PCI BDF (Bus/Device/Function) address.
+ */
+static int pnv_eeh_get_pe_addr(struct eeh_pe *pe)
+{
+	return pe->addr;
+}
+
+static void pnv_eeh_get_phb_diag(struct eeh_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb->private_data;
+	s64 rc;
+
+	rc = opal_pci_get_phb_diag_data2(phb->opal_id, pe->data,
+					 PNV_PCI_DIAG_BUF_SIZE);
+	if (rc != OPAL_SUCCESS)
+		pr_warn("%s: Failure %lld getting PHB#%x diag-data\n",
+			__func__, rc, pe->phb->global_number);
+}
+
+static int pnv_eeh_get_phb_state(struct eeh_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb->private_data;
+	u8 fstate;
+	__be16 pcierr;
+	s64 rc;
+	int result = 0;
+
+	rc = opal_pci_eeh_freeze_status(phb->opal_id,
+					pe->addr,
+					&fstate,
+					&pcierr,
+					NULL);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld getting PHB#%x state\n",
+			__func__, rc, phb->hose->global_number);
+		return EEH_STATE_NOT_SUPPORT;
+	}
+
+	/*
+	 * Check PHB state. If the PHB is frozen for the
+	 * first time, to dump the PHB diag-data.
+	 */
+	if (be16_to_cpu(pcierr) != OPAL_EEH_PHB_ERROR) {
+		result = (EEH_STATE_MMIO_ACTIVE  |
+			  EEH_STATE_DMA_ACTIVE   |
+			  EEH_STATE_MMIO_ENABLED |
+			  EEH_STATE_DMA_ENABLED);
+	} else if (!(pe->state & EEH_PE_ISOLATED)) {
+		eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+		pnv_eeh_get_phb_diag(pe);
+
+		if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+			pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
+	}
+
+	return result;
+}
+
+static int pnv_eeh_get_pe_state(struct eeh_pe *pe)
+{
+	struct pnv_phb *phb = pe->phb->private_data;
+	u8 fstate;
+	__be16 pcierr;
+	s64 rc;
+	int result;
+
+	/*
+	 * We don't clobber hardware frozen state until PE
+	 * reset is completed. In order to keep EEH core
+	 * moving forward, we have to return operational
+	 * state during PE reset.
+	 */
+	if (pe->state & EEH_PE_RESET) {
+		result = (EEH_STATE_MMIO_ACTIVE  |
+			  EEH_STATE_DMA_ACTIVE   |
+			  EEH_STATE_MMIO_ENABLED |
+			  EEH_STATE_DMA_ENABLED);
+		return result;
+	}
+
+	/*
+	 * Fetch PE state from hardware. If the PHB
+	 * supports compound PE, let it handle that.
+	 */
+	if (phb->get_pe_state) {
+		fstate = phb->get_pe_state(phb, pe->addr);
+	} else {
+		rc = opal_pci_eeh_freeze_status(phb->opal_id,
+						pe->addr,
+						&fstate,
+						&pcierr,
+						NULL);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Failure %lld getting PHB#%x-PE%x state\n",
+				__func__, rc, phb->hose->global_number,
+				pe->addr);
+			return EEH_STATE_NOT_SUPPORT;
+		}
+	}
+
+	/* Figure out state */
+	switch (fstate) {
+	case OPAL_EEH_STOPPED_NOT_FROZEN:
+		result = (EEH_STATE_MMIO_ACTIVE  |
+			  EEH_STATE_DMA_ACTIVE   |
+			  EEH_STATE_MMIO_ENABLED |
+			  EEH_STATE_DMA_ENABLED);
+		break;
+	case OPAL_EEH_STOPPED_MMIO_FREEZE:
+		result = (EEH_STATE_DMA_ACTIVE |
+			  EEH_STATE_DMA_ENABLED);
+		break;
+	case OPAL_EEH_STOPPED_DMA_FREEZE:
+		result = (EEH_STATE_MMIO_ACTIVE |
+			  EEH_STATE_MMIO_ENABLED);
+		break;
+	case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE:
+		result = 0;
+		break;
+	case OPAL_EEH_STOPPED_RESET:
+		result = EEH_STATE_RESET_ACTIVE;
+		break;
+	case OPAL_EEH_STOPPED_TEMP_UNAVAIL:
+		result = EEH_STATE_UNAVAILABLE;
+		break;
+	case OPAL_EEH_STOPPED_PERM_UNAVAIL:
+		result = EEH_STATE_NOT_SUPPORT;
+		break;
+	default:
+		result = EEH_STATE_NOT_SUPPORT;
+		pr_warn("%s: Invalid PHB#%x-PE#%x state %x\n",
+			__func__, phb->hose->global_number,
+			pe->addr, fstate);
+	}
+
+	/*
+	 * If PHB supports compound PE, to freeze all
+	 * slave PEs for consistency.
+	 *
+	 * If the PE is switching to frozen state for the
+	 * first time, to dump the PHB diag-data.
+	 */
+	if (!(result & EEH_STATE_NOT_SUPPORT) &&
+	    !(result & EEH_STATE_UNAVAILABLE) &&
+	    !(result & EEH_STATE_MMIO_ACTIVE) &&
+	    !(result & EEH_STATE_DMA_ACTIVE)  &&
+	    !(pe->state & EEH_PE_ISOLATED)) {
+		if (phb->freeze_pe)
+			phb->freeze_pe(phb, pe->addr);
+
+		eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+		pnv_eeh_get_phb_diag(pe);
+
+		if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+			pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
+	}
+
+	return result;
+}
+
+/**
+ * pnv_eeh_get_state - Retrieve PE state
+ * @pe: EEH PE
+ * @delay: delay while PE state is temporarily unavailable
+ *
+ * Retrieve the state of the specified PE. For IODA-compitable
+ * platform, it should be retrieved from IODA table. Therefore,
+ * we prefer passing down to hardware implementation to handle
+ * it.
+ */
+static int pnv_eeh_get_state(struct eeh_pe *pe, int *delay)
+{
+	int ret;
+
+	if (pe->type & EEH_PE_PHB)
+		ret = pnv_eeh_get_phb_state(pe);
+	else
+		ret = pnv_eeh_get_pe_state(pe);
+
+	if (!delay)
+		return ret;
+
+	/*
+	 * If the PE state is temporarily unavailable,
+	 * to inform the EEH core delay for default
+	 * period (1 second)
+	 */
+	*delay = 0;
+	if (ret & EEH_STATE_UNAVAILABLE)
+		*delay = 1000;
+
+	return ret;
+}
+
+static s64 pnv_eeh_phb_poll(struct pnv_phb *phb)
+{
+	s64 rc = OPAL_HARDWARE;
+
+	while (1) {
+		rc = opal_pci_poll(phb->opal_id);
+		if (rc <= 0)
+			break;
+
+		if (system_state < SYSTEM_RUNNING)
+			udelay(1000 * rc);
+		else
+			msleep(rc);
+	}
+
+	return rc;
+}
+
+int pnv_eeh_phb_reset(struct pci_controller *hose, int option)
+{
+	struct pnv_phb *phb = hose->private_data;
+	s64 rc = OPAL_HARDWARE;
+
+	pr_debug("%s: Reset PHB#%x, option=%d\n",
+		 __func__, hose->global_number, option);
+
+	/* Issue PHB complete reset request */
+	if (option == EEH_RESET_FUNDAMENTAL ||
+	    option == EEH_RESET_HOT)
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_RESET_PHB_COMPLETE,
+				    OPAL_ASSERT_RESET);
+	else if (option == EEH_RESET_DEACTIVATE)
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_RESET_PHB_COMPLETE,
+				    OPAL_DEASSERT_RESET);
+	if (rc < 0)
+		goto out;
+
+	/*
+	 * Poll state of the PHB until the request is done
+	 * successfully. The PHB reset is usually PHB complete
+	 * reset followed by hot reset on root bus. So we also
+	 * need the PCI bus settlement delay.
+	 */
+	rc = pnv_eeh_phb_poll(phb);
+	if (option == EEH_RESET_DEACTIVATE) {
+		if (system_state < SYSTEM_RUNNING)
+			udelay(1000 * EEH_PE_RST_SETTLE_TIME);
+		else
+			msleep(EEH_PE_RST_SETTLE_TIME);
+	}
+out:
+	if (rc != OPAL_SUCCESS)
+		return -EIO;
+
+	return 0;
+}
+
+static int pnv_eeh_root_reset(struct pci_controller *hose, int option)
+{
+	struct pnv_phb *phb = hose->private_data;
+	s64 rc = OPAL_HARDWARE;
+
+	pr_debug("%s: Reset PHB#%x, option=%d\n",
+		 __func__, hose->global_number, option);
+
+	/*
+	 * During the reset deassert time, we needn't care
+	 * the reset scope because the firmware does nothing
+	 * for fundamental or hot reset during deassert phase.
+	 */
+	if (option == EEH_RESET_FUNDAMENTAL)
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_RESET_PCI_FUNDAMENTAL,
+				    OPAL_ASSERT_RESET);
+	else if (option == EEH_RESET_HOT)
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_RESET_PCI_HOT,
+				    OPAL_ASSERT_RESET);
+	else if (option == EEH_RESET_DEACTIVATE)
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_RESET_PCI_HOT,
+				    OPAL_DEASSERT_RESET);
+	if (rc < 0)
+		goto out;
+
+	/* Poll state of the PHB until the request is done */
+	rc = pnv_eeh_phb_poll(phb);
+	if (option == EEH_RESET_DEACTIVATE)
+		msleep(EEH_PE_RST_SETTLE_TIME);
+out:
+	if (rc != OPAL_SUCCESS)
+		return -EIO;
+
+	return 0;
+}
+
+static int pnv_eeh_bridge_reset(struct pci_dev *dev, int option)
+{
+	struct pci_dn *pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	int aer = edev ? edev->aer_cap : 0;
+	u32 ctrl;
+
+	pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
+		 __func__, pci_domain_nr(dev->bus),
+		 dev->bus->number, option);
+
+	switch (option) {
+	case EEH_RESET_FUNDAMENTAL:
+	case EEH_RESET_HOT:
+		/* Don't report linkDown event */
+		if (aer) {
+			eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+					     4, &ctrl);
+			ctrl |= PCI_ERR_UNC_SURPDN;
+			eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+					      4, ctrl);
+		}
+
+		eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl);
+		ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
+		eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl);
+
+		msleep(EEH_PE_RST_HOLD_TIME);
+		break;
+	case EEH_RESET_DEACTIVATE:
+		eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl);
+		ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
+		eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl);
+
+		msleep(EEH_PE_RST_SETTLE_TIME);
+
+		/* Continue reporting linkDown event */
+		if (aer) {
+			eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+					     4, &ctrl);
+			ctrl &= ~PCI_ERR_UNC_SURPDN;
+			eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+					      4, ctrl);
+		}
+
+		break;
+	}
+
+	return 0;
+}
+
+void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
+{
+	struct pci_controller *hose;
+
+	if (pci_is_root_bus(dev->bus)) {
+		hose = pci_bus_to_host(dev->bus);
+		pnv_eeh_root_reset(hose, EEH_RESET_HOT);
+		pnv_eeh_root_reset(hose, EEH_RESET_DEACTIVATE);
+	} else {
+		pnv_eeh_bridge_reset(dev, EEH_RESET_HOT);
+		pnv_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE);
+	}
+}
+
+/**
+ * pnv_eeh_reset - Reset the specified PE
+ * @pe: EEH PE
+ * @option: reset option
+ *
+ * Do reset on the indicated PE. For PCI bus sensitive PE,
+ * we need to reset the parent p2p bridge. The PHB has to
+ * be reinitialized if the p2p bridge is root bridge. For
+ * PCI device sensitive PE, we will try to reset the device
+ * through FLR. For now, we don't have OPAL APIs to do HARD
+ * reset yet, so all reset would be SOFT (HOT) reset.
+ */
+static int pnv_eeh_reset(struct eeh_pe *pe, int option)
+{
+	struct pci_controller *hose = pe->phb;
+	struct pci_bus *bus;
+	int ret;
+
+	/*
+	 * For PHB reset, we always have complete reset. For those PEs whose
+	 * primary bus derived from root complex (root bus) or root port
+	 * (usually bus#1), we apply hot or fundamental reset on the root port.
+	 * For other PEs, we always have hot reset on the PE primary bus.
+	 *
+	 * Here, we have different design to pHyp, which always clear the
+	 * frozen state during PE reset. However, the good idea here from
+	 * benh is to keep frozen state before we get PE reset done completely
+	 * (until BAR restore). With the frozen state, HW drops illegal IO
+	 * or MMIO access, which can incur recrusive frozen PE during PE
+	 * reset. The side effect is that EEH core has to clear the frozen
+	 * state explicitly after BAR restore.
+	 */
+	if (pe->type & EEH_PE_PHB) {
+		ret = pnv_eeh_phb_reset(hose, option);
+	} else {
+		struct pnv_phb *phb;
+		s64 rc;
+
+		/*
+		 * The frozen PE might be caused by PAPR error injection
+		 * registers, which are expected to be cleared after hitting
+		 * frozen PE as stated in the hardware spec. Unfortunately,
+		 * that's not true on P7IOC. So we have to clear it manually
+		 * to avoid recursive EEH errors during recovery.
+		 */
+		phb = hose->private_data;
+		if (phb->model == PNV_PHB_MODEL_P7IOC &&
+		    (option == EEH_RESET_HOT ||
+		    option == EEH_RESET_FUNDAMENTAL)) {
+			rc = opal_pci_reset(phb->opal_id,
+					    OPAL_RESET_PHB_ERROR,
+					    OPAL_ASSERT_RESET);
+			if (rc != OPAL_SUCCESS) {
+				pr_warn("%s: Failure %lld clearing "
+					"error injection registers\n",
+					__func__, rc);
+				return -EIO;
+			}
+		}
+
+		bus = eeh_pe_bus_get(pe);
+		if (pci_is_root_bus(bus) ||
+			pci_is_root_bus(bus->parent))
+			ret = pnv_eeh_root_reset(hose, option);
+		else
+			ret = pnv_eeh_bridge_reset(bus->self, option);
+	}
+
+	return ret;
+}
+
+/**
+ * pnv_eeh_wait_state - Wait for PE state
+ * @pe: EEH PE
+ * @max_wait: maximal period in microsecond
+ *
+ * Wait for the state of associated PE. It might take some time
+ * to retrieve the PE's state.
+ */
+static int pnv_eeh_wait_state(struct eeh_pe *pe, int max_wait)
+{
+	int ret;
+	int mwait;
+
+	while (1) {
+		ret = pnv_eeh_get_state(pe, &mwait);
+
+		/*
+		 * If the PE's state is temporarily unavailable,
+		 * we have to wait for the specified time. Otherwise,
+		 * the PE's state will be returned immediately.
+		 */
+		if (ret != EEH_STATE_UNAVAILABLE)
+			return ret;
+
+		max_wait -= mwait;
+		if (max_wait <= 0) {
+			pr_warn("%s: Timeout getting PE#%x's state (%d)\n",
+				__func__, pe->addr, max_wait);
+			return EEH_STATE_NOT_SUPPORT;
+		}
+
+		msleep(mwait);
+	}
+
+	return EEH_STATE_NOT_SUPPORT;
+}
+
+/**
+ * pnv_eeh_get_log - Retrieve error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ * @drv_log: driver log to be combined with retrieved error log
+ * @len: length of driver log
+ *
+ * Retrieve the temporary or permanent error from the PE.
+ */
+static int pnv_eeh_get_log(struct eeh_pe *pe, int severity,
+			   char *drv_log, unsigned long len)
+{
+	if (!eeh_has_flag(EEH_EARLY_DUMP_LOG))
+		pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
+
+	return 0;
+}
+
+/**
+ * pnv_eeh_configure_bridge - Configure PCI bridges in the indicated PE
+ * @pe: EEH PE
+ *
+ * The function will be called to reconfigure the bridges included
+ * in the specified PE so that the mulfunctional PE would be recovered
+ * again.
+ */
+static int pnv_eeh_configure_bridge(struct eeh_pe *pe)
+{
+	return 0;
+}
+
+/**
+ * pnv_pe_err_inject - Inject specified error to the indicated PE
+ * @pe: the indicated PE
+ * @type: error type
+ * @func: specific error type
+ * @addr: address
+ * @mask: address mask
+ *
+ * The routine is called to inject specified error, which is
+ * determined by @type and @func, to the indicated PE for
+ * testing purpose.
+ */
+static int pnv_eeh_err_inject(struct eeh_pe *pe, int type, int func,
+			      unsigned long addr, unsigned long mask)
+{
+	struct pci_controller *hose = pe->phb;
+	struct pnv_phb *phb = hose->private_data;
+	s64 rc;
+
+	/* Sanity check on error type */
+	if (type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR &&
+	    type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) {
+		pr_warn("%s: Invalid error type %d\n",
+			__func__, type);
+		return -ERANGE;
+	}
+
+	if (func < OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_ADDR ||
+	    func > OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_TARGET) {
+		pr_warn("%s: Invalid error function %d\n",
+			__func__, func);
+		return -ERANGE;
+	}
+
+	/* Firmware supports error injection ? */
+	if (!opal_check_token(OPAL_PCI_ERR_INJECT)) {
+		pr_warn("%s: Firmware doesn't support error injection\n",
+			__func__);
+		return -ENXIO;
+	}
+
+	/* Do error injection */
+	rc = opal_pci_err_inject(phb->opal_id, pe->addr,
+				 type, func, addr, mask);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld injecting error "
+			"%d-%d to PHB#%x-PE#%x\n",
+			__func__, rc, type, func,
+			hose->global_number, pe->addr);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static inline bool pnv_eeh_cfg_blocked(struct pci_dn *pdn)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+
+	if (!edev || !edev->pe)
+		return false;
+
+	if (edev->pe->state & EEH_PE_CFG_BLOCKED)
+		return true;
+
+	return false;
+}
+
+static int pnv_eeh_read_config(struct pci_dn *pdn,
+			       int where, int size, u32 *val)
+{
+	if (!pdn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (pnv_eeh_cfg_blocked(pdn)) {
+		*val = 0xFFFFFFFF;
+		return PCIBIOS_SET_FAILED;
+	}
+
+	return pnv_pci_cfg_read(pdn, where, size, val);
+}
+
+static int pnv_eeh_write_config(struct pci_dn *pdn,
+				int where, int size, u32 val)
+{
+	if (!pdn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (pnv_eeh_cfg_blocked(pdn))
+		return PCIBIOS_SET_FAILED;
+
+	return pnv_pci_cfg_write(pdn, where, size, val);
+}
+
+static void pnv_eeh_dump_hub_diag_common(struct OpalIoP7IOCErrorData *data)
+{
+	/* GEM */
+	if (data->gemXfir || data->gemRfir ||
+	    data->gemRirqfir || data->gemMask || data->gemRwof)
+		pr_info("  GEM: %016llx %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->gemXfir),
+			be64_to_cpu(data->gemRfir),
+			be64_to_cpu(data->gemRirqfir),
+			be64_to_cpu(data->gemMask),
+			be64_to_cpu(data->gemRwof));
+
+	/* LEM */
+	if (data->lemFir || data->lemErrMask ||
+	    data->lemAction0 || data->lemAction1 || data->lemWof)
+		pr_info("  LEM: %016llx %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->lemFir),
+			be64_to_cpu(data->lemErrMask),
+			be64_to_cpu(data->lemAction0),
+			be64_to_cpu(data->lemAction1),
+			be64_to_cpu(data->lemWof));
+}
+
+static void pnv_eeh_get_and_dump_hub_diag(struct pci_controller *hose)
+{
+	struct pnv_phb *phb = hose->private_data;
+	struct OpalIoP7IOCErrorData *data = &phb->diag.hub_diag;
+	long rc;
+
+	rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data));
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failed to get HUB#%llx diag-data (%ld)\n",
+			__func__, phb->hub_id, rc);
+		return;
+	}
+
+	switch (data->type) {
+	case OPAL_P7IOC_DIAG_TYPE_RGC:
+		pr_info("P7IOC diag-data for RGC\n\n");
+		pnv_eeh_dump_hub_diag_common(data);
+		if (data->rgc.rgcStatus || data->rgc.rgcLdcp)
+			pr_info("  RGC: %016llx %016llx\n",
+				be64_to_cpu(data->rgc.rgcStatus),
+				be64_to_cpu(data->rgc.rgcLdcp));
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_BI:
+		pr_info("P7IOC diag-data for BI %s\n\n",
+			data->bi.biDownbound ? "Downbound" : "Upbound");
+		pnv_eeh_dump_hub_diag_common(data);
+		if (data->bi.biLdcp0 || data->bi.biLdcp1 ||
+		    data->bi.biLdcp2 || data->bi.biFenceStatus)
+			pr_info("  BI:  %016llx %016llx %016llx %016llx\n",
+				be64_to_cpu(data->bi.biLdcp0),
+				be64_to_cpu(data->bi.biLdcp1),
+				be64_to_cpu(data->bi.biLdcp2),
+				be64_to_cpu(data->bi.biFenceStatus));
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_CI:
+		pr_info("P7IOC diag-data for CI Port %d\n\n",
+			data->ci.ciPort);
+		pnv_eeh_dump_hub_diag_common(data);
+		if (data->ci.ciPortStatus || data->ci.ciPortLdcp)
+			pr_info("  CI:  %016llx %016llx\n",
+				be64_to_cpu(data->ci.ciPortStatus),
+				be64_to_cpu(data->ci.ciPortLdcp));
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_MISC:
+		pr_info("P7IOC diag-data for MISC\n\n");
+		pnv_eeh_dump_hub_diag_common(data);
+		break;
+	case OPAL_P7IOC_DIAG_TYPE_I2C:
+		pr_info("P7IOC diag-data for I2C\n\n");
+		pnv_eeh_dump_hub_diag_common(data);
+		break;
+	default:
+		pr_warn("%s: Invalid type of HUB#%llx diag-data (%d)\n",
+			__func__, phb->hub_id, data->type);
+	}
+}
+
+static int pnv_eeh_get_pe(struct pci_controller *hose,
+			  u16 pe_no, struct eeh_pe **pe)
+{
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pnv_pe;
+	struct eeh_pe *dev_pe;
+	struct eeh_dev edev;
+
+	/*
+	 * If PHB supports compound PE, to fetch
+	 * the master PE because slave PE is invisible
+	 * to EEH core.
+	 */
+	pnv_pe = &phb->ioda.pe_array[pe_no];
+	if (pnv_pe->flags & PNV_IODA_PE_SLAVE) {
+		pnv_pe = pnv_pe->master;
+		WARN_ON(!pnv_pe ||
+			!(pnv_pe->flags & PNV_IODA_PE_MASTER));
+		pe_no = pnv_pe->pe_number;
+	}
+
+	/* Find the PE according to PE# */
+	memset(&edev, 0, sizeof(struct eeh_dev));
+	edev.phb = hose;
+	edev.pe_config_addr = pe_no;
+	dev_pe = eeh_pe_get(&edev);
+	if (!dev_pe)
+		return -EEXIST;
+
+	/* Freeze the (compound) PE */
+	*pe = dev_pe;
+	if (!(dev_pe->state & EEH_PE_ISOLATED))
+		phb->freeze_pe(phb, pe_no);
+
+	/*
+	 * At this point, we're sure the (compound) PE should
+	 * have been frozen. However, we still need poke until
+	 * hitting the frozen PE on top level.
+	 */
+	dev_pe = dev_pe->parent;
+	while (dev_pe && !(dev_pe->type & EEH_PE_PHB)) {
+		int ret;
+		int active_flags = (EEH_STATE_MMIO_ACTIVE |
+				    EEH_STATE_DMA_ACTIVE);
+
+		ret = eeh_ops->get_state(dev_pe, NULL);
+		if (ret <= 0 || (ret & active_flags) == active_flags) {
+			dev_pe = dev_pe->parent;
+			continue;
+		}
+
+		/* Frozen parent PE */
+		*pe = dev_pe;
+		if (!(dev_pe->state & EEH_PE_ISOLATED))
+			phb->freeze_pe(phb, dev_pe->addr);
+
+		/* Next one */
+		dev_pe = dev_pe->parent;
+	}
+
+	return 0;
+}
+
+/**
+ * pnv_eeh_next_error - Retrieve next EEH error to handle
+ * @pe: Affected PE
+ *
+ * The function is expected to be called by EEH core while it gets
+ * special EEH event (without binding PE). The function calls to
+ * OPAL APIs for next error to handle. The informational error is
+ * handled internally by platform. However, the dead IOC, dead PHB,
+ * fenced PHB and frozen PE should be handled by EEH core eventually.
+ */
+static int pnv_eeh_next_error(struct eeh_pe **pe)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct eeh_pe *phb_pe, *parent_pe;
+	__be64 frozen_pe_no;
+	__be16 err_type, severity;
+	int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
+	long rc;
+	int state, ret = EEH_NEXT_ERR_NONE;
+
+	/*
+	 * While running here, it's safe to purge the event queue.
+	 * And we should keep the cached OPAL notifier event sychronized
+	 * between the kernel and firmware.
+	 */
+	eeh_remove_event(NULL, false);
+	opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
+
+	list_for_each_entry(hose, &hose_list, list_node) {
+		/*
+		 * If the subordinate PCI buses of the PHB has been
+		 * removed or is exactly under error recovery, we
+		 * needn't take care of it any more.
+		 */
+		phb = hose->private_data;
+		phb_pe = eeh_phb_pe_get(hose);
+		if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED))
+			continue;
+
+		rc = opal_pci_next_error(phb->opal_id,
+					 &frozen_pe_no, &err_type, &severity);
+		if (rc != OPAL_SUCCESS) {
+			pr_devel("%s: Invalid return value on "
+				 "PHB#%x (0x%lx) from opal_pci_next_error",
+				 __func__, hose->global_number, rc);
+			continue;
+		}
+
+		/* If the PHB doesn't have error, stop processing */
+		if (be16_to_cpu(err_type) == OPAL_EEH_NO_ERROR ||
+		    be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) {
+			pr_devel("%s: No error found on PHB#%x\n",
+				 __func__, hose->global_number);
+			continue;
+		}
+
+		/*
+		 * Processing the error. We're expecting the error with
+		 * highest priority reported upon multiple errors on the
+		 * specific PHB.
+		 */
+		pr_devel("%s: Error (%d, %d, %llu) on PHB#%x\n",
+			__func__, be16_to_cpu(err_type),
+			be16_to_cpu(severity), be64_to_cpu(frozen_pe_no),
+			hose->global_number);
+		switch (be16_to_cpu(err_type)) {
+		case OPAL_EEH_IOC_ERROR:
+			if (be16_to_cpu(severity) == OPAL_EEH_SEV_IOC_DEAD) {
+				pr_err("EEH: dead IOC detected\n");
+				ret = EEH_NEXT_ERR_DEAD_IOC;
+			} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
+				pr_info("EEH: IOC informative error "
+					"detected\n");
+				pnv_eeh_get_and_dump_hub_diag(hose);
+				ret = EEH_NEXT_ERR_NONE;
+			}
+
+			break;
+		case OPAL_EEH_PHB_ERROR:
+			if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) {
+				*pe = phb_pe;
+				pr_err("EEH: dead PHB#%x detected, "
+				       "location: %s\n",
+					hose->global_number,
+					eeh_pe_loc_get(phb_pe));
+				ret = EEH_NEXT_ERR_DEAD_PHB;
+			} else if (be16_to_cpu(severity) ==
+				   OPAL_EEH_SEV_PHB_FENCED) {
+				*pe = phb_pe;
+				pr_err("EEH: Fenced PHB#%x detected, "
+				       "location: %s\n",
+					hose->global_number,
+					eeh_pe_loc_get(phb_pe));
+				ret = EEH_NEXT_ERR_FENCED_PHB;
+			} else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) {
+				pr_info("EEH: PHB#%x informative error "
+					"detected, location: %s\n",
+					hose->global_number,
+					eeh_pe_loc_get(phb_pe));
+				pnv_eeh_get_phb_diag(phb_pe);
+				pnv_pci_dump_phb_diag_data(hose, phb_pe->data);
+				ret = EEH_NEXT_ERR_NONE;
+			}
+
+			break;
+		case OPAL_EEH_PE_ERROR:
+			/*
+			 * If we can't find the corresponding PE, we
+			 * just try to unfreeze.
+			 */
+			if (pnv_eeh_get_pe(hose,
+				be64_to_cpu(frozen_pe_no), pe)) {
+				/* Try best to clear it */
+				pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
+					hose->global_number, frozen_pe_no);
+				pr_info("EEH: PHB location: %s\n",
+					eeh_pe_loc_get(phb_pe));
+				opal_pci_eeh_freeze_clear(phb->opal_id,
+					frozen_pe_no,
+					OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+				ret = EEH_NEXT_ERR_NONE;
+			} else if ((*pe)->state & EEH_PE_ISOLATED ||
+				   eeh_pe_passed(*pe)) {
+				ret = EEH_NEXT_ERR_NONE;
+			} else {
+				pr_err("EEH: Frozen PE#%x "
+				       "on PHB#%x detected\n",
+				       (*pe)->addr,
+					(*pe)->phb->global_number);
+				pr_err("EEH: PE location: %s, "
+				       "PHB location: %s\n",
+				       eeh_pe_loc_get(*pe),
+				       eeh_pe_loc_get(phb_pe));
+				ret = EEH_NEXT_ERR_FROZEN_PE;
+			}
+
+			break;
+		default:
+			pr_warn("%s: Unexpected error type %d\n",
+				__func__, be16_to_cpu(err_type));
+		}
+
+		/*
+		 * EEH core will try recover from fenced PHB or
+		 * frozen PE. In the time for frozen PE, EEH core
+		 * enable IO path for that before collecting logs,
+		 * but it ruins the site. So we have to dump the
+		 * log in advance here.
+		 */
+		if ((ret == EEH_NEXT_ERR_FROZEN_PE  ||
+		    ret == EEH_NEXT_ERR_FENCED_PHB) &&
+		    !((*pe)->state & EEH_PE_ISOLATED)) {
+			eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+			pnv_eeh_get_phb_diag(*pe);
+
+			if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
+				pnv_pci_dump_phb_diag_data((*pe)->phb,
+							   (*pe)->data);
+		}
+
+		/*
+		 * We probably have the frozen parent PE out there and
+		 * we need have to handle frozen parent PE firstly.
+		 */
+		if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+			parent_pe = (*pe)->parent;
+			while (parent_pe) {
+				/* Hit the ceiling ? */
+				if (parent_pe->type & EEH_PE_PHB)
+					break;
+
+				/* Frozen parent PE ? */
+				state = eeh_ops->get_state(parent_pe, NULL);
+				if (state > 0 &&
+				    (state & active_flags) != active_flags)
+					*pe = parent_pe;
+
+				/* Next parent level */
+				parent_pe = parent_pe->parent;
+			}
+
+			/* We possibly migrate to another PE */
+			eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+		}
+
+		/*
+		 * If we have no errors on the specific PHB or only
+		 * informative error there, we continue poking it.
+		 * Otherwise, we need actions to be taken by upper
+		 * layer.
+		 */
+		if (ret > EEH_NEXT_ERR_INF)
+			break;
+	}
+
+	return ret;
+}
+
+static int pnv_eeh_restore_config(struct pci_dn *pdn)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	struct pnv_phb *phb;
+	s64 ret;
+
+	if (!edev)
+		return -EEXIST;
+
+	phb = edev->phb->private_data;
+	ret = opal_pci_reinit(phb->opal_id,
+			      OPAL_REINIT_PCI_DEV, edev->config_addr);
+	if (ret) {
+		pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
+			__func__, edev->config_addr, ret);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static struct eeh_ops pnv_eeh_ops = {
+	.name                   = "powernv",
+	.init                   = pnv_eeh_init,
+	.post_init              = pnv_eeh_post_init,
+	.probe			= pnv_eeh_probe,
+	.set_option             = pnv_eeh_set_option,
+	.get_pe_addr            = pnv_eeh_get_pe_addr,
+	.get_state              = pnv_eeh_get_state,
+	.reset                  = pnv_eeh_reset,
+	.wait_state             = pnv_eeh_wait_state,
+	.get_log                = pnv_eeh_get_log,
+	.configure_bridge       = pnv_eeh_configure_bridge,
+	.err_inject		= pnv_eeh_err_inject,
+	.read_config            = pnv_eeh_read_config,
+	.write_config           = pnv_eeh_write_config,
+	.next_error		= pnv_eeh_next_error,
+	.restore_config		= pnv_eeh_restore_config
+};
+
+/**
+ * eeh_powernv_init - Register platform dependent EEH operations
+ *
+ * EEH initialization on powernv platform. This function should be
+ * called before any EEH related functions.
+ */
+static int __init eeh_powernv_init(void)
+{
+	int ret = -EINVAL;
+
+	eeh_set_pe_aux_size(PNV_PCI_DIAG_BUF_SIZE);
+	ret = eeh_ops_register(&pnv_eeh_ops);
+	if (!ret)
+		pr_info("EEH: PowerNV platform initialized\n");
+	else
+		pr_info("EEH: Failed to initialize PowerNV platform (%d)\n", ret);
+
+	return ret;
+}
+machine_early_initcall(powernv, eeh_powernv_init);
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-async.c b/kernel/arch/powerpc/platforms/powernv/opal-async.c
new file mode 100644
index 000000000..693b6cdac
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-async.c
@@ -0,0 +1,208 @@
+/*
+ * PowerNV OPAL asynchronous completion interfaces
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+
+#define N_ASYNC_COMPLETIONS	64
+
+static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL};
+static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS);
+static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait);
+static DEFINE_SPINLOCK(opal_async_comp_lock);
+static struct semaphore opal_async_sem;
+static struct opal_msg *opal_async_responses;
+static unsigned int opal_max_async_tokens;
+
+int __opal_async_get_token(void)
+{
+	unsigned long flags;
+	int token;
+
+	spin_lock_irqsave(&opal_async_comp_lock, flags);
+	token = find_first_bit(opal_async_complete_map, opal_max_async_tokens);
+	if (token >= opal_max_async_tokens) {
+		token = -EBUSY;
+		goto out;
+	}
+
+	if (__test_and_set_bit(token, opal_async_token_map)) {
+		token = -EBUSY;
+		goto out;
+	}
+
+	__clear_bit(token, opal_async_complete_map);
+
+out:
+	spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+	return token;
+}
+
+int opal_async_get_token_interruptible(void)
+{
+	int token;
+
+	/* Wait until a token is available */
+	if (down_interruptible(&opal_async_sem))
+		return -ERESTARTSYS;
+
+	token = __opal_async_get_token();
+	if (token < 0)
+		up(&opal_async_sem);
+
+	return token;
+}
+EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible);
+
+int __opal_async_release_token(int token)
+{
+	unsigned long flags;
+
+	if (token < 0 || token >= opal_max_async_tokens) {
+		pr_err("%s: Passed token is out of range, token %d\n",
+				__func__, token);
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&opal_async_comp_lock, flags);
+	__set_bit(token, opal_async_complete_map);
+	__clear_bit(token, opal_async_token_map);
+	spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+
+	return 0;
+}
+
+int opal_async_release_token(int token)
+{
+	int ret;
+
+	ret = __opal_async_release_token(token);
+	if (ret)
+		return ret;
+
+	up(&opal_async_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(opal_async_release_token);
+
+int opal_async_wait_response(uint64_t token, struct opal_msg *msg)
+{
+	if (token >= opal_max_async_tokens) {
+		pr_err("%s: Invalid token passed\n", __func__);
+		return -EINVAL;
+	}
+
+	if (!msg) {
+		pr_err("%s: Invalid message pointer passed\n", __func__);
+		return -EINVAL;
+	}
+
+	wait_event(opal_async_wait, test_bit(token, opal_async_complete_map));
+	memcpy(msg, &opal_async_responses[token], sizeof(*msg));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(opal_async_wait_response);
+
+static int opal_async_comp_event(struct notifier_block *nb,
+		unsigned long msg_type, void *msg)
+{
+	struct opal_msg *comp_msg = msg;
+	unsigned long flags;
+	uint64_t token;
+
+	if (msg_type != OPAL_MSG_ASYNC_COMP)
+		return 0;
+
+	token = be64_to_cpu(comp_msg->params[0]);
+	memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg));
+	spin_lock_irqsave(&opal_async_comp_lock, flags);
+	__set_bit(token, opal_async_complete_map);
+	spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+
+	wake_up(&opal_async_wait);
+
+	return 0;
+}
+
+static struct notifier_block opal_async_comp_nb = {
+		.notifier_call	= opal_async_comp_event,
+		.next		= NULL,
+		.priority	= 0,
+};
+
+static int __init opal_async_comp_init(void)
+{
+	struct device_node *opal_node;
+	const __be32 *async;
+	int err;
+
+	opal_node = of_find_node_by_path("/ibm,opal");
+	if (!opal_node) {
+		pr_err("%s: Opal node not found\n", __func__);
+		err = -ENOENT;
+		goto out;
+	}
+
+	async = of_get_property(opal_node, "opal-msg-async-num", NULL);
+	if (!async) {
+		pr_err("%s: %s has no opal-msg-async-num\n",
+				__func__, opal_node->full_name);
+		err = -ENOENT;
+		goto out_opal_node;
+	}
+
+	opal_max_async_tokens = be32_to_cpup(async);
+	if (opal_max_async_tokens > N_ASYNC_COMPLETIONS)
+		opal_max_async_tokens = N_ASYNC_COMPLETIONS;
+
+	err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP,
+			&opal_async_comp_nb);
+	if (err) {
+		pr_err("%s: Can't register OPAL event notifier (%d)\n",
+				__func__, err);
+		goto out_opal_node;
+	}
+
+	opal_async_responses = kzalloc(
+			sizeof(*opal_async_responses) * opal_max_async_tokens,
+			GFP_KERNEL);
+	if (!opal_async_responses) {
+		pr_err("%s: Out of memory, failed to do asynchronous "
+				"completion init\n", __func__);
+		err = -ENOMEM;
+		goto out_opal_node;
+	}
+
+	/* Initialize to 1 less than the maximum tokens available, as we may
+	 * require to pop one during emergency through synchronous call to
+	 * __opal_async_get_token()
+	 */
+	sema_init(&opal_async_sem, opal_max_async_tokens - 1);
+
+out_opal_node:
+	of_node_put(opal_node);
+out:
+	return err;
+}
+machine_subsys_initcall(powernv, opal_async_comp_init);
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-dump.c b/kernel/arch/powerpc/platforms/powernv/opal-dump.c
new file mode 100644
index 000000000..5aa9c1ce4
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-dump.c
@@ -0,0 +1,457 @@
+/*
+ * PowerNV OPAL Dump Interface
+ *
+ * Copyright 2013,2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/delay.h>
+
+#include <asm/opal.h>
+
+#define DUMP_TYPE_FSP	0x01
+
+struct dump_obj {
+	struct kobject  kobj;
+	struct bin_attribute dump_attr;
+	uint32_t	id;  /* becomes object name */
+	uint32_t	type;
+	uint32_t	size;
+	char		*buffer;
+};
+#define to_dump_obj(x) container_of(x, struct dump_obj, kobj)
+
+struct dump_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct dump_obj *dump, struct dump_attribute *attr,
+			char *buf);
+	ssize_t (*store)(struct dump_obj *dump, struct dump_attribute *attr,
+			 const char *buf, size_t count);
+};
+#define to_dump_attr(x) container_of(x, struct dump_attribute, attr)
+
+static ssize_t dump_id_show(struct dump_obj *dump_obj,
+			    struct dump_attribute *attr,
+			    char *buf)
+{
+	return sprintf(buf, "0x%x\n", dump_obj->id);
+}
+
+static const char* dump_type_to_string(uint32_t type)
+{
+	switch (type) {
+	case 0x01: return "SP Dump";
+	case 0x02: return "System/Platform Dump";
+	case 0x03: return "SMA Dump";
+	default: return "unknown";
+	}
+}
+
+static ssize_t dump_type_show(struct dump_obj *dump_obj,
+			      struct dump_attribute *attr,
+			      char *buf)
+{
+	
+	return sprintf(buf, "0x%x %s\n", dump_obj->type,
+		       dump_type_to_string(dump_obj->type));
+}
+
+static ssize_t dump_ack_show(struct dump_obj *dump_obj,
+			     struct dump_attribute *attr,
+			     char *buf)
+{
+	return sprintf(buf, "ack - acknowledge dump\n");
+}
+
+/*
+ * Send acknowledgement to OPAL
+ */
+static int64_t dump_send_ack(uint32_t dump_id)
+{
+	int rc;
+
+	rc = opal_dump_ack(dump_id);
+	if (rc)
+		pr_warn("%s: Failed to send ack to Dump ID 0x%x (%d)\n",
+			__func__, dump_id, rc);
+	return rc;
+}
+
+static ssize_t dump_ack_store(struct dump_obj *dump_obj,
+			      struct dump_attribute *attr,
+			      const char *buf,
+			      size_t count)
+{
+	dump_send_ack(dump_obj->id);
+	sysfs_remove_file_self(&dump_obj->kobj, &attr->attr);
+	kobject_put(&dump_obj->kobj);
+	return count;
+}
+
+/* Attributes of a dump
+ * The binary attribute of the dump itself is dynamic
+ * due to the dynamic size of the dump
+ */
+static struct dump_attribute id_attribute =
+	__ATTR(id, S_IRUGO, dump_id_show, NULL);
+static struct dump_attribute type_attribute =
+	__ATTR(type, S_IRUGO, dump_type_show, NULL);
+static struct dump_attribute ack_attribute =
+	__ATTR(acknowledge, 0660, dump_ack_show, dump_ack_store);
+
+static ssize_t init_dump_show(struct dump_obj *dump_obj,
+			      struct dump_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "1 - initiate Service Processor(FSP) dump\n");
+}
+
+static int64_t dump_fips_init(uint8_t type)
+{
+	int rc;
+
+	rc = opal_dump_init(type);
+	if (rc)
+		pr_warn("%s: Failed to initiate FSP dump (%d)\n",
+			__func__, rc);
+	return rc;
+}
+
+static ssize_t init_dump_store(struct dump_obj *dump_obj,
+			       struct dump_attribute *attr,
+			       const char *buf,
+			       size_t count)
+{
+	int rc;
+
+	rc = dump_fips_init(DUMP_TYPE_FSP);
+	if (rc == OPAL_SUCCESS)
+		pr_info("%s: Initiated FSP dump\n", __func__);
+
+	return count;
+}
+
+static struct dump_attribute initiate_attribute =
+	__ATTR(initiate_dump, 0600, init_dump_show, init_dump_store);
+
+static struct attribute *initiate_attrs[] = {
+	&initiate_attribute.attr,
+	NULL,
+};
+
+static struct attribute_group initiate_attr_group = {
+	.attrs = initiate_attrs,
+};
+
+static struct kset *dump_kset;
+
+static ssize_t dump_attr_show(struct kobject *kobj,
+			      struct attribute *attr,
+			      char *buf)
+{
+	struct dump_attribute *attribute;
+	struct dump_obj *dump;
+
+	attribute = to_dump_attr(attr);
+	dump = to_dump_obj(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	return attribute->show(dump, attribute, buf);
+}
+
+static ssize_t dump_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct dump_attribute *attribute;
+	struct dump_obj *dump;
+
+	attribute = to_dump_attr(attr);
+	dump = to_dump_obj(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	return attribute->store(dump, attribute, buf, len);
+}
+
+static const struct sysfs_ops dump_sysfs_ops = {
+	.show = dump_attr_show,
+	.store = dump_attr_store,
+};
+
+static void dump_release(struct kobject *kobj)
+{
+	struct dump_obj *dump;
+
+	dump = to_dump_obj(kobj);
+	vfree(dump->buffer);
+	kfree(dump);
+}
+
+static struct attribute *dump_default_attrs[] = {
+	&id_attribute.attr,
+	&type_attribute.attr,
+	&ack_attribute.attr,
+	NULL,
+};
+
+static struct kobj_type dump_ktype = {
+	.sysfs_ops = &dump_sysfs_ops,
+	.release = &dump_release,
+	.default_attrs = dump_default_attrs,
+};
+
+static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type)
+{
+	__be32 id, size, type;
+	int rc;
+
+	type = cpu_to_be32(0xffffffff);
+
+	rc = opal_dump_info2(&id, &size, &type);
+	if (rc == OPAL_PARAMETER)
+		rc = opal_dump_info(&id, &size);
+
+	*dump_id = be32_to_cpu(id);
+	*dump_size = be32_to_cpu(size);
+	*dump_type = be32_to_cpu(type);
+
+	if (rc)
+		pr_warn("%s: Failed to get dump info (%d)\n",
+			__func__, rc);
+	return rc;
+}
+
+static int64_t dump_read_data(struct dump_obj *dump)
+{
+	struct opal_sg_list *list;
+	uint64_t addr;
+	int64_t rc;
+
+	/* Allocate memory */
+	dump->buffer = vzalloc(PAGE_ALIGN(dump->size));
+	if (!dump->buffer) {
+		pr_err("%s : Failed to allocate memory\n", __func__);
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Generate SG list */
+	list = opal_vmalloc_to_sg_list(dump->buffer, dump->size);
+	if (!list) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* First entry address */
+	addr = __pa(list);
+
+	/* Fetch data */
+	rc = OPAL_BUSY_EVENT;
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_dump_read(dump->id, addr);
+		if (rc == OPAL_BUSY_EVENT) {
+			opal_poll_events(NULL);
+			msleep(20);
+		}
+	}
+
+	if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL)
+		pr_warn("%s: Extract dump failed for ID 0x%x\n",
+			__func__, dump->id);
+
+	/* Free SG list */
+	opal_free_sg_list(list);
+
+out:
+	return rc;
+}
+
+static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
+			      struct bin_attribute *bin_attr,
+			      char *buffer, loff_t pos, size_t count)
+{
+	ssize_t rc;
+
+	struct dump_obj *dump = to_dump_obj(kobj);
+
+	if (!dump->buffer) {
+		rc = dump_read_data(dump);
+
+		if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) {
+			vfree(dump->buffer);
+			dump->buffer = NULL;
+
+			return -EIO;
+		}
+		if (rc == OPAL_PARTIAL) {
+			/* On a partial read, we just return EIO
+			 * and rely on userspace to ask us to try
+			 * again.
+			 */
+			pr_info("%s: Platform dump partially read. ID = 0x%x\n",
+				__func__, dump->id);
+			return -EIO;
+		}
+	}
+
+	memcpy(buffer, dump->buffer + pos, count);
+
+	/* You may think we could free the dump buffer now and retrieve
+	 * it again later if needed, but due to current firmware limitation,
+	 * that's not the case. So, once read into userspace once,
+	 * we keep the dump around until it's acknowledged by userspace.
+	 */
+
+	return count;
+}
+
+static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
+					uint32_t type)
+{
+	struct dump_obj *dump;
+	int rc;
+
+	dump = kzalloc(sizeof(*dump), GFP_KERNEL);
+	if (!dump)
+		return NULL;
+
+	dump->kobj.kset = dump_kset;
+
+	kobject_init(&dump->kobj, &dump_ktype);
+
+	sysfs_bin_attr_init(&dump->dump_attr);
+
+	dump->dump_attr.attr.name = "dump";
+	dump->dump_attr.attr.mode = 0400;
+	dump->dump_attr.size = size;
+	dump->dump_attr.read = dump_attr_read;
+
+	dump->id = id;
+	dump->size = size;
+	dump->type = type;
+
+	rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
+	if (rc) {
+		kobject_put(&dump->kobj);
+		return NULL;
+	}
+
+	rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
+	if (rc) {
+		kobject_put(&dump->kobj);
+		return NULL;
+	}
+
+	pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
+		__func__, dump->id, dump->size);
+
+	kobject_uevent(&dump->kobj, KOBJ_ADD);
+
+	return dump;
+}
+
+static int process_dump(void)
+{
+	int rc;
+	uint32_t dump_id, dump_size, dump_type;
+	struct dump_obj *dump;
+	char name[22];
+
+	rc = dump_read_info(&dump_id, &dump_size, &dump_type);
+	if (rc != OPAL_SUCCESS)
+		return rc;
+
+	sprintf(name, "0x%x-0x%x", dump_type, dump_id);
+
+	/* we may get notified twice, let's handle
+	 * that gracefully and not create two conflicting
+	 * entries.
+	 */
+	if (kset_find_obj(dump_kset, name))
+		return 0;
+
+	dump = create_dump_obj(dump_id, dump_size, dump_type);
+	if (!dump)
+		return -1;
+
+	return 0;
+}
+
+static void dump_work_fn(struct work_struct *work)
+{
+	process_dump();
+}
+
+static DECLARE_WORK(dump_work, dump_work_fn);
+
+static void schedule_process_dump(void)
+{
+	schedule_work(&dump_work);
+}
+
+/*
+ * New dump available notification
+ *
+ * Once we get notification, we add sysfs entries for it.
+ * We only fetch the dump on demand, and create sysfs asynchronously.
+ */
+static int dump_event(struct notifier_block *nb,
+		      unsigned long events, void *change)
+{
+	if (events & OPAL_EVENT_DUMP_AVAIL)
+		schedule_process_dump();
+
+	return 0;
+}
+
+static struct notifier_block dump_nb = {
+	.notifier_call  = dump_event,
+	.next           = NULL,
+	.priority       = 0
+};
+
+void __init opal_platform_dump_init(void)
+{
+	int rc;
+
+	/* ELOG not supported by firmware */
+	if (!opal_check_token(OPAL_DUMP_READ))
+		return;
+
+	dump_kset = kset_create_and_add("dump", NULL, opal_kobj);
+	if (!dump_kset) {
+		pr_warn("%s: Failed to create dump kset\n", __func__);
+		return;
+	}
+
+	rc = sysfs_create_group(&dump_kset->kobj, &initiate_attr_group);
+	if (rc) {
+		pr_warn("%s: Failed to create initiate dump attr group\n",
+			__func__);
+		kobject_put(&dump_kset->kobj);
+		return;
+	}
+
+	rc = opal_notifier_register(&dump_nb);
+	if (rc) {
+		pr_warn("%s: Can't register OPAL event notifier (%d)\n",
+			__func__, rc);
+		return;
+	}
+
+	if (opal_check_token(OPAL_DUMP_RESEND))
+		opal_dump_resend_notification();
+}
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-elog.c b/kernel/arch/powerpc/platforms/powernv/opal-elog.c
new file mode 100644
index 000000000..38ce757e5
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-elog.c
@@ -0,0 +1,320 @@
+/*
+ * Error log support on PowerNV.
+ *
+ * Copyright 2013,2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/fcntl.h>
+#include <linux/kobject.h>
+#include <asm/uaccess.h>
+#include <asm/opal.h>
+
+struct elog_obj {
+	struct kobject kobj;
+	struct bin_attribute raw_attr;
+	uint64_t id;
+	uint64_t type;
+	size_t size;
+	char *buffer;
+};
+#define to_elog_obj(x) container_of(x, struct elog_obj, kobj)
+
+struct elog_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct elog_obj *elog, struct elog_attribute *attr,
+			char *buf);
+	ssize_t (*store)(struct elog_obj *elog, struct elog_attribute *attr,
+			 const char *buf, size_t count);
+};
+#define to_elog_attr(x) container_of(x, struct elog_attribute, attr)
+
+static ssize_t elog_id_show(struct elog_obj *elog_obj,
+			    struct elog_attribute *attr,
+			    char *buf)
+{
+	return sprintf(buf, "0x%llx\n", elog_obj->id);
+}
+
+static const char *elog_type_to_string(uint64_t type)
+{
+	switch (type) {
+	case 0: return "PEL";
+	default: return "unknown";
+	}
+}
+
+static ssize_t elog_type_show(struct elog_obj *elog_obj,
+			      struct elog_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "0x%llx %s\n",
+		       elog_obj->type,
+		       elog_type_to_string(elog_obj->type));
+}
+
+static ssize_t elog_ack_show(struct elog_obj *elog_obj,
+			     struct elog_attribute *attr,
+			     char *buf)
+{
+	return sprintf(buf, "ack - acknowledge log message\n");
+}
+
+static ssize_t elog_ack_store(struct elog_obj *elog_obj,
+			      struct elog_attribute *attr,
+			      const char *buf,
+			      size_t count)
+{
+	opal_send_ack_elog(elog_obj->id);
+	sysfs_remove_file_self(&elog_obj->kobj, &attr->attr);
+	kobject_put(&elog_obj->kobj);
+	return count;
+}
+
+static struct elog_attribute id_attribute =
+	__ATTR(id, S_IRUGO, elog_id_show, NULL);
+static struct elog_attribute type_attribute =
+	__ATTR(type, S_IRUGO, elog_type_show, NULL);
+static struct elog_attribute ack_attribute =
+	__ATTR(acknowledge, 0660, elog_ack_show, elog_ack_store);
+
+static struct kset *elog_kset;
+
+static ssize_t elog_attr_show(struct kobject *kobj,
+			      struct attribute *attr,
+			      char *buf)
+{
+	struct elog_attribute *attribute;
+	struct elog_obj *elog;
+
+	attribute = to_elog_attr(attr);
+	elog = to_elog_obj(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	return attribute->show(elog, attribute, buf);
+}
+
+static ssize_t elog_attr_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct elog_attribute *attribute;
+	struct elog_obj *elog;
+
+	attribute = to_elog_attr(attr);
+	elog = to_elog_obj(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	return attribute->store(elog, attribute, buf, len);
+}
+
+static const struct sysfs_ops elog_sysfs_ops = {
+	.show = elog_attr_show,
+	.store = elog_attr_store,
+};
+
+static void elog_release(struct kobject *kobj)
+{
+	struct elog_obj *elog;
+
+	elog = to_elog_obj(kobj);
+	kfree(elog->buffer);
+	kfree(elog);
+}
+
+static struct attribute *elog_default_attrs[] = {
+	&id_attribute.attr,
+	&type_attribute.attr,
+	&ack_attribute.attr,
+	NULL,
+};
+
+static struct kobj_type elog_ktype = {
+	.sysfs_ops = &elog_sysfs_ops,
+	.release = &elog_release,
+	.default_attrs = elog_default_attrs,
+};
+
+/* Maximum size of a single log on FSP is 16KB */
+#define OPAL_MAX_ERRLOG_SIZE	16384
+
+static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
+			     struct bin_attribute *bin_attr,
+			     char *buffer, loff_t pos, size_t count)
+{
+	int opal_rc;
+
+	struct elog_obj *elog = to_elog_obj(kobj);
+
+	/* We may have had an error reading before, so let's retry */
+	if (!elog->buffer) {
+		elog->buffer = kzalloc(elog->size, GFP_KERNEL);
+		if (!elog->buffer)
+			return -EIO;
+
+		opal_rc = opal_read_elog(__pa(elog->buffer),
+					 elog->size, elog->id);
+		if (opal_rc != OPAL_SUCCESS) {
+			pr_err("ELOG: log read failed for log-id=%llx\n",
+			       elog->id);
+			kfree(elog->buffer);
+			elog->buffer = NULL;
+			return -EIO;
+		}
+	}
+
+	memcpy(buffer, elog->buffer + pos, count);
+
+	return count;
+}
+
+static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
+{
+	struct elog_obj *elog;
+	int rc;
+
+	elog = kzalloc(sizeof(*elog), GFP_KERNEL);
+	if (!elog)
+		return NULL;
+
+	elog->kobj.kset = elog_kset;
+
+	kobject_init(&elog->kobj, &elog_ktype);
+
+	sysfs_bin_attr_init(&elog->raw_attr);
+
+	elog->raw_attr.attr.name = "raw";
+	elog->raw_attr.attr.mode = 0400;
+	elog->raw_attr.size = size;
+	elog->raw_attr.read = raw_attr_read;
+
+	elog->id = id;
+	elog->size = size;
+	elog->type = type;
+
+	elog->buffer = kzalloc(elog->size, GFP_KERNEL);
+
+	if (elog->buffer) {
+		rc = opal_read_elog(__pa(elog->buffer),
+					 elog->size, elog->id);
+		if (rc != OPAL_SUCCESS) {
+			pr_err("ELOG: log read failed for log-id=%llx\n",
+			       elog->id);
+			kfree(elog->buffer);
+			elog->buffer = NULL;
+		}
+	}
+
+	rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
+	if (rc) {
+		kobject_put(&elog->kobj);
+		return NULL;
+	}
+
+	rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
+	if (rc) {
+		kobject_put(&elog->kobj);
+		return NULL;
+	}
+
+	kobject_uevent(&elog->kobj, KOBJ_ADD);
+
+	return elog;
+}
+
+static void elog_work_fn(struct work_struct *work)
+{
+	__be64 size;
+	__be64 id;
+	__be64 type;
+	uint64_t elog_size;
+	uint64_t log_id;
+	uint64_t elog_type;
+	int rc;
+	char name[2+16+1];
+
+	rc = opal_get_elog_size(&id, &size, &type);
+	if (rc != OPAL_SUCCESS) {
+		pr_err("ELOG: OPAL log info read failed\n");
+		return;
+	}
+
+	elog_size = be64_to_cpu(size);
+	log_id = be64_to_cpu(id);
+	elog_type = be64_to_cpu(type);
+
+	WARN_ON(elog_size > OPAL_MAX_ERRLOG_SIZE);
+
+	if (elog_size >= OPAL_MAX_ERRLOG_SIZE)
+		elog_size  =  OPAL_MAX_ERRLOG_SIZE;
+
+	sprintf(name, "0x%llx", log_id);
+
+	/* we may get notified twice, let's handle
+	 * that gracefully and not create two conflicting
+	 * entries.
+	 */
+	if (kset_find_obj(elog_kset, name))
+		return;
+
+	create_elog_obj(log_id, elog_size, elog_type);
+}
+
+static DECLARE_WORK(elog_work, elog_work_fn);
+
+static int elog_event(struct notifier_block *nb,
+				unsigned long events, void *change)
+{
+	/* check for error log event */
+	if (events & OPAL_EVENT_ERROR_LOG_AVAIL)
+		schedule_work(&elog_work);
+	return 0;
+}
+
+static struct notifier_block elog_nb = {
+	.notifier_call  = elog_event,
+	.next           = NULL,
+	.priority       = 0
+};
+
+int __init opal_elog_init(void)
+{
+	int rc = 0;
+
+	/* ELOG not supported by firmware */
+	if (!opal_check_token(OPAL_ELOG_READ))
+		return -1;
+
+	elog_kset = kset_create_and_add("elog", NULL, opal_kobj);
+	if (!elog_kset) {
+		pr_warn("%s: failed to create elog kset\n", __func__);
+		return -1;
+	}
+
+	rc = opal_notifier_register(&elog_nb);
+	if (rc) {
+		pr_err("%s: Can't register OPAL event notifier (%d)\n",
+		__func__, rc);
+		return rc;
+	}
+
+	/* We are now ready to pull error logs from opal. */
+	if (opal_check_token(OPAL_ELOG_RESEND))
+		opal_resend_pending_logs();
+
+	return 0;
+}
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-flash.c b/kernel/arch/powerpc/platforms/powernv/opal-flash.c
new file mode 100644
index 000000000..4ec621928
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-flash.c
@@ -0,0 +1,592 @@
+/*
+ * PowerNV OPAL Firmware Update Interface
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define DEBUG
+
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/delay.h>
+
+#include <asm/opal.h>
+
+/* FLASH status codes */
+#define FLASH_NO_OP		-1099	/* No operation initiated by user */
+#define FLASH_NO_AUTH		-9002	/* Not a service authority partition */
+
+/* Validate image status values */
+#define VALIDATE_IMG_READY	-1001	/* Image ready for validation */
+#define VALIDATE_IMG_INCOMPLETE	-1002	/* User copied < VALIDATE_BUF_SIZE */
+
+/* Manage image status values */
+#define MANAGE_ACTIVE_ERR	-9001	/* Cannot overwrite active img */
+
+/* Flash image status values */
+#define FLASH_IMG_READY		0	/* Img ready for flash on reboot */
+#define FLASH_INVALID_IMG	-1003	/* Flash image shorter than expected */
+#define FLASH_IMG_NULL_DATA	-1004	/* Bad data in sg list entry */
+#define FLASH_IMG_BAD_LEN	-1005	/* Bad length in sg list entry */
+
+/* Manage operation tokens */
+#define FLASH_REJECT_TMP_SIDE	0	/* Reject temporary fw image */
+#define FLASH_COMMIT_TMP_SIDE	1	/* Commit temporary fw image */
+
+/* Update tokens */
+#define FLASH_UPDATE_CANCEL	0	/* Cancel update request */
+#define FLASH_UPDATE_INIT	1	/* Initiate update */
+
+/* Validate image update result tokens */
+#define VALIDATE_TMP_UPDATE	0     /* T side will be updated */
+#define VALIDATE_FLASH_AUTH	1     /* Partition does not have authority */
+#define VALIDATE_INVALID_IMG	2     /* Candidate image is not valid */
+#define VALIDATE_CUR_UNKNOWN	3     /* Current fixpack level is unknown */
+/*
+ * Current T side will be committed to P side before being replace with new
+ * image, and the new image is downlevel from current image
+ */
+#define VALIDATE_TMP_COMMIT_DL	4
+/*
+ * Current T side will be committed to P side before being replaced with new
+ * image
+ */
+#define VALIDATE_TMP_COMMIT	5
+/*
+ * T side will be updated with a downlevel image
+ */
+#define VALIDATE_TMP_UPDATE_DL	6
+/*
+ * The candidate image's release date is later than the system's firmware
+ * service entitlement date - service warranty period has expired
+ */
+#define VALIDATE_OUT_OF_WRNTY	7
+
+/* Validate buffer size */
+#define VALIDATE_BUF_SIZE	4096
+
+/* XXX: Assume candidate image size is <= 1GB */
+#define MAX_IMAGE_SIZE	0x40000000
+
+/* Image status */
+enum {
+	IMAGE_INVALID,
+	IMAGE_LOADING,
+	IMAGE_READY,
+};
+
+/* Candidate image data */
+struct image_data_t {
+	int		status;
+	void		*data;
+	uint32_t	size;
+};
+
+/* Candidate image header */
+struct image_header_t {
+	uint16_t	magic;
+	uint16_t	version;
+	uint32_t	size;
+};
+
+struct validate_flash_t {
+	int		status;		/* Return status */
+	void		*buf;		/* Candidate image buffer */
+	uint32_t	buf_size;	/* Image size */
+	uint32_t	result;		/* Update results token */
+};
+
+struct manage_flash_t {
+	int status;		/* Return status */
+};
+
+struct update_flash_t {
+	int status;		/* Return status */
+};
+
+static struct image_header_t	image_header;
+static struct image_data_t	image_data;
+static struct validate_flash_t	validate_flash_data;
+static struct manage_flash_t	manage_flash_data;
+
+/* Initialize update_flash_data status to No Operation */
+static struct update_flash_t	update_flash_data = {
+	.status = FLASH_NO_OP,
+};
+
+static DEFINE_MUTEX(image_data_mutex);
+
+/*
+ * Validate candidate image
+ */
+static inline void opal_flash_validate(void)
+{
+	long ret;
+	void *buf = validate_flash_data.buf;
+	__be32 size = cpu_to_be32(validate_flash_data.buf_size);
+	__be32 result;
+
+	ret = opal_validate_flash(__pa(buf), &size, &result);
+
+	validate_flash_data.status = ret;
+	validate_flash_data.buf_size = be32_to_cpu(size);
+	validate_flash_data.result = be32_to_cpu(result);
+}
+
+/*
+ * Validate output format:
+ *     validate result token
+ *     current image version details
+ *     new image version details
+ */
+static ssize_t validate_show(struct kobject *kobj,
+			     struct kobj_attribute *attr, char *buf)
+{
+	struct validate_flash_t *args_buf = &validate_flash_data;
+	int len;
+
+	/* Candidate image is not validated */
+	if (args_buf->status < VALIDATE_TMP_UPDATE) {
+		len = sprintf(buf, "%d\n", args_buf->status);
+		goto out;
+	}
+
+	/* Result token */
+	len = sprintf(buf, "%d\n", args_buf->result);
+
+	/* Current and candidate image version details */
+	if ((args_buf->result != VALIDATE_TMP_UPDATE) &&
+	    (args_buf->result < VALIDATE_CUR_UNKNOWN))
+		goto out;
+
+	if (args_buf->buf_size > (VALIDATE_BUF_SIZE - len)) {
+		memcpy(buf + len, args_buf->buf, VALIDATE_BUF_SIZE - len);
+		len = VALIDATE_BUF_SIZE;
+	} else {
+		memcpy(buf + len, args_buf->buf, args_buf->buf_size);
+		len += args_buf->buf_size;
+	}
+out:
+	/* Set status to default */
+	args_buf->status = FLASH_NO_OP;
+	return len;
+}
+
+/*
+ * Validate candidate firmware image
+ *
+ * Note:
+ *   We are only interested in first 4K bytes of the
+ *   candidate image.
+ */
+static ssize_t validate_store(struct kobject *kobj,
+			      struct kobj_attribute *attr,
+			      const char *buf, size_t count)
+{
+	struct validate_flash_t *args_buf = &validate_flash_data;
+
+	if (buf[0] != '1')
+		return -EINVAL;
+
+	mutex_lock(&image_data_mutex);
+
+	if (image_data.status != IMAGE_READY ||
+	    image_data.size < VALIDATE_BUF_SIZE) {
+		args_buf->result = VALIDATE_INVALID_IMG;
+		args_buf->status = VALIDATE_IMG_INCOMPLETE;
+		goto out;
+	}
+
+	/* Copy first 4k bytes of candidate image */
+	memcpy(args_buf->buf, image_data.data, VALIDATE_BUF_SIZE);
+
+	args_buf->status = VALIDATE_IMG_READY;
+	args_buf->buf_size = VALIDATE_BUF_SIZE;
+
+	/* Validate candidate image */
+	opal_flash_validate();
+
+out:
+	mutex_unlock(&image_data_mutex);
+	return count;
+}
+
+/*
+ * Manage flash routine
+ */
+static inline void opal_flash_manage(uint8_t op)
+{
+	struct manage_flash_t *const args_buf = &manage_flash_data;
+
+	args_buf->status = opal_manage_flash(op);
+}
+
+/*
+ * Show manage flash status
+ */
+static ssize_t manage_show(struct kobject *kobj,
+			   struct kobj_attribute *attr, char *buf)
+{
+	struct manage_flash_t *const args_buf = &manage_flash_data;
+	int rc;
+
+	rc = sprintf(buf, "%d\n", args_buf->status);
+	/* Set status to default*/
+	args_buf->status = FLASH_NO_OP;
+	return rc;
+}
+
+/*
+ * Manage operations:
+ *   0 - Reject
+ *   1 - Commit
+ */
+static ssize_t manage_store(struct kobject *kobj,
+			    struct kobj_attribute *attr,
+			    const char *buf, size_t count)
+{
+	uint8_t op;
+	switch (buf[0]) {
+	case '0':
+		op = FLASH_REJECT_TMP_SIDE;
+		break;
+	case '1':
+		op = FLASH_COMMIT_TMP_SIDE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* commit/reject temporary image */
+	opal_flash_manage(op);
+	return count;
+}
+
+/*
+ * OPAL update flash
+ */
+static int opal_flash_update(int op)
+{
+	struct opal_sg_list *list;
+	unsigned long addr;
+	int64_t rc = OPAL_PARAMETER;
+
+	if (op == FLASH_UPDATE_CANCEL) {
+		pr_alert("FLASH: Image update cancelled\n");
+		addr = '\0';
+		goto flash;
+	}
+
+	list = opal_vmalloc_to_sg_list(image_data.data, image_data.size);
+	if (!list)
+		goto invalid_img;
+
+	/* First entry address */
+	addr = __pa(list);
+
+flash:
+	rc = opal_update_flash(addr);
+
+invalid_img:
+	return rc;
+}
+
+/* Return CPUs to OPAL before starting FW update */
+static void flash_return_cpu(void *info)
+{
+	int cpu = smp_processor_id();
+
+	if (!cpu_online(cpu))
+		return;
+
+	/* Disable IRQ */
+	hard_irq_disable();
+
+	/* Return the CPU to OPAL */
+	opal_return_cpu();
+}
+
+/* This gets called just before system reboots */
+void opal_flash_term_callback(void)
+{
+	struct cpumask mask;
+
+	if (update_flash_data.status != FLASH_IMG_READY)
+		return;
+
+	pr_alert("FLASH: Flashing new firmware\n");
+	pr_alert("FLASH: Image is %u bytes\n", image_data.size);
+	pr_alert("FLASH: Performing flash and reboot/shutdown\n");
+	pr_alert("FLASH: This will take several minutes. Do not power off!\n");
+
+	/* Small delay to help getting the above message out */
+	msleep(500);
+
+	/* Return secondary CPUs to firmware */
+	cpumask_copy(&mask, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), &mask);
+	if (!cpumask_empty(&mask))
+		smp_call_function_many(&mask,
+				       flash_return_cpu, NULL, false);
+	/* Hard disable interrupts */
+	hard_irq_disable();
+}
+
+/*
+ * Show candidate image status
+ */
+static ssize_t update_show(struct kobject *kobj,
+			   struct kobj_attribute *attr, char *buf)
+{
+	struct update_flash_t *const args_buf = &update_flash_data;
+	return sprintf(buf, "%d\n", args_buf->status);
+}
+
+/*
+ * Set update image flag
+ *  1 - Flash new image
+ *  0 - Cancel flash request
+ */
+static ssize_t update_store(struct kobject *kobj,
+			    struct kobj_attribute *attr,
+			    const char *buf, size_t count)
+{
+	struct update_flash_t *const args_buf = &update_flash_data;
+	int rc = count;
+
+	mutex_lock(&image_data_mutex);
+
+	switch (buf[0]) {
+	case '0':
+		if (args_buf->status == FLASH_IMG_READY)
+			opal_flash_update(FLASH_UPDATE_CANCEL);
+		args_buf->status = FLASH_NO_OP;
+		break;
+	case '1':
+		/* Image is loaded? */
+		if (image_data.status == IMAGE_READY)
+			args_buf->status =
+				opal_flash_update(FLASH_UPDATE_INIT);
+		else
+			args_buf->status = FLASH_INVALID_IMG;
+		break;
+	default:
+		rc = -EINVAL;
+	}
+
+	mutex_unlock(&image_data_mutex);
+	return rc;
+}
+
+/*
+ * Free image buffer
+ */
+static void free_image_buf(void)
+{
+	void *addr;
+	int size;
+
+	addr = image_data.data;
+	size = PAGE_ALIGN(image_data.size);
+	while (size > 0) {
+		ClearPageReserved(vmalloc_to_page(addr));
+		addr += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	}
+	vfree(image_data.data);
+	image_data.data = NULL;
+	image_data.status = IMAGE_INVALID;
+}
+
+/*
+ * Allocate image buffer.
+ */
+static int alloc_image_buf(char *buffer, size_t count)
+{
+	void *addr;
+	int size;
+
+	if (count < sizeof(struct image_header_t)) {
+		pr_warn("FLASH: Invalid candidate image\n");
+		return -EINVAL;
+	}
+
+	memcpy(&image_header, (void *)buffer, sizeof(struct image_header_t));
+	image_data.size = be32_to_cpu(image_header.size);
+	pr_debug("FLASH: Candidate image size = %u\n", image_data.size);
+
+	if (image_data.size > MAX_IMAGE_SIZE) {
+		pr_warn("FLASH: Too large image\n");
+		return -EINVAL;
+	}
+	if (image_data.size < VALIDATE_BUF_SIZE) {
+		pr_warn("FLASH: Image is shorter than expected\n");
+		return -EINVAL;
+	}
+
+	image_data.data = vzalloc(PAGE_ALIGN(image_data.size));
+	if (!image_data.data) {
+		pr_err("%s : Failed to allocate memory\n", __func__);
+		return -ENOMEM;
+	}
+
+	/* Pin memory */
+	addr = image_data.data;
+	size = PAGE_ALIGN(image_data.size);
+	while (size > 0) {
+		SetPageReserved(vmalloc_to_page(addr));
+		addr += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	}
+
+	image_data.status = IMAGE_LOADING;
+	return 0;
+}
+
+/*
+ * Copy candidate image
+ *
+ * Parse candidate image header to get total image size
+ * and pre-allocate required memory.
+ */
+static ssize_t image_data_write(struct file *filp, struct kobject *kobj,
+				struct bin_attribute *bin_attr,
+				char *buffer, loff_t pos, size_t count)
+{
+	int rc;
+
+	mutex_lock(&image_data_mutex);
+
+	/* New image ? */
+	if (pos == 0) {
+		/* Free memory, if already allocated */
+		if (image_data.data)
+			free_image_buf();
+
+		/* Cancel outstanding image update request */
+		if (update_flash_data.status == FLASH_IMG_READY)
+			opal_flash_update(FLASH_UPDATE_CANCEL);
+
+		/* Allocate memory */
+		rc = alloc_image_buf(buffer, count);
+		if (rc)
+			goto out;
+	}
+
+	if (image_data.status != IMAGE_LOADING) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	if ((pos + count) > image_data.size) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	memcpy(image_data.data + pos, (void *)buffer, count);
+	rc = count;
+
+	/* Set image status */
+	if ((pos + count) == image_data.size) {
+		pr_debug("FLASH: Candidate image loaded....\n");
+		image_data.status = IMAGE_READY;
+	}
+
+out:
+	mutex_unlock(&image_data_mutex);
+	return rc;
+}
+
+/*
+ * sysfs interface :
+ *  OPAL uses below sysfs files for code update.
+ *  We create these files under /sys/firmware/opal.
+ *
+ *   image		: Interface to load candidate firmware image
+ *   validate_flash	: Validate firmware image
+ *   manage_flash	: Commit/Reject firmware image
+ *   update_flash	: Flash new firmware image
+ *
+ */
+static struct bin_attribute image_data_attr = {
+	.attr = {.name = "image", .mode = 0200},
+	.size = MAX_IMAGE_SIZE,	/* Limit image size */
+	.write = image_data_write,
+};
+
+static struct kobj_attribute validate_attribute =
+	__ATTR(validate_flash, 0600, validate_show, validate_store);
+
+static struct kobj_attribute manage_attribute =
+	__ATTR(manage_flash, 0600, manage_show, manage_store);
+
+static struct kobj_attribute update_attribute =
+	__ATTR(update_flash, 0600, update_show, update_store);
+
+static struct attribute *image_op_attrs[] = {
+	&validate_attribute.attr,
+	&manage_attribute.attr,
+	&update_attribute.attr,
+	NULL	/* need to NULL terminate the list of attributes */
+};
+
+static struct attribute_group image_op_attr_group = {
+	.attrs = image_op_attrs,
+};
+
+void __init opal_flash_update_init(void)
+{
+	int ret;
+
+	/* Allocate validate image buffer */
+	validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL);
+	if (!validate_flash_data.buf) {
+		pr_err("%s : Failed to allocate memory\n", __func__);
+		return;
+	}
+
+	/* Make sure /sys/firmware/opal directory is created */
+	if (!opal_kobj) {
+		pr_warn("FLASH: opal kobject is not available\n");
+		goto nokobj;
+	}
+
+	/* Create the sysfs files */
+	ret = sysfs_create_group(opal_kobj, &image_op_attr_group);
+	if (ret) {
+		pr_warn("FLASH: Failed to create sysfs files\n");
+		goto nokobj;
+	}
+
+	ret = sysfs_create_bin_file(opal_kobj, &image_data_attr);
+	if (ret) {
+		pr_warn("FLASH: Failed to create sysfs files\n");
+		goto nosysfs_file;
+	}
+
+	/* Set default status */
+	validate_flash_data.status = FLASH_NO_OP;
+	manage_flash_data.status = FLASH_NO_OP;
+	update_flash_data.status = FLASH_NO_OP;
+	image_data.status = IMAGE_INVALID;
+	return;
+
+nosysfs_file:
+	sysfs_remove_group(opal_kobj, &image_op_attr_group);
+
+nokobj:
+	kfree(validate_flash_data.buf);
+	return;
+}
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-hmi.c b/kernel/arch/powerpc/platforms/powernv/opal-hmi.c
new file mode 100644
index 000000000..b322bfb51
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -0,0 +1,189 @@
+/*
+ * OPAL hypervisor Maintenance interrupt handling support in PowreNV.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Copyright 2014 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/opal.h>
+#include <asm/cputable.h>
+#include <asm/machdep.h>
+
+static int opal_hmi_handler_nb_init;
+struct OpalHmiEvtNode {
+	struct list_head list;
+	struct OpalHMIEvent hmi_evt;
+};
+static LIST_HEAD(opal_hmi_evt_list);
+static DEFINE_SPINLOCK(opal_hmi_evt_lock);
+
+static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
+{
+	const char *level, *sevstr, *error_info;
+	static const char *hmi_error_types[] = {
+		"Malfunction Alert",
+		"Processor Recovery done",
+		"Processor recovery occurred again",
+		"Processor recovery occurred for masked error",
+		"Timer facility experienced an error",
+		"TFMR SPR is corrupted",
+		"UPS (Uniterrupted Power System) Overflow indication",
+		"An XSCOM operation failure",
+		"An XSCOM operation completed",
+		"SCOM has set a reserved FIR bit to cause recovery",
+		"Debug trigger has set a reserved FIR bit to cause recovery",
+		"A hypervisor resource error occurred"
+	};
+
+	/* Print things out */
+	if (hmi_evt->version < OpalHMIEvt_V1) {
+		pr_err("HMI Interrupt, Unknown event version %d !\n",
+			hmi_evt->version);
+		return;
+	}
+	switch (hmi_evt->severity) {
+	case OpalHMI_SEV_NO_ERROR:
+		level = KERN_INFO;
+		sevstr = "Harmless";
+		break;
+	case OpalHMI_SEV_WARNING:
+		level = KERN_WARNING;
+		sevstr = "";
+		break;
+	case OpalHMI_SEV_ERROR_SYNC:
+		level = KERN_ERR;
+		sevstr = "Severe";
+		break;
+	case OpalHMI_SEV_FATAL:
+	default:
+		level = KERN_ERR;
+		sevstr = "Fatal";
+		break;
+	}
+
+	printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
+		level, sevstr,
+		hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
+		"Recovered" : "Not recovered");
+	error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
+			hmi_error_types[hmi_evt->type]
+			: "Unknown";
+	printk("%s Error detail: %s\n", level, error_info);
+	printk("%s	HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
+	if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
+		(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
+		printk("%s	TFMR: %016llx\n", level,
+						be64_to_cpu(hmi_evt->tfmr));
+}
+
+static void hmi_event_handler(struct work_struct *work)
+{
+	unsigned long flags;
+	struct OpalHMIEvent *hmi_evt;
+	struct OpalHmiEvtNode *msg_node;
+	uint8_t disposition;
+
+	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
+	while (!list_empty(&opal_hmi_evt_list)) {
+		msg_node = list_entry(opal_hmi_evt_list.next,
+					   struct OpalHmiEvtNode, list);
+		list_del(&msg_node->list);
+		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
+
+		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
+		print_hmi_event_info(hmi_evt);
+		disposition = hmi_evt->disposition;
+		kfree(msg_node);
+
+		/*
+		 * Check if HMI event has been recovered or not. If not
+		 * then we can't continue, invoke panic.
+		 */
+		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
+			panic("Unrecoverable HMI exception");
+
+		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
+	}
+	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
+}
+
+static DECLARE_WORK(hmi_event_work, hmi_event_handler);
+/*
+ * opal_handle_hmi_event - notifier handler that queues up HMI events
+ * to be preocessed later.
+ */
+static int opal_handle_hmi_event(struct notifier_block *nb,
+			  unsigned long msg_type, void *msg)
+{
+	unsigned long flags;
+	struct OpalHMIEvent *hmi_evt;
+	struct opal_msg *hmi_msg = msg;
+	struct OpalHmiEvtNode *msg_node;
+
+	/* Sanity Checks */
+	if (msg_type != OPAL_MSG_HMI_EVT)
+		return 0;
+
+	/* HMI event info starts from param[0] */
+	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
+
+	/* Delay the logging of HMI events to workqueue. */
+	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
+	if (!msg_node) {
+		pr_err("HMI: out of memory, Opal message event not handled\n");
+		return -ENOMEM;
+	}
+	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent));
+
+	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
+	list_add(&msg_node->list, &opal_hmi_evt_list);
+	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
+
+	schedule_work(&hmi_event_work);
+	return 0;
+}
+
+static struct notifier_block opal_hmi_handler_nb = {
+	.notifier_call	= opal_handle_hmi_event,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int __init opal_hmi_handler_init(void)
+{
+	int ret;
+
+	if (!opal_hmi_handler_nb_init) {
+		ret = opal_message_notifier_register(
+				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
+		if (ret) {
+			pr_err("%s: Can't register OPAL event notifier (%d)\n",
+			       __func__, ret);
+			return ret;
+		}
+		opal_hmi_handler_nb_init = 1;
+	}
+	return 0;
+}
+machine_subsys_initcall(powernv, opal_hmi_handler_init);
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-lpc.c b/kernel/arch/powerpc/platforms/powernv/opal-lpc.c
new file mode 100644
index 000000000..e4169d68c
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -0,0 +1,414 @@
+/*
+ * PowerNV LPC bus handling.
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/bug.h>
+#include <linux/debugfs.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/xics.h>
+#include <asm/opal.h>
+#include <asm/prom.h>
+#include <asm/uaccess.h>
+#include <asm/debug.h>
+
+static int opal_lpc_chip_id = -1;
+
+static u8 opal_lpc_inb(unsigned long port)
+{
+	int64_t rc;
+	__be32 data;
+
+	if (opal_lpc_chip_id < 0 || port > 0xffff)
+		return 0xff;
+	rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 1);
+	return rc ? 0xff : be32_to_cpu(data);
+}
+
+static __le16 __opal_lpc_inw(unsigned long port)
+{
+	int64_t rc;
+	__be32 data;
+
+	if (opal_lpc_chip_id < 0 || port > 0xfffe)
+		return 0xffff;
+	if (port & 1)
+		return (__le16)opal_lpc_inb(port) << 8 | opal_lpc_inb(port + 1);
+	rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 2);
+	return rc ? 0xffff : be32_to_cpu(data);
+}
+static u16 opal_lpc_inw(unsigned long port)
+{
+	return le16_to_cpu(__opal_lpc_inw(port));
+}
+
+static __le32 __opal_lpc_inl(unsigned long port)
+{
+	int64_t rc;
+	__be32 data;
+
+	if (opal_lpc_chip_id < 0 || port > 0xfffc)
+		return 0xffffffff;
+	if (port & 3)
+		return (__le32)opal_lpc_inb(port    ) << 24 |
+		       (__le32)opal_lpc_inb(port + 1) << 16 |
+		       (__le32)opal_lpc_inb(port + 2) <<  8 |
+			       opal_lpc_inb(port + 3);
+	rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 4);
+	return rc ? 0xffffffff : be32_to_cpu(data);
+}
+
+static u32 opal_lpc_inl(unsigned long port)
+{
+	return le32_to_cpu(__opal_lpc_inl(port));
+}
+
+static void opal_lpc_outb(u8 val, unsigned long port)
+{
+	if (opal_lpc_chip_id < 0 || port > 0xffff)
+		return;
+	opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 1);
+}
+
+static void __opal_lpc_outw(__le16 val, unsigned long port)
+{
+	if (opal_lpc_chip_id < 0 || port > 0xfffe)
+		return;
+	if (port & 1) {
+		opal_lpc_outb(val >> 8, port);
+		opal_lpc_outb(val     , port + 1);
+		return;
+	}
+	opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 2);
+}
+
+static void opal_lpc_outw(u16 val, unsigned long port)
+{
+	__opal_lpc_outw(cpu_to_le16(val), port);
+}
+
+static void __opal_lpc_outl(__le32 val, unsigned long port)
+{
+	if (opal_lpc_chip_id < 0 || port > 0xfffc)
+		return;
+	if (port & 3) {
+		opal_lpc_outb(val >> 24, port);
+		opal_lpc_outb(val >> 16, port + 1);
+		opal_lpc_outb(val >>  8, port + 2);
+		opal_lpc_outb(val      , port + 3);
+		return;
+	}
+	opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 4);
+}
+
+static void opal_lpc_outl(u32 val, unsigned long port)
+{
+	__opal_lpc_outl(cpu_to_le32(val), port);
+}
+
+static void opal_lpc_insb(unsigned long p, void *b, unsigned long c)
+{
+	u8 *ptr = b;
+
+	while(c--)
+		*(ptr++) = opal_lpc_inb(p);
+}
+
+static void opal_lpc_insw(unsigned long p, void *b, unsigned long c)
+{
+	__le16 *ptr = b;
+
+	while(c--)
+		*(ptr++) = __opal_lpc_inw(p);
+}
+
+static void opal_lpc_insl(unsigned long p, void *b, unsigned long c)
+{
+	__le32 *ptr = b;
+
+	while(c--)
+		*(ptr++) = __opal_lpc_inl(p);
+}
+
+static void opal_lpc_outsb(unsigned long p, const void *b, unsigned long c)
+{
+	const u8 *ptr = b;
+
+	while(c--)
+		opal_lpc_outb(*(ptr++), p);
+}
+
+static void opal_lpc_outsw(unsigned long p, const void *b, unsigned long c)
+{
+	const __le16 *ptr = b;
+
+	while(c--)
+		__opal_lpc_outw(*(ptr++), p);
+}
+
+static void opal_lpc_outsl(unsigned long p, const void *b, unsigned long c)
+{
+	const __le32 *ptr = b;
+
+	while(c--)
+		__opal_lpc_outl(*(ptr++), p);
+}
+
+static const struct ppc_pci_io opal_lpc_io = {
+	.inb	= opal_lpc_inb,
+	.inw	= opal_lpc_inw,
+	.inl	= opal_lpc_inl,
+	.outb	= opal_lpc_outb,
+	.outw	= opal_lpc_outw,
+	.outl	= opal_lpc_outl,
+	.insb	= opal_lpc_insb,
+	.insw	= opal_lpc_insw,
+	.insl	= opal_lpc_insl,
+	.outsb	= opal_lpc_outsb,
+	.outsw	= opal_lpc_outsw,
+	.outsl	= opal_lpc_outsl,
+};
+
+#ifdef CONFIG_DEBUG_FS
+struct lpc_debugfs_entry {
+	enum OpalLPCAddressType lpc_type;
+};
+
+static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf,
+			      size_t count, loff_t *ppos)
+{
+	struct lpc_debugfs_entry *lpc = filp->private_data;
+	u32 data, pos, len, todo;
+	int rc;
+
+	if (!access_ok(VERIFY_WRITE, ubuf, count))
+		return -EFAULT;
+
+	todo = count;
+	while (todo) {
+		pos = *ppos;
+
+		/*
+		 * Select access size based on count and alignment and
+		 * access type. IO and MEM only support byte acceses,
+		 * FW supports all 3.
+		 */
+		len = 1;
+		if (lpc->lpc_type == OPAL_LPC_FW) {
+			if (todo > 3 && (pos & 3) == 0)
+				len = 4;
+			else if (todo > 1 && (pos & 1) == 0)
+				len = 2;
+		}
+		rc = opal_lpc_read(opal_lpc_chip_id, lpc->lpc_type, pos,
+				   &data, len);
+		if (rc)
+			return -ENXIO;
+
+		/*
+		 * Now there is some trickery with the data returned by OPAL
+		 * as it's the desired data right justified in a 32-bit BE
+		 * word.
+		 *
+		 * This is a very bad interface and I'm to blame for it :-(
+		 *
+		 * So we can't just apply a 32-bit swap to what comes from OPAL,
+		 * because user space expects the *bytes* to be in their proper
+		 * respective positions (ie, LPC position).
+		 *
+		 * So what we really want to do here is to shift data right
+		 * appropriately on a LE kernel.
+		 *
+		 * IE. If the LPC transaction has bytes B0, B1, B2 and B3 in that
+		 * order, we have in memory written to by OPAL at the "data"
+		 * pointer:
+		 *
+		 *               Bytes:      OPAL "data"   LE "data"
+		 *   32-bit:   B0 B1 B2 B3   B0B1B2B3      B3B2B1B0
+		 *   16-bit:   B0 B1         0000B0B1      B1B00000
+		 *    8-bit:   B0            000000B0      B0000000
+		 *
+		 * So a BE kernel will have the leftmost of the above in the MSB
+		 * and rightmost in the LSB and can just then "cast" the u32 "data"
+		 * down to the appropriate quantity and write it.
+		 *
+		 * However, an LE kernel can't. It doesn't need to swap because a
+		 * load from data followed by a store to user are going to preserve
+		 * the byte ordering which is the wire byte order which is what the
+		 * user wants, but in order to "crop" to the right size, we need to
+		 * shift right first.
+		 */
+		switch(len) {
+		case 4:
+			rc = __put_user((u32)data, (u32 __user *)ubuf);
+			break;
+		case 2:
+#ifdef __LITTLE_ENDIAN__
+			data >>= 16;
+#endif
+			rc = __put_user((u16)data, (u16 __user *)ubuf);
+			break;
+		default:
+#ifdef __LITTLE_ENDIAN__
+			data >>= 24;
+#endif
+			rc = __put_user((u8)data, (u8 __user *)ubuf);
+			break;
+		}
+		if (rc)
+			return -EFAULT;
+		*ppos += len;
+		ubuf += len;
+		todo -= len;
+	}
+
+	return count;
+}
+
+static ssize_t lpc_debug_write(struct file *filp, const char __user *ubuf,
+			       size_t count, loff_t *ppos)
+{
+	struct lpc_debugfs_entry *lpc = filp->private_data;
+	u32 data, pos, len, todo;
+	int rc;
+
+	if (!access_ok(VERIFY_READ, ubuf, count))
+		return -EFAULT;
+
+	todo = count;
+	while (todo) {
+		pos = *ppos;
+
+		/*
+		 * Select access size based on count and alignment and
+		 * access type. IO and MEM only support byte acceses,
+		 * FW supports all 3.
+		 */
+		len = 1;
+		if (lpc->lpc_type == OPAL_LPC_FW) {
+			if (todo > 3 && (pos & 3) == 0)
+				len = 4;
+			else if (todo > 1 && (pos & 1) == 0)
+				len = 2;
+		}
+
+		/*
+		 * Similarly to the read case, we have some trickery here but
+		 * it's different to handle. We need to pass the value to OPAL in
+		 * a register whose layout depends on the access size. We want
+		 * to reproduce the memory layout of the user, however we aren't
+		 * doing a load from user and a store to another memory location
+		 * which would achieve that. Here we pass the value to OPAL via
+		 * a register which is expected to contain the "BE" interpretation
+		 * of the byte sequence. IE: for a 32-bit access, byte 0 should be
+		 * in the MSB. So here we *do* need to byteswap on LE.
+		 *
+		 *           User bytes:    LE "data"  OPAL "data"
+		 *  32-bit:  B0 B1 B2 B3    B3B2B1B0   B0B1B2B3
+		 *  16-bit:  B0 B1          0000B1B0   0000B0B1
+		 *   8-bit:  B0             000000B0   000000B0
+		 */
+		switch(len) {
+		case 4:
+			rc = __get_user(data, (u32 __user *)ubuf);
+			data = cpu_to_be32(data);
+			break;
+		case 2:
+			rc = __get_user(data, (u16 __user *)ubuf);
+			data = cpu_to_be16(data);
+			break;
+		default:
+			rc = __get_user(data, (u8 __user *)ubuf);
+			break;
+		}
+		if (rc)
+			return -EFAULT;
+
+		rc = opal_lpc_write(opal_lpc_chip_id, lpc->lpc_type, pos,
+				    data, len);
+		if (rc)
+			return -ENXIO;
+		*ppos += len;
+		ubuf += len;
+		todo -= len;
+	}
+
+	return count;
+}
+
+static const struct file_operations lpc_fops = {
+	.read =		lpc_debug_read,
+	.write =	lpc_debug_write,
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
+static int opal_lpc_debugfs_create_type(struct dentry *folder,
+					const char *fname,
+					enum OpalLPCAddressType type)
+{
+	struct lpc_debugfs_entry *entry;
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+	entry->lpc_type = type;
+	debugfs_create_file(fname, 0600, folder, entry, &lpc_fops);
+	return 0;
+}
+
+static int opal_lpc_init_debugfs(void)
+{
+	struct dentry *root;
+	int rc = 0;
+
+	if (opal_lpc_chip_id < 0)
+		return -ENODEV;
+
+	root = debugfs_create_dir("lpc", powerpc_debugfs_root);
+
+	rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO);
+	rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM);
+	rc |= opal_lpc_debugfs_create_type(root, "fw", OPAL_LPC_FW);
+	return rc;
+}
+machine_device_initcall(powernv, opal_lpc_init_debugfs);
+#endif  /* CONFIG_DEBUG_FS */
+
+void opal_lpc_init(void)
+{
+	struct device_node *np;
+
+	/*
+	 * Look for a Power8 LPC bus tagged as "primary",
+	 * we currently support only one though the OPAL APIs
+	 * support any number.
+	 */
+	for_each_compatible_node(np, NULL, "ibm,power8-lpc") {
+		if (!of_device_is_available(np))
+			continue;
+		if (!of_get_property(np, "primary", NULL))
+			continue;
+		opal_lpc_chip_id = of_get_ibm_chip_id(np);
+		break;
+	}
+	if (opal_lpc_chip_id < 0)
+		return;
+
+	/* Setup special IO ops */
+	ppc_pci_io = opal_lpc_io;
+	isa_io_special = true;
+
+	pr_info("OPAL: Power8 LPC bus found, chip ID %d\n", opal_lpc_chip_id);
+}
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-memory-errors.c b/kernel/arch/powerpc/platforms/powernv/opal-memory-errors.c
new file mode 100644
index 000000000..43db2136d
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -0,0 +1,147 @@
+/*
+ * OPAL asynchronus Memory error handling support in PowreNV.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/cputable.h>
+
+static int opal_mem_err_nb_init;
+static LIST_HEAD(opal_memory_err_list);
+static DEFINE_SPINLOCK(opal_mem_err_lock);
+
+struct OpalMsgNode {
+	struct list_head list;
+	struct opal_msg msg;
+};
+
+static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt)
+{
+	uint64_t paddr_start, paddr_end;
+
+	pr_debug("%s: Retrived memory error event, type: 0x%x\n",
+		  __func__, merr_evt->type);
+	switch (merr_evt->type) {
+	case OPAL_MEM_ERR_TYPE_RESILIENCE:
+		paddr_start = be64_to_cpu(merr_evt->u.resilience.physical_address_start);
+		paddr_end = be64_to_cpu(merr_evt->u.resilience.physical_address_end);
+		break;
+	case OPAL_MEM_ERR_TYPE_DYN_DALLOC:
+		paddr_start = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start);
+		paddr_end = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end);
+		break;
+	default:
+		return;
+	}
+
+	for (; paddr_start < paddr_end; paddr_start += PAGE_SIZE) {
+		memory_failure(paddr_start >> PAGE_SHIFT, 0, 0);
+	}
+}
+
+static void handle_memory_error(void)
+{
+	unsigned long flags;
+	struct OpalMemoryErrorData *merr_evt;
+	struct OpalMsgNode *msg_node;
+
+	spin_lock_irqsave(&opal_mem_err_lock, flags);
+	while (!list_empty(&opal_memory_err_list)) {
+		 msg_node = list_entry(opal_memory_err_list.next,
+					   struct OpalMsgNode, list);
+		list_del(&msg_node->list);
+		spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+
+		merr_evt = (struct OpalMemoryErrorData *)
+					&msg_node->msg.params[0];
+		handle_memory_error_event(merr_evt);
+		kfree(msg_node);
+		spin_lock_irqsave(&opal_mem_err_lock, flags);
+	}
+	spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+}
+
+static void mem_error_handler(struct work_struct *work)
+{
+	handle_memory_error();
+}
+
+static DECLARE_WORK(mem_error_work, mem_error_handler);
+
+/*
+ * opal_memory_err_event - notifier handler that queues up the opal message
+ * to be preocessed later.
+ */
+static int opal_memory_err_event(struct notifier_block *nb,
+			  unsigned long msg_type, void *msg)
+{
+	unsigned long flags;
+	struct OpalMsgNode *msg_node;
+
+	if (msg_type != OPAL_MSG_MEM_ERR)
+		return 0;
+
+	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
+	if (!msg_node) {
+		pr_err("MEMORY_ERROR: out of memory, Opal message event not"
+		       "handled\n");
+		return -ENOMEM;
+	}
+	memcpy(&msg_node->msg, msg, sizeof(struct opal_msg));
+
+	spin_lock_irqsave(&opal_mem_err_lock, flags);
+	list_add(&msg_node->list, &opal_memory_err_list);
+	spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+
+	schedule_work(&mem_error_work);
+	return 0;
+}
+
+static struct notifier_block opal_mem_err_nb = {
+	.notifier_call	= opal_memory_err_event,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int __init opal_mem_err_init(void)
+{
+	int ret;
+
+	if (!opal_mem_err_nb_init) {
+		ret = opal_message_notifier_register(
+					OPAL_MSG_MEM_ERR, &opal_mem_err_nb);
+		if (ret) {
+			pr_err("%s: Can't register OPAL event notifier (%d)\n",
+			       __func__, ret);
+			return ret;
+		}
+		opal_mem_err_nb_init = 1;
+	}
+	return 0;
+}
+machine_subsys_initcall(powernv, opal_mem_err_init);
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-msglog.c b/kernel/arch/powerpc/platforms/powernv/opal-msglog.c
new file mode 100644
index 000000000..44ed78af1
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -0,0 +1,124 @@
+/*
+ * PowerNV OPAL in-memory console interface
+ *
+ * Copyright 2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/io.h>
+#include <asm/opal.h>
+#include <linux/debugfs.h>
+#include <linux/of.h>
+#include <linux/types.h>
+#include <asm/barrier.h>
+
+/* OPAL in-memory console. Defined in OPAL source at core/console.c */
+struct memcons {
+	__be64 magic;
+#define MEMCONS_MAGIC	0x6630696567726173L
+	__be64 obuf_phys;
+	__be64 ibuf_phys;
+	__be32 obuf_size;
+	__be32 ibuf_size;
+	__be32 out_pos;
+#define MEMCONS_OUT_POS_WRAP	0x80000000u
+#define MEMCONS_OUT_POS_MASK	0x00ffffffu
+	__be32 in_prod;
+	__be32 in_cons;
+};
+
+static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
+				struct bin_attribute *bin_attr, char *to,
+				loff_t pos, size_t count)
+{
+	struct memcons *mc = bin_attr->private;
+	const char *conbuf;
+	ssize_t ret;
+	size_t first_read = 0;
+	uint32_t out_pos, avail;
+
+	if (!mc)
+		return -ENODEV;
+
+	out_pos = be32_to_cpu(ACCESS_ONCE(mc->out_pos));
+
+	/* Now we've read out_pos, put a barrier in before reading the new
+	 * data it points to in conbuf. */
+	smp_rmb();
+
+	conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys));
+
+	/* When the buffer has wrapped, read from the out_pos marker to the end
+	 * of the buffer, and then read the remaining data as in the un-wrapped
+	 * case. */
+	if (out_pos & MEMCONS_OUT_POS_WRAP) {
+
+		out_pos &= MEMCONS_OUT_POS_MASK;
+		avail = be32_to_cpu(mc->obuf_size) - out_pos;
+
+		ret = memory_read_from_buffer(to, count, &pos,
+				conbuf + out_pos, avail);
+
+		if (ret < 0)
+			goto out;
+
+		first_read = ret;
+		to += first_read;
+		count -= first_read;
+		pos -= avail;
+
+		if (count <= 0)
+			goto out;
+	}
+
+	/* Sanity check. The firmware should not do this to us. */
+	if (out_pos > be32_to_cpu(mc->obuf_size)) {
+		pr_err("OPAL: memory console corruption. Aborting read.\n");
+		return -EINVAL;
+	}
+
+	ret = memory_read_from_buffer(to, count, &pos, conbuf, out_pos);
+
+	if (ret < 0)
+		goto out;
+
+	ret += first_read;
+out:
+	return ret;
+}
+
+static struct bin_attribute opal_msglog_attr = {
+	.attr = {.name = "msglog", .mode = 0444},
+	.read = opal_msglog_read
+};
+
+void __init opal_msglog_init(void)
+{
+	u64 mcaddr;
+	struct memcons *mc;
+
+	if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) {
+		pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n");
+		return;
+	}
+
+	mc = phys_to_virt(mcaddr);
+	if (!mc) {
+		pr_warn("OPAL: memory console address is invalid\n");
+		return;
+	}
+
+	if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) {
+		pr_warn("OPAL: memory console version is invalid\n");
+		return;
+	}
+
+	opal_msglog_attr.private = mc;
+
+	if (sysfs_create_bin_file(opal_kobj, &opal_msglog_attr) != 0)
+		pr_warn("OPAL: sysfs file creation failed\n");
+}
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-nvram.c b/kernel/arch/powerpc/platforms/powernv/opal-nvram.c
new file mode 100644
index 000000000..9db4398de
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-nvram.c
@@ -0,0 +1,98 @@
+/*
+ * PowerNV nvram code.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+
+#include <asm/opal.h>
+#include <asm/nvram.h>
+#include <asm/machdep.h>
+
+static unsigned int nvram_size;
+
+static ssize_t opal_nvram_size(void)
+{
+	return nvram_size;
+}
+
+static ssize_t opal_nvram_read(char *buf, size_t count, loff_t *index)
+{
+	s64 rc;
+	int off;
+
+	if (*index >= nvram_size)
+		return 0;
+	off = *index;
+	if ((off + count) > nvram_size)
+		count = nvram_size - off;
+	rc = opal_read_nvram(__pa(buf), count, off);
+	if (rc != OPAL_SUCCESS)
+		return -EIO;
+	*index += count;
+	return count;
+}
+
+static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index)
+{
+	s64 rc = OPAL_BUSY;
+	int off;
+
+	if (*index >= nvram_size)
+		return 0;
+	off = *index;
+	if ((off + count) > nvram_size)
+		count = nvram_size - off;
+
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_write_nvram(__pa(buf), count, off);
+		if (rc == OPAL_BUSY_EVENT)
+			opal_poll_events(NULL);
+	}
+	*index += count;
+	return count;
+}
+
+static int __init opal_nvram_init_log_partitions(void)
+{
+	/* Scan nvram for partitions */
+	nvram_scan_partitions();
+	nvram_init_oops_partition(0);
+	return 0;
+}
+machine_arch_initcall(powernv, opal_nvram_init_log_partitions);
+
+void __init opal_nvram_init(void)
+{
+	struct device_node *np;
+	const __be32 *nbytes_p;
+
+	np = of_find_compatible_node(NULL, NULL, "ibm,opal-nvram");
+	if (np == NULL)
+		return;
+
+	nbytes_p = of_get_property(np, "#bytes", NULL);
+	if (!nbytes_p) {
+		of_node_put(np);
+		return;
+	}
+	nvram_size = be32_to_cpup(nbytes_p);
+
+	pr_info("OPAL nvram setup, %u bytes\n", nvram_size);
+	of_node_put(np);
+
+	ppc_md.nvram_read = opal_nvram_read;
+	ppc_md.nvram_write = opal_nvram_write;
+	ppc_md.nvram_size = opal_nvram_size;
+}
+
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-power.c b/kernel/arch/powerpc/platforms/powernv/opal-power.c
new file mode 100644
index 000000000..ac46c2c24
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-power.c
@@ -0,0 +1,66 @@
+/*
+ * PowerNV OPAL power control for graceful shutdown handling
+ *
+ * Copyright 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+
+#include <asm/opal.h>
+#include <asm/machdep.h>
+
+#define SOFT_OFF 0x00
+#define SOFT_REBOOT 0x01
+
+static int opal_power_control_event(struct notifier_block *nb,
+				    unsigned long msg_type, void *msg)
+{
+	struct opal_msg *power_msg = msg;
+	uint64_t type;
+
+	type = be64_to_cpu(power_msg->params[0]);
+
+	switch (type) {
+	case SOFT_REBOOT:
+		pr_info("OPAL: reboot requested\n");
+		orderly_reboot();
+		break;
+	case SOFT_OFF:
+		pr_info("OPAL: poweroff requested\n");
+		orderly_poweroff(true);
+		break;
+	default:
+		pr_err("OPAL: power control type unexpected %016llx\n", type);
+	}
+
+	return 0;
+}
+
+static struct notifier_block opal_power_control_nb = {
+	.notifier_call	= opal_power_control_event,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int __init opal_power_control_init(void)
+{
+	int ret;
+
+	ret = opal_message_notifier_register(OPAL_MSG_SHUTDOWN,
+					     &opal_power_control_nb);
+	if (ret) {
+		pr_err("%s: Can't register OPAL event notifier (%d)\n",
+				__func__, ret);
+		return ret;
+	}
+
+	return 0;
+}
+machine_subsys_initcall(powernv, opal_power_control_init);
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-rtc.c b/kernel/arch/powerpc/platforms/powernv/opal-rtc.c
new file mode 100644
index 000000000..37dbee157
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-rtc.c
@@ -0,0 +1,87 @@
+/*
+ * PowerNV Real Time Clock.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/time.h>
+#include <linux/bcd.h>
+#include <linux/rtc.h>
+#include <linux/delay.h>
+#include <linux/platform_device.h>
+#include <linux/of_platform.h>
+
+#include <asm/opal.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+
+static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm)
+{
+	tm->tm_year	= ((bcd2bin(y_m_d >> 24) * 100) +
+			   bcd2bin((y_m_d >> 16) & 0xff)) - 1900;
+	tm->tm_mon	= bcd2bin((y_m_d >> 8) & 0xff) - 1;
+	tm->tm_mday	= bcd2bin(y_m_d & 0xff);
+	tm->tm_hour	= bcd2bin((h_m_s_ms >> 56) & 0xff);
+	tm->tm_min	= bcd2bin((h_m_s_ms >> 48) & 0xff);
+	tm->tm_sec	= bcd2bin((h_m_s_ms >> 40) & 0xff);
+
+        GregorianDay(tm);
+}
+
+unsigned long __init opal_get_boot_time(void)
+{
+	struct rtc_time tm;
+	u32 y_m_d;
+	u64 h_m_s_ms;
+	__be32 __y_m_d;
+	__be64 __h_m_s_ms;
+	long rc = OPAL_BUSY;
+
+	if (!opal_check_token(OPAL_RTC_READ))
+		return 0;
+
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms);
+		if (rc == OPAL_BUSY_EVENT)
+			opal_poll_events(NULL);
+		else
+			mdelay(10);
+	}
+	if (rc != OPAL_SUCCESS)
+		return 0;
+
+	y_m_d = be32_to_cpu(__y_m_d);
+	h_m_s_ms = be64_to_cpu(__h_m_s_ms);
+	opal_to_tm(y_m_d, h_m_s_ms, &tm);
+	return mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+		      tm.tm_hour, tm.tm_min, tm.tm_sec);
+}
+
+static __init int opal_time_init(void)
+{
+	struct platform_device *pdev;
+	struct device_node *rtc;
+
+	rtc = of_find_node_by_path("/ibm,opal/rtc");
+	if (rtc) {
+		pdev = of_platform_device_create(rtc, "opal-rtc", NULL);
+		of_node_put(rtc);
+	} else {
+		if (opal_check_token(OPAL_RTC_READ) ||
+		    opal_check_token(OPAL_READ_TPO))
+			pdev = platform_device_register_simple("opal-rtc", -1,
+							       NULL, 0);
+		else
+			return -ENODEV;
+	}
+
+	return PTR_ERR_OR_ZERO(pdev);
+}
+machine_subsys_initcall(powernv, opal_time_init);
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-sensor.c b/kernel/arch/powerpc/platforms/powernv/opal-sensor.c
new file mode 100644
index 000000000..655250499
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -0,0 +1,96 @@
+/*
+ * PowerNV sensor code
+ *
+ * Copyright (C) 2013 IBM
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/of_platform.h>
+#include <asm/opal.h>
+#include <asm/machdep.h>
+
+static DEFINE_MUTEX(opal_sensor_mutex);
+
+/*
+ * This will return sensor information to driver based on the requested sensor
+ * handle. A handle is an opaque id for the powernv, read by the driver from the
+ * device tree..
+ */
+int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
+{
+	int ret, token;
+	struct opal_msg msg;
+	__be32 data;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0) {
+		pr_err("%s: Couldn't get the token, returning\n", __func__);
+		ret = token;
+		goto out;
+	}
+
+	mutex_lock(&opal_sensor_mutex);
+	ret = opal_sensor_read(sensor_hndl, token, &data);
+	switch (ret) {
+	case OPAL_ASYNC_COMPLETION:
+		ret = opal_async_wait_response(token, &msg);
+		if (ret) {
+			pr_err("%s: Failed to wait for the async response, %d\n",
+			       __func__, ret);
+			goto out_token;
+		}
+
+		ret = opal_error_code(be64_to_cpu(msg.params[1]));
+		*sensor_data = be32_to_cpu(data);
+		break;
+
+	case OPAL_SUCCESS:
+		ret = 0;
+		*sensor_data = be32_to_cpu(data);
+		break;
+
+	default:
+		ret = opal_error_code(ret);
+		break;
+	}
+
+out_token:
+	mutex_unlock(&opal_sensor_mutex);
+	opal_async_release_token(token);
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(opal_get_sensor_data);
+
+static __init int opal_sensor_init(void)
+{
+	struct platform_device *pdev;
+	struct device_node *sensor;
+
+	sensor = of_find_node_by_path("/ibm,opal/sensors");
+	if (!sensor) {
+		pr_err("Opal node 'sensors' not found\n");
+		return -ENODEV;
+	}
+
+	pdev = of_platform_device_create(sensor, "opal-sensor", NULL);
+	of_node_put(sensor);
+
+	return PTR_ERR_OR_ZERO(pdev);
+}
+machine_subsys_initcall(powernv, opal_sensor_init);
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-sysparam.c b/kernel/arch/powerpc/platforms/powernv/opal-sysparam.c
new file mode 100644
index 000000000..9d1acf22a
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-sysparam.c
@@ -0,0 +1,304 @@
+/*
+ * PowerNV system parameter code
+ *
+ * Copyright (C) 2013 IBM
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kobject.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/gfp.h>
+#include <linux/stat.h>
+#include <asm/opal.h>
+
+#define MAX_PARAM_DATA_LEN	64
+
+static DEFINE_MUTEX(opal_sysparam_mutex);
+static struct kobject *sysparam_kobj;
+static void *param_data_buf;
+
+struct param_attr {
+	struct list_head list;
+	u32 param_id;
+	u32 param_size;
+	struct kobj_attribute kobj_attr;
+};
+
+static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer)
+{
+	struct opal_msg msg;
+	ssize_t ret;
+	int token;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0) {
+		if (token != -ERESTARTSYS)
+			pr_err("%s: Couldn't get the token, returning\n",
+					__func__);
+		ret = token;
+		goto out;
+	}
+
+	ret = opal_get_param(token, param_id, (u64)buffer, length);
+	if (ret != OPAL_ASYNC_COMPLETION)
+		goto out_token;
+
+	ret = opal_async_wait_response(token, &msg);
+	if (ret) {
+		pr_err("%s: Failed to wait for the async response, %zd\n",
+				__func__, ret);
+		goto out_token;
+	}
+
+	ret = be64_to_cpu(msg.params[1]);
+
+out_token:
+	opal_async_release_token(token);
+out:
+	return ret;
+}
+
+static int opal_set_sys_param(u32 param_id, u32 length, void *buffer)
+{
+	struct opal_msg msg;
+	int ret, token;
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0) {
+		if (token != -ERESTARTSYS)
+			pr_err("%s: Couldn't get the token, returning\n",
+					__func__);
+		ret = token;
+		goto out;
+	}
+
+	ret = opal_set_param(token, param_id, (u64)buffer, length);
+
+	if (ret != OPAL_ASYNC_COMPLETION)
+		goto out_token;
+
+	ret = opal_async_wait_response(token, &msg);
+	if (ret) {
+		pr_err("%s: Failed to wait for the async response, %d\n",
+				__func__, ret);
+		goto out_token;
+	}
+
+	ret = be64_to_cpu(msg.params[1]);
+
+out_token:
+	opal_async_release_token(token);
+out:
+	return ret;
+}
+
+static ssize_t sys_param_show(struct kobject *kobj,
+		struct kobj_attribute *kobj_attr, char *buf)
+{
+	struct param_attr *attr = container_of(kobj_attr, struct param_attr,
+			kobj_attr);
+	ssize_t ret;
+
+	mutex_lock(&opal_sysparam_mutex);
+	ret = opal_get_sys_param(attr->param_id, attr->param_size,
+			param_data_buf);
+	if (ret)
+		goto out;
+
+	memcpy(buf, param_data_buf, attr->param_size);
+
+	ret = attr->param_size;
+out:
+	mutex_unlock(&opal_sysparam_mutex);
+	return ret;
+}
+
+static ssize_t sys_param_store(struct kobject *kobj,
+		struct kobj_attribute *kobj_attr, const char *buf, size_t count)
+{
+	struct param_attr *attr = container_of(kobj_attr, struct param_attr,
+			kobj_attr);
+	ssize_t ret;
+
+        /* MAX_PARAM_DATA_LEN is sizeof(param_data_buf) */
+        if (count > MAX_PARAM_DATA_LEN)
+                count = MAX_PARAM_DATA_LEN;
+
+	mutex_lock(&opal_sysparam_mutex);
+	memcpy(param_data_buf, buf, count);
+	ret = opal_set_sys_param(attr->param_id, attr->param_size,
+			param_data_buf);
+	mutex_unlock(&opal_sysparam_mutex);
+	if (!ret)
+		ret = count;
+	return ret;
+}
+
+void __init opal_sys_param_init(void)
+{
+	struct device_node *sysparam;
+	struct param_attr *attr;
+	u32 *id, *size;
+	int count, i;
+	u8 *perm;
+
+	if (!opal_kobj) {
+		pr_warn("SYSPARAM: opal kobject is not available\n");
+		goto out;
+	}
+
+	sysparam_kobj = kobject_create_and_add("sysparams", opal_kobj);
+	if (!sysparam_kobj) {
+		pr_err("SYSPARAM: Failed to create sysparam kobject\n");
+		goto out;
+	}
+
+	/* Allocate big enough buffer for any get/set transactions */
+	param_data_buf = kzalloc(MAX_PARAM_DATA_LEN, GFP_KERNEL);
+	if (!param_data_buf) {
+		pr_err("SYSPARAM: Failed to allocate memory for param data "
+				"buf\n");
+		goto out_kobj_put;
+	}
+
+	sysparam = of_find_node_by_path("/ibm,opal/sysparams");
+	if (!sysparam) {
+		pr_err("SYSPARAM: Opal sysparam node not found\n");
+		goto out_param_buf;
+	}
+
+	if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) {
+		pr_err("SYSPARAM: Opal sysparam node not compatible\n");
+		goto out_node_put;
+	}
+
+	/* Number of parameters exposed through DT */
+	count = of_property_count_strings(sysparam, "param-name");
+	if (count < 0) {
+		pr_err("SYSPARAM: No string found of property param-name in "
+				"the node %s\n", sysparam->name);
+		goto out_node_put;
+	}
+
+	id = kzalloc(sizeof(*id) * count, GFP_KERNEL);
+	if (!id) {
+		pr_err("SYSPARAM: Failed to allocate memory to read parameter "
+				"id\n");
+		goto out_node_put;
+	}
+
+	size = kzalloc(sizeof(*size) * count, GFP_KERNEL);
+	if (!size) {
+		pr_err("SYSPARAM: Failed to allocate memory to read parameter "
+				"size\n");
+		goto out_free_id;
+	}
+
+	perm = kzalloc(sizeof(*perm) * count, GFP_KERNEL);
+	if (!perm) {
+		pr_err("SYSPARAM: Failed to allocate memory to read supported "
+				"action on the parameter");
+		goto out_free_size;
+	}
+
+	if (of_property_read_u32_array(sysparam, "param-id", id, count)) {
+		pr_err("SYSPARAM: Missing property param-id in the DT\n");
+		goto out_free_perm;
+	}
+
+	if (of_property_read_u32_array(sysparam, "param-len", size, count)) {
+		pr_err("SYSPARAM: Missing property param-len in the DT\n");
+		goto out_free_perm;
+	}
+
+
+	if (of_property_read_u8_array(sysparam, "param-perm", perm, count)) {
+		pr_err("SYSPARAM: Missing property param-perm in the DT\n");
+		goto out_free_perm;
+	}
+
+	attr = kzalloc(sizeof(*attr) * count, GFP_KERNEL);
+	if (!attr) {
+		pr_err("SYSPARAM: Failed to allocate memory for parameter "
+				"attributes\n");
+		goto out_free_perm;
+	}
+
+	/* For each of the parameters, populate the parameter attributes */
+	for (i = 0; i < count; i++) {
+		if (size[i] > MAX_PARAM_DATA_LEN) {
+			pr_warn("SYSPARAM: Not creating parameter %d as size "
+				"exceeds buffer length\n", i);
+			continue;
+		}
+
+		sysfs_attr_init(&attr[i].kobj_attr.attr);
+		attr[i].param_id = id[i];
+		attr[i].param_size = size[i];
+		if (of_property_read_string_index(sysparam, "param-name", i,
+				&attr[i].kobj_attr.attr.name))
+			continue;
+
+		/* If the parameter is read-only or read-write */
+		switch (perm[i] & 3) {
+		case OPAL_SYSPARAM_READ:
+			attr[i].kobj_attr.attr.mode = S_IRUGO;
+			break;
+		case OPAL_SYSPARAM_WRITE:
+			attr[i].kobj_attr.attr.mode = S_IWUSR;
+			break;
+		case OPAL_SYSPARAM_RW:
+			attr[i].kobj_attr.attr.mode = S_IRUGO | S_IWUSR;
+			break;
+		default:
+			break;
+		}
+
+		attr[i].kobj_attr.show = sys_param_show;
+		attr[i].kobj_attr.store = sys_param_store;
+
+		if (sysfs_create_file(sysparam_kobj, &attr[i].kobj_attr.attr)) {
+			pr_err("SYSPARAM: Failed to create sysfs file %s\n",
+					attr[i].kobj_attr.attr.name);
+			goto out_free_attr;
+		}
+	}
+
+	kfree(perm);
+	kfree(size);
+	kfree(id);
+	of_node_put(sysparam);
+	return;
+
+out_free_attr:
+	kfree(attr);
+out_free_perm:
+	kfree(perm);
+out_free_size:
+	kfree(size);
+out_free_id:
+	kfree(id);
+out_node_put:
+	of_node_put(sysparam);
+out_param_buf:
+	kfree(param_data_buf);
+out_kobj_put:
+	kobject_put(sysparam_kobj);
+out:
+	return;
+}
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-tracepoints.c b/kernel/arch/powerpc/platforms/powernv/opal-tracepoints.c
new file mode 100644
index 000000000..e11273b23
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-tracepoints.c
@@ -0,0 +1,84 @@
+#include <linux/percpu.h>
+#include <linux/jump_label.h>
+#include <asm/trace.h>
+
+#ifdef HAVE_JUMP_LABEL
+struct static_key opal_tracepoint_key = STATIC_KEY_INIT;
+
+void opal_tracepoint_regfunc(void)
+{
+	static_key_slow_inc(&opal_tracepoint_key);
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+	static_key_slow_dec(&opal_tracepoint_key);
+}
+#else
+/*
+ * We optimise OPAL calls by placing opal_tracepoint_refcount
+ * directly in the TOC so we can check if the opal tracepoints are
+ * enabled via a single load.
+ */
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+extern long opal_tracepoint_refcount;
+
+void opal_tracepoint_regfunc(void)
+{
+	opal_tracepoint_refcount++;
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+	opal_tracepoint_refcount--;
+}
+#endif
+
+/*
+ * Since the tracing code might execute OPAL calls we need to guard against
+ * recursion.
+ */
+static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
+
+void __trace_opal_entry(unsigned long opcode, unsigned long *args)
+{
+	unsigned long flags;
+	unsigned int *depth;
+
+	local_irq_save(flags);
+
+	depth = this_cpu_ptr(&opal_trace_depth);
+
+	if (*depth)
+		goto out;
+
+	(*depth)++;
+	preempt_disable();
+	trace_opal_entry(opcode, args);
+	(*depth)--;
+
+out:
+	local_irq_restore(flags);
+}
+
+void __trace_opal_exit(long opcode, unsigned long retval)
+{
+	unsigned long flags;
+	unsigned int *depth;
+
+	local_irq_save(flags);
+
+	depth = this_cpu_ptr(&opal_trace_depth);
+
+	if (*depth)
+		goto out;
+
+	(*depth)++;
+	trace_opal_exit(opcode, retval);
+	preempt_enable();
+	(*depth)--;
+
+out:
+	local_irq_restore(flags);
+}
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-wrappers.S b/kernel/arch/powerpc/platforms/powernv/opal-wrappers.S
new file mode 100644
index 000000000..a7ade94cd
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -0,0 +1,297 @@
+/*
+ * PowerNV OPAL API wrappers
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/jump_label.h>
+#include <asm/ppc_asm.h>
+#include <asm/hvcall.h>
+#include <asm/asm-offsets.h>
+#include <asm/opal.h>
+
+	.section	".text"
+
+#ifdef CONFIG_TRACEPOINTS
+#ifdef HAVE_JUMP_LABEL
+#define OPAL_BRANCH(LABEL)					\
+	ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key)
+#else
+
+	.section	".toc","aw"
+
+	.globl opal_tracepoint_refcount
+opal_tracepoint_refcount:
+	.llong	0
+
+	.section	".text"
+
+/*
+ * We branch around this in early init by using an unconditional cpu
+ * feature.
+ */
+#define OPAL_BRANCH(LABEL)					\
+BEGIN_FTR_SECTION;						\
+	b	1f;						\
+END_FTR_SECTION(0, 1);						\
+	ld	r12,opal_tracepoint_refcount@toc(r2);		\
+	cmpdi	r12,0;						\
+	bne-	LABEL;						\
+1:
+
+#endif
+
+#else
+#define OPAL_BRANCH(LABEL)
+#endif
+
+/* TODO:
+ *
+ * - Trace irqs in/off (needs saving/restoring all args, argh...)
+ * - Get r11 feed up by Dave so I can have better register usage
+ */
+
+#define OPAL_CALL(name, token)		\
+ _GLOBAL_TOC(name);			\
+	mflr	r0;			\
+	std	r0,16(r1);		\
+	li	r0,token;		\
+	OPAL_BRANCH(opal_tracepoint_entry) \
+	mfcr	r12;			\
+	stw	r12,8(r1);		\
+	std	r1,PACAR1(r13);		\
+	li	r11,0;			\
+	mfmsr	r12;			\
+	ori	r11,r11,MSR_EE;		\
+	std	r12,PACASAVEDMSR(r13);	\
+	andc	r12,r12,r11;		\
+	mtmsrd	r12,1;			\
+	LOAD_REG_ADDR(r11,opal_return);	\
+	mtlr	r11;			\
+	li	r11,MSR_DR|MSR_IR|MSR_LE;\
+	andc	r12,r12,r11;		\
+	mtspr	SPRN_HSRR1,r12;		\
+	LOAD_REG_ADDR(r11,opal);	\
+	ld	r12,8(r11);		\
+	ld	r2,0(r11);		\
+	mtspr	SPRN_HSRR0,r12;		\
+	hrfid
+
+opal_return:
+	/*
+	 * Fixup endian on OPAL return... we should be able to simplify
+	 * this by instead converting the below trampoline to a set of
+	 * bytes (always BE) since MSR:LE will end up fixed up as a side
+	 * effect of the rfid.
+	 */
+	FIXUP_ENDIAN
+	ld	r2,PACATOC(r13);
+	lwz	r4,8(r1);
+	ld	r5,16(r1);
+	ld	r6,PACASAVEDMSR(r13);
+	mtspr	SPRN_SRR0,r5;
+	mtspr	SPRN_SRR1,r6;
+	mtcr	r4;
+	rfid
+
+#ifdef CONFIG_TRACEPOINTS
+opal_tracepoint_entry:
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r0,STK_REG(R23)(r1)
+	std	r3,STK_REG(R24)(r1)
+	std	r4,STK_REG(R25)(r1)
+	std	r5,STK_REG(R26)(r1)
+	std	r6,STK_REG(R27)(r1)
+	std	r7,STK_REG(R28)(r1)
+	std	r8,STK_REG(R29)(r1)
+	std	r9,STK_REG(R30)(r1)
+	std	r10,STK_REG(R31)(r1)
+	mr	r3,r0
+	addi	r4,r1,STK_REG(R24)
+	bl	__trace_opal_entry
+	ld	r0,STK_REG(R23)(r1)
+	ld	r3,STK_REG(R24)(r1)
+	ld	r4,STK_REG(R25)(r1)
+	ld	r5,STK_REG(R26)(r1)
+	ld	r6,STK_REG(R27)(r1)
+	ld	r7,STK_REG(R28)(r1)
+	ld	r8,STK_REG(R29)(r1)
+	ld	r9,STK_REG(R30)(r1)
+	ld	r10,STK_REG(R31)(r1)
+	LOAD_REG_ADDR(r11,opal_tracepoint_return)
+	mfcr	r12
+	std	r11,16(r1)
+	stw	r12,8(r1)
+	std	r1,PACAR1(r13)
+	li	r11,0
+	mfmsr	r12
+	ori	r11,r11,MSR_EE
+	std	r12,PACASAVEDMSR(r13)
+	andc	r12,r12,r11
+	mtmsrd	r12,1
+	LOAD_REG_ADDR(r11,opal_return)
+	mtlr	r11
+	li	r11,MSR_DR|MSR_IR|MSR_LE
+	andc	r12,r12,r11
+	mtspr	SPRN_HSRR1,r12
+	LOAD_REG_ADDR(r11,opal)
+	ld	r12,8(r11)
+	ld	r2,0(r11)
+	mtspr	SPRN_HSRR0,r12
+	hrfid
+
+opal_tracepoint_return:
+	std	r3,STK_REG(R31)(r1)
+	mr	r4,r3
+	ld	r0,STK_REG(R23)(r1)
+	bl	__trace_opal_exit
+	ld	r3,STK_REG(R31)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+	ld	r0,16(r1)
+	mtlr	r0
+	blr
+#endif
+
+/*
+ * Make opal call in realmode. This is a generic function to be called
+ * from realmode. It handles endianness.
+ *
+ * r13 - paca pointer
+ * r1  - stack pointer
+ * r0  - opal token
+ */
+_GLOBAL(opal_call_realmode)
+	mflr	r12
+	std	r12,PPC_LR_STKOFF(r1)
+	ld	r2,PACATOC(r13)
+	/* Set opal return address */
+	LOAD_REG_ADDR(r12,return_from_opal_call)
+	mtlr	r12
+
+	mfmsr	r12
+#ifdef __LITTLE_ENDIAN__
+	/* Handle endian-ness */
+	li	r11,MSR_LE
+	andc	r12,r12,r11
+#endif
+	mtspr	SPRN_HSRR1,r12
+	LOAD_REG_ADDR(r11,opal)
+	ld	r12,8(r11)
+	ld	r2,0(r11)
+	mtspr	SPRN_HSRR0,r12
+	hrfid
+
+return_from_opal_call:
+#ifdef __LITTLE_ENDIAN__
+	FIXUP_ENDIAN
+#endif
+	ld	r12,PPC_LR_STKOFF(r1)
+	mtlr	r12
+	blr
+
+OPAL_CALL(opal_invalid_call,			OPAL_INVALID_CALL);
+OPAL_CALL(opal_console_write,			OPAL_CONSOLE_WRITE);
+OPAL_CALL(opal_console_read,			OPAL_CONSOLE_READ);
+OPAL_CALL(opal_console_write_buffer_space,	OPAL_CONSOLE_WRITE_BUFFER_SPACE);
+OPAL_CALL(opal_rtc_read,			OPAL_RTC_READ);
+OPAL_CALL(opal_rtc_write,			OPAL_RTC_WRITE);
+OPAL_CALL(opal_cec_power_down,			OPAL_CEC_POWER_DOWN);
+OPAL_CALL(opal_cec_reboot,			OPAL_CEC_REBOOT);
+OPAL_CALL(opal_read_nvram,			OPAL_READ_NVRAM);
+OPAL_CALL(opal_write_nvram,			OPAL_WRITE_NVRAM);
+OPAL_CALL(opal_handle_interrupt,		OPAL_HANDLE_INTERRUPT);
+OPAL_CALL(opal_poll_events,			OPAL_POLL_EVENTS);
+OPAL_CALL(opal_pci_set_hub_tce_memory,		OPAL_PCI_SET_HUB_TCE_MEMORY);
+OPAL_CALL(opal_pci_set_phb_tce_memory,		OPAL_PCI_SET_PHB_TCE_MEMORY);
+OPAL_CALL(opal_pci_config_read_byte,		OPAL_PCI_CONFIG_READ_BYTE);
+OPAL_CALL(opal_pci_config_read_half_word,	OPAL_PCI_CONFIG_READ_HALF_WORD);
+OPAL_CALL(opal_pci_config_read_word,		OPAL_PCI_CONFIG_READ_WORD);
+OPAL_CALL(opal_pci_config_write_byte,		OPAL_PCI_CONFIG_WRITE_BYTE);
+OPAL_CALL(opal_pci_config_write_half_word,	OPAL_PCI_CONFIG_WRITE_HALF_WORD);
+OPAL_CALL(opal_pci_config_write_word,		OPAL_PCI_CONFIG_WRITE_WORD);
+OPAL_CALL(opal_set_xive,			OPAL_SET_XIVE);
+OPAL_CALL(opal_get_xive,			OPAL_GET_XIVE);
+OPAL_CALL(opal_register_exception_handler,	OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
+OPAL_CALL(opal_pci_eeh_freeze_status,		OPAL_PCI_EEH_FREEZE_STATUS);
+OPAL_CALL(opal_pci_eeh_freeze_clear,		OPAL_PCI_EEH_FREEZE_CLEAR);
+OPAL_CALL(opal_pci_eeh_freeze_set,		OPAL_PCI_EEH_FREEZE_SET);
+OPAL_CALL(opal_pci_err_inject,			OPAL_PCI_ERR_INJECT);
+OPAL_CALL(opal_pci_shpc,			OPAL_PCI_SHPC);
+OPAL_CALL(opal_pci_phb_mmio_enable,		OPAL_PCI_PHB_MMIO_ENABLE);
+OPAL_CALL(opal_pci_set_phb_mem_window,		OPAL_PCI_SET_PHB_MEM_WINDOW);
+OPAL_CALL(opal_pci_map_pe_mmio_window,		OPAL_PCI_MAP_PE_MMIO_WINDOW);
+OPAL_CALL(opal_pci_set_phb_table_memory,	OPAL_PCI_SET_PHB_TABLE_MEMORY);
+OPAL_CALL(opal_pci_set_pe,			OPAL_PCI_SET_PE);
+OPAL_CALL(opal_pci_set_peltv,			OPAL_PCI_SET_PELTV);
+OPAL_CALL(opal_pci_set_mve,			OPAL_PCI_SET_MVE);
+OPAL_CALL(opal_pci_set_mve_enable,		OPAL_PCI_SET_MVE_ENABLE);
+OPAL_CALL(opal_pci_get_xive_reissue,		OPAL_PCI_GET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_reissue,		OPAL_PCI_SET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_pe,			OPAL_PCI_SET_XIVE_PE);
+OPAL_CALL(opal_get_xive_source,			OPAL_GET_XIVE_SOURCE);
+OPAL_CALL(opal_get_msi_32,			OPAL_GET_MSI_32);
+OPAL_CALL(opal_get_msi_64,			OPAL_GET_MSI_64);
+OPAL_CALL(opal_start_cpu,			OPAL_START_CPU);
+OPAL_CALL(opal_query_cpu_status,		OPAL_QUERY_CPU_STATUS);
+OPAL_CALL(opal_write_oppanel,			OPAL_WRITE_OPPANEL);
+OPAL_CALL(opal_pci_map_pe_dma_window,		OPAL_PCI_MAP_PE_DMA_WINDOW);
+OPAL_CALL(opal_pci_map_pe_dma_window_real,	OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
+OPAL_CALL(opal_pci_reset,			OPAL_PCI_RESET);
+OPAL_CALL(opal_pci_get_hub_diag_data,		OPAL_PCI_GET_HUB_DIAG_DATA);
+OPAL_CALL(opal_pci_get_phb_diag_data,		OPAL_PCI_GET_PHB_DIAG_DATA);
+OPAL_CALL(opal_pci_fence_phb,			OPAL_PCI_FENCE_PHB);
+OPAL_CALL(opal_pci_reinit,			OPAL_PCI_REINIT);
+OPAL_CALL(opal_pci_mask_pe_error,		OPAL_PCI_MASK_PE_ERROR);
+OPAL_CALL(opal_set_slot_led_status,		OPAL_SET_SLOT_LED_STATUS);
+OPAL_CALL(opal_get_epow_status,			OPAL_GET_EPOW_STATUS);
+OPAL_CALL(opal_set_system_attention_led,	OPAL_SET_SYSTEM_ATTENTION_LED);
+OPAL_CALL(opal_pci_next_error,			OPAL_PCI_NEXT_ERROR);
+OPAL_CALL(opal_pci_poll,			OPAL_PCI_POLL);
+OPAL_CALL(opal_pci_msi_eoi,			OPAL_PCI_MSI_EOI);
+OPAL_CALL(opal_pci_get_phb_diag_data2,		OPAL_PCI_GET_PHB_DIAG_DATA2);
+OPAL_CALL(opal_xscom_read,			OPAL_XSCOM_READ);
+OPAL_CALL(opal_xscom_write,			OPAL_XSCOM_WRITE);
+OPAL_CALL(opal_lpc_read,			OPAL_LPC_READ);
+OPAL_CALL(opal_lpc_write,			OPAL_LPC_WRITE);
+OPAL_CALL(opal_return_cpu,			OPAL_RETURN_CPU);
+OPAL_CALL(opal_reinit_cpus,			OPAL_REINIT_CPUS);
+OPAL_CALL(opal_read_elog,			OPAL_ELOG_READ);
+OPAL_CALL(opal_send_ack_elog,			OPAL_ELOG_ACK);
+OPAL_CALL(opal_get_elog_size,			OPAL_ELOG_SIZE);
+OPAL_CALL(opal_resend_pending_logs,		OPAL_ELOG_RESEND);
+OPAL_CALL(opal_write_elog,			OPAL_ELOG_WRITE);
+OPAL_CALL(opal_validate_flash,			OPAL_FLASH_VALIDATE);
+OPAL_CALL(opal_manage_flash,			OPAL_FLASH_MANAGE);
+OPAL_CALL(opal_update_flash,			OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_resync_timebase,			OPAL_RESYNC_TIMEBASE);
+OPAL_CALL(opal_check_token,			OPAL_CHECK_TOKEN);
+OPAL_CALL(opal_dump_init,			OPAL_DUMP_INIT);
+OPAL_CALL(opal_dump_info,			OPAL_DUMP_INFO);
+OPAL_CALL(opal_dump_info2,			OPAL_DUMP_INFO2);
+OPAL_CALL(opal_dump_read,			OPAL_DUMP_READ);
+OPAL_CALL(opal_dump_ack,			OPAL_DUMP_ACK);
+OPAL_CALL(opal_get_msg,				OPAL_GET_MSG);
+OPAL_CALL(opal_check_completion,		OPAL_CHECK_ASYNC_COMPLETION);
+OPAL_CALL(opal_dump_resend_notification,	OPAL_DUMP_RESEND);
+OPAL_CALL(opal_sync_host_reboot,		OPAL_SYNC_HOST_REBOOT);
+OPAL_CALL(opal_sensor_read,			OPAL_SENSOR_READ);
+OPAL_CALL(opal_get_param,			OPAL_GET_PARAM);
+OPAL_CALL(opal_set_param,			OPAL_SET_PARAM);
+OPAL_CALL(opal_handle_hmi,			OPAL_HANDLE_HMI);
+OPAL_CALL(opal_slw_set_reg,			OPAL_SLW_SET_REG);
+OPAL_CALL(opal_register_dump_region,		OPAL_REGISTER_DUMP_REGION);
+OPAL_CALL(opal_unregister_dump_region,		OPAL_UNREGISTER_DUMP_REGION);
+OPAL_CALL(opal_pci_set_phb_cxl_mode,		OPAL_PCI_SET_PHB_CAPI_MODE);
+OPAL_CALL(opal_tpo_write,			OPAL_WRITE_TPO);
+OPAL_CALL(opal_tpo_read,			OPAL_READ_TPO);
+OPAL_CALL(opal_ipmi_send,			OPAL_IPMI_SEND);
+OPAL_CALL(opal_ipmi_recv,			OPAL_IPMI_RECV);
+OPAL_CALL(opal_i2c_request,			OPAL_I2C_REQUEST);
+OPAL_CALL(opal_flash_read,			OPAL_FLASH_READ);
+OPAL_CALL(opal_flash_write,			OPAL_FLASH_WRITE);
+OPAL_CALL(opal_flash_erase,			OPAL_FLASH_ERASE);
diff --git a/kernel/arch/powerpc/platforms/powernv/opal-xscom.c b/kernel/arch/powerpc/platforms/powernv/opal-xscom.c
new file mode 100644
index 000000000..7634d1c62
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -0,0 +1,133 @@
+/*
+ * PowerNV LPC bus handling.
+ *
+ * Copyright 2013 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/bug.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/opal.h>
+#include <asm/scom.h>
+
+/*
+ * We could probably fit that inside the scom_map_t
+ * which is a void* after all but it's really too ugly
+ * so let's kmalloc it for now
+ */
+struct opal_scom_map {
+	uint32_t chip;
+	uint64_t addr;
+};
+
+static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count)
+{
+	struct opal_scom_map *m;
+	const __be32 *gcid;
+
+	if (!of_get_property(dev, "scom-controller", NULL)) {
+		pr_err("%s: device %s is not a SCOM controller\n",
+			__func__, dev->full_name);
+		return SCOM_MAP_INVALID;
+	}
+	gcid = of_get_property(dev, "ibm,chip-id", NULL);
+	if (!gcid) {
+		pr_err("%s: device %s has no ibm,chip-id\n",
+			__func__, dev->full_name);
+		return SCOM_MAP_INVALID;
+	}
+	m = kmalloc(sizeof(struct opal_scom_map), GFP_KERNEL);
+	if (!m)
+		return NULL;
+	m->chip = be32_to_cpup(gcid);
+	m->addr = reg;
+
+	return (scom_map_t)m;
+}
+
+static void opal_scom_unmap(scom_map_t map)
+{
+	kfree(map);
+}
+
+static int opal_xscom_err_xlate(int64_t rc)
+{
+	switch(rc) {
+	case 0:
+		return 0;
+	/* Add more translations if necessary */
+	default:
+		return -EIO;
+	}
+}
+
+static u64 opal_scom_unmangle(u64 addr)
+{
+	/*
+	 * XSCOM indirect addresses have the top bit set. Additionally
+	 * the rest of the top 3 nibbles is always 0.
+	 *
+	 * Because the debugfs interface uses signed offsets and shifts
+	 * the address left by 3, we basically cannot use the top 4 bits
+	 * of the 64-bit address, and thus cannot use the indirect bit.
+	 *
+	 * To deal with that, we support the indirect bit being in bit
+	 * 4 (IBM notation) instead of bit 0 in this API, we do the
+	 * conversion here. To leave room for further xscom address
+	 * expansion, we only clear out the top byte
+	 *
+	 * For in-kernel use, we also support the real indirect bit, so
+	 * we test for any of the top 5 bits
+	 *
+	 */
+	if (addr & (0x1full << 59))
+		addr = (addr & ~(0xffull << 56)) | (1ull << 63);
+	return addr;
+}
+
+static int opal_scom_read(scom_map_t map, u64 reg, u64 *value)
+{
+	struct opal_scom_map *m = map;
+	int64_t rc;
+	__be64 v;
+
+	reg = opal_scom_unmangle(m->addr + reg);
+	rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v));
+	*value = be64_to_cpu(v);
+	return opal_xscom_err_xlate(rc);
+}
+
+static int opal_scom_write(scom_map_t map, u64 reg, u64 value)
+{
+	struct opal_scom_map *m = map;
+	int64_t rc;
+
+	reg = opal_scom_unmangle(m->addr + reg);
+	rc = opal_xscom_write(m->chip, reg, value);
+	return opal_xscom_err_xlate(rc);
+}
+
+static const struct scom_controller opal_scom_controller = {
+	.map	= opal_scom_map,
+	.unmap	= opal_scom_unmap,
+	.read	= opal_scom_read,
+	.write	= opal_scom_write
+};
+
+static int opal_xscom_init(void)
+{
+	if (firmware_has_feature(FW_FEATURE_OPALv3))
+		scom_init(&opal_scom_controller);
+	return 0;
+}
+machine_arch_initcall(powernv, opal_xscom_init);
diff --git a/kernel/arch/powerpc/platforms/powernv/opal.c b/kernel/arch/powerpc/platforms/powernv/opal.c
new file mode 100644
index 000000000..2241565b0
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/opal.c
@@ -0,0 +1,972 @@
+/*
+ * PowerNV OPAL high level interfaces
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt)	"opal: " fmt
+
+#include <linux/printk.h>
+#include <linux/types.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_platform.h>
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/kobject.h>
+#include <linux/delay.h>
+#include <linux/memblock.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/firmware.h>
+#include <asm/mce.h>
+
+#include "powernv.h"
+
+/* /sys/firmware/opal */
+struct kobject *opal_kobj;
+
+struct opal {
+	u64 base;
+	u64 entry;
+	u64 size;
+} opal;
+
+struct mcheck_recoverable_range {
+	u64 start_addr;
+	u64 end_addr;
+	u64 recover_addr;
+};
+
+static struct mcheck_recoverable_range *mc_recoverable_range;
+static int mc_recoverable_range_len;
+
+struct device_node *opal_node;
+static DEFINE_SPINLOCK(opal_write_lock);
+static unsigned int *opal_irqs;
+static unsigned int opal_irq_count;
+static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
+static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
+static DEFINE_SPINLOCK(opal_notifier_lock);
+static uint64_t last_notified_mask = 0x0ul;
+static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
+static uint32_t opal_heartbeat;
+
+static void opal_reinit_cores(void)
+{
+	/* Do the actual re-init, This will clobber all FPRs, VRs, etc...
+	 *
+	 * It will preserve non volatile GPRs and HSPRG0/1. It will
+	 * also restore HIDs and other SPRs to their original value
+	 * but it might clobber a bunch.
+	 */
+#ifdef __BIG_ENDIAN__
+	opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE);
+#else
+	opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE);
+#endif
+}
+
+int __init early_init_dt_scan_opal(unsigned long node,
+				   const char *uname, int depth, void *data)
+{
+	const void *basep, *entryp, *sizep;
+	int basesz, entrysz, runtimesz;
+
+	if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
+		return 0;
+
+	basep  = of_get_flat_dt_prop(node, "opal-base-address", &basesz);
+	entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz);
+	sizep = of_get_flat_dt_prop(node, "opal-runtime-size", &runtimesz);
+
+	if (!basep || !entryp || !sizep)
+		return 1;
+
+	opal.base = of_read_number(basep, basesz/4);
+	opal.entry = of_read_number(entryp, entrysz/4);
+	opal.size = of_read_number(sizep, runtimesz/4);
+
+	pr_debug("OPAL Base  = 0x%llx (basep=%p basesz=%d)\n",
+		 opal.base, basep, basesz);
+	pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%d)\n",
+		 opal.entry, entryp, entrysz);
+	pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%d)\n",
+		 opal.size, sizep, runtimesz);
+
+	powerpc_firmware_features |= FW_FEATURE_OPAL;
+	if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) {
+		powerpc_firmware_features |= FW_FEATURE_OPALv2;
+		powerpc_firmware_features |= FW_FEATURE_OPALv3;
+		pr_info("OPAL V3 detected !\n");
+	} else if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) {
+		powerpc_firmware_features |= FW_FEATURE_OPALv2;
+		pr_info("OPAL V2 detected !\n");
+	} else {
+		pr_info("OPAL V1 detected !\n");
+	}
+
+	/* Reinit all cores with the right endian */
+	opal_reinit_cores();
+
+	/* Restore some bits */
+	if (cur_cpu_spec->cpu_restore)
+		cur_cpu_spec->cpu_restore();
+
+	return 1;
+}
+
+int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
+				   const char *uname, int depth, void *data)
+{
+	int i, psize, size;
+	const __be32 *prop;
+
+	if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
+		return 0;
+
+	prop = of_get_flat_dt_prop(node, "mcheck-recoverable-ranges", &psize);
+
+	if (!prop)
+		return 1;
+
+	pr_debug("Found machine check recoverable ranges.\n");
+
+	/*
+	 * Calculate number of available entries.
+	 *
+	 * Each recoverable address range entry is (start address, len,
+	 * recovery address), 2 cells each for start and recovery address,
+	 * 1 cell for len, totalling 5 cells per entry.
+	 */
+	mc_recoverable_range_len = psize / (sizeof(*prop) * 5);
+
+	/* Sanity check */
+	if (!mc_recoverable_range_len)
+		return 1;
+
+	/* Size required to hold all the entries. */
+	size = mc_recoverable_range_len *
+			sizeof(struct mcheck_recoverable_range);
+
+	/*
+	 * Allocate a buffer to hold the MC recoverable ranges. We would be
+	 * accessing them in real mode, hence it needs to be within
+	 * RMO region.
+	 */
+	mc_recoverable_range =__va(memblock_alloc_base(size, __alignof__(u64),
+							ppc64_rma_size));
+	memset(mc_recoverable_range, 0, size);
+
+	for (i = 0; i < mc_recoverable_range_len; i++) {
+		mc_recoverable_range[i].start_addr =
+					of_read_number(prop + (i * 5) + 0, 2);
+		mc_recoverable_range[i].end_addr =
+					mc_recoverable_range[i].start_addr +
+					of_read_number(prop + (i * 5) + 2, 1);
+		mc_recoverable_range[i].recover_addr =
+					of_read_number(prop + (i * 5) + 3, 2);
+
+		pr_debug("Machine check recoverable range: %llx..%llx: %llx\n",
+				mc_recoverable_range[i].start_addr,
+				mc_recoverable_range[i].end_addr,
+				mc_recoverable_range[i].recover_addr);
+	}
+	return 1;
+}
+
+static int __init opal_register_exception_handlers(void)
+{
+#ifdef __BIG_ENDIAN__
+	u64 glue;
+
+	if (!(powerpc_firmware_features & FW_FEATURE_OPAL))
+		return -ENODEV;
+
+	/* Hookup some exception handlers except machine check. We use the
+	 * fwnmi area at 0x7000 to provide the glue space to OPAL
+	 */
+	glue = 0x7000;
+
+	/*
+	 * Check if we are running on newer firmware that exports
+	 * OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to patch
+	 * the HMI interrupt and we catch it directly in Linux.
+	 *
+	 * For older firmware (i.e currently released POWER8 System Firmware
+	 * as of today <= SV810_087), we fallback to old behavior and let OPAL
+	 * patch the HMI vector and handle it inside OPAL firmware.
+	 *
+	 * For newer firmware (in development/yet to be released) we will
+	 * start catching/handling HMI directly in Linux.
+	 */
+	if (!opal_check_token(OPAL_HANDLE_HMI)) {
+		pr_info("Old firmware detected, OPAL handles HMIs.\n");
+		opal_register_exception_handler(
+				OPAL_HYPERVISOR_MAINTENANCE_HANDLER,
+				0, glue);
+		glue += 128;
+	}
+
+	opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
+#endif
+
+	return 0;
+}
+machine_early_initcall(powernv, opal_register_exception_handlers);
+
+int opal_notifier_register(struct notifier_block *nb)
+{
+	if (!nb) {
+		pr_warning("%s: Invalid argument (%p)\n",
+			   __func__, nb);
+		return -EINVAL;
+	}
+
+	atomic_notifier_chain_register(&opal_notifier_head, nb);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(opal_notifier_register);
+
+int opal_notifier_unregister(struct notifier_block *nb)
+{
+	if (!nb) {
+		pr_warning("%s: Invalid argument (%p)\n",
+			   __func__, nb);
+		return -EINVAL;
+	}
+
+	atomic_notifier_chain_unregister(&opal_notifier_head, nb);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(opal_notifier_unregister);
+
+static void opal_do_notifier(uint64_t events)
+{
+	unsigned long flags;
+	uint64_t changed_mask;
+
+	if (atomic_read(&opal_notifier_hold))
+		return;
+
+	spin_lock_irqsave(&opal_notifier_lock, flags);
+	changed_mask = last_notified_mask ^ events;
+	last_notified_mask = events;
+	spin_unlock_irqrestore(&opal_notifier_lock, flags);
+
+	/*
+	 * We feed with the event bits and changed bits for
+	 * enough information to the callback.
+	 */
+	atomic_notifier_call_chain(&opal_notifier_head,
+				   events, (void *)changed_mask);
+}
+
+void opal_notifier_update_evt(uint64_t evt_mask,
+			      uint64_t evt_val)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&opal_notifier_lock, flags);
+	last_notified_mask &= ~evt_mask;
+	last_notified_mask |= evt_val;
+	spin_unlock_irqrestore(&opal_notifier_lock, flags);
+}
+
+void opal_notifier_enable(void)
+{
+	int64_t rc;
+	__be64 evt = 0;
+
+	atomic_set(&opal_notifier_hold, 0);
+
+	/* Process pending events */
+	rc = opal_poll_events(&evt);
+	if (rc == OPAL_SUCCESS && evt)
+		opal_do_notifier(be64_to_cpu(evt));
+}
+
+void opal_notifier_disable(void)
+{
+	atomic_set(&opal_notifier_hold, 1);
+}
+
+/*
+ * Opal message notifier based on message type. Allow subscribers to get
+ * notified for specific messgae type.
+ */
+int opal_message_notifier_register(enum opal_msg_type msg_type,
+					struct notifier_block *nb)
+{
+	if (!nb || msg_type >= OPAL_MSG_TYPE_MAX) {
+		pr_warning("%s: Invalid arguments, msg_type:%d\n",
+			   __func__, msg_type);
+		return -EINVAL;
+	}
+
+	return atomic_notifier_chain_register(
+				&opal_msg_notifier_head[msg_type], nb);
+}
+
+int opal_message_notifier_unregister(enum opal_msg_type msg_type,
+				     struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(
+			&opal_msg_notifier_head[msg_type], nb);
+}
+
+static void opal_message_do_notify(uint32_t msg_type, void *msg)
+{
+	/* notify subscribers */
+	atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
+					msg_type, msg);
+}
+
+static void opal_handle_message(void)
+{
+	s64 ret;
+	/*
+	 * TODO: pre-allocate a message buffer depending on opal-msg-size
+	 * value in /proc/device-tree.
+	 */
+	static struct opal_msg msg;
+	u32 type;
+
+	ret = opal_get_msg(__pa(&msg), sizeof(msg));
+	/* No opal message pending. */
+	if (ret == OPAL_RESOURCE)
+		return;
+
+	/* check for errors. */
+	if (ret) {
+		pr_warning("%s: Failed to retrieve opal message, err=%lld\n",
+				__func__, ret);
+		return;
+	}
+
+	type = be32_to_cpu(msg.msg_type);
+
+	/* Sanity check */
+	if (type >= OPAL_MSG_TYPE_MAX) {
+		pr_warning("%s: Unknown message type: %u\n", __func__, type);
+		return;
+	}
+	opal_message_do_notify(type, (void *)&msg);
+}
+
+static int opal_message_notify(struct notifier_block *nb,
+			  unsigned long events, void *change)
+{
+	if (events & OPAL_EVENT_MSG_PENDING)
+		opal_handle_message();
+	return 0;
+}
+
+static struct notifier_block opal_message_nb = {
+	.notifier_call	= opal_message_notify,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int __init opal_message_init(void)
+{
+	int ret, i;
+
+	for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
+		ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
+
+	ret = opal_notifier_register(&opal_message_nb);
+	if (ret) {
+		pr_err("%s: Can't register OPAL event notifier (%d)\n",
+		       __func__, ret);
+		return ret;
+	}
+	return 0;
+}
+machine_early_initcall(powernv, opal_message_init);
+
+int opal_get_chars(uint32_t vtermno, char *buf, int count)
+{
+	s64 rc;
+	__be64 evt, len;
+
+	if (!opal.entry)
+		return -ENODEV;
+	opal_poll_events(&evt);
+	if ((be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_INPUT) == 0)
+		return 0;
+	len = cpu_to_be64(count);
+	rc = opal_console_read(vtermno, &len, buf);
+	if (rc == OPAL_SUCCESS)
+		return be64_to_cpu(len);
+	return 0;
+}
+
+int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
+{
+	int written = 0;
+	__be64 olen;
+	s64 len, rc;
+	unsigned long flags;
+	__be64 evt;
+
+	if (!opal.entry)
+		return -ENODEV;
+
+	/* We want put_chars to be atomic to avoid mangling of hvsi
+	 * packets. To do that, we first test for room and return
+	 * -EAGAIN if there isn't enough.
+	 *
+	 * Unfortunately, opal_console_write_buffer_space() doesn't
+	 * appear to work on opal v1, so we just assume there is
+	 * enough room and be done with it
+	 */
+	spin_lock_irqsave(&opal_write_lock, flags);
+	if (firmware_has_feature(FW_FEATURE_OPALv2)) {
+		rc = opal_console_write_buffer_space(vtermno, &olen);
+		len = be64_to_cpu(olen);
+		if (rc || len < total_len) {
+			spin_unlock_irqrestore(&opal_write_lock, flags);
+			/* Closed -> drop characters */
+			if (rc)
+				return total_len;
+			opal_poll_events(NULL);
+			return -EAGAIN;
+		}
+	}
+
+	/* We still try to handle partial completions, though they
+	 * should no longer happen.
+	 */
+	rc = OPAL_BUSY;
+	while(total_len > 0 && (rc == OPAL_BUSY ||
+				rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) {
+		olen = cpu_to_be64(total_len);
+		rc = opal_console_write(vtermno, &olen, data);
+		len = be64_to_cpu(olen);
+
+		/* Closed or other error drop */
+		if (rc != OPAL_SUCCESS && rc != OPAL_BUSY &&
+		    rc != OPAL_BUSY_EVENT) {
+			written = total_len;
+			break;
+		}
+		if (rc == OPAL_SUCCESS) {
+			total_len -= len;
+			data += len;
+			written += len;
+		}
+		/* This is a bit nasty but we need that for the console to
+		 * flush when there aren't any interrupts. We will clean
+		 * things a bit later to limit that to synchronous path
+		 * such as the kernel console and xmon/udbg
+		 */
+		do
+			opal_poll_events(&evt);
+		while(rc == OPAL_SUCCESS &&
+			(be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT));
+	}
+	spin_unlock_irqrestore(&opal_write_lock, flags);
+	return written;
+}
+
+static int opal_recover_mce(struct pt_regs *regs,
+					struct machine_check_event *evt)
+{
+	int recovered = 0;
+	uint64_t ea = get_mce_fault_addr(evt);
+
+	if (!(regs->msr & MSR_RI)) {
+		/* If MSR_RI isn't set, we cannot recover */
+		recovered = 0;
+	} else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
+		/* Platform corrected itself */
+		recovered = 1;
+	} else if (ea && !is_kernel_addr(ea)) {
+		/*
+		 * Faulting address is not in kernel text. We should be fine.
+		 * We need to find which process uses this address.
+		 * For now, kill the task if we have received exception when
+		 * in userspace.
+		 *
+		 * TODO: Queue up this address for hwpoisioning later.
+		 */
+		if (user_mode(regs) && !is_global_init(current)) {
+			_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+			recovered = 1;
+		} else
+			recovered = 0;
+	} else if (user_mode(regs) && !is_global_init(current) &&
+		evt->severity == MCE_SEV_ERROR_SYNC) {
+		/*
+		 * If we have received a synchronous error when in userspace
+		 * kill the task.
+		 */
+		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+		recovered = 1;
+	}
+	return recovered;
+}
+
+int opal_machine_check(struct pt_regs *regs)
+{
+	struct machine_check_event evt;
+
+	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+		return 0;
+
+	/* Print things out */
+	if (evt.version != MCE_V1) {
+		pr_err("Machine Check Exception, Unknown event version %d !\n",
+		       evt.version);
+		return 0;
+	}
+	machine_check_print_event_info(&evt);
+
+	if (opal_recover_mce(regs, &evt))
+		return 1;
+	return 0;
+}
+
+/* Early hmi handler called in real mode. */
+int opal_hmi_exception_early(struct pt_regs *regs)
+{
+	s64 rc;
+
+	/*
+	 * call opal hmi handler. Pass paca address as token.
+	 * The return value OPAL_SUCCESS is an indication that there is
+	 * an HMI event generated waiting to pull by Linux.
+	 */
+	rc = opal_handle_hmi();
+	if (rc == OPAL_SUCCESS) {
+		local_paca->hmi_event_available = 1;
+		return 1;
+	}
+	return 0;
+}
+
+/* HMI exception handler called in virtual mode during check_irq_replay. */
+int opal_handle_hmi_exception(struct pt_regs *regs)
+{
+	s64 rc;
+	__be64 evt = 0;
+
+	/*
+	 * Check if HMI event is available.
+	 * if Yes, then call opal_poll_events to pull opal messages and
+	 * process them.
+	 */
+	if (!local_paca->hmi_event_available)
+		return 0;
+
+	local_paca->hmi_event_available = 0;
+	rc = opal_poll_events(&evt);
+	if (rc == OPAL_SUCCESS && evt)
+		opal_do_notifier(be64_to_cpu(evt));
+
+	return 1;
+}
+
+static uint64_t find_recovery_address(uint64_t nip)
+{
+	int i;
+
+	for (i = 0; i < mc_recoverable_range_len; i++)
+		if ((nip >= mc_recoverable_range[i].start_addr) &&
+		    (nip < mc_recoverable_range[i].end_addr))
+		    return mc_recoverable_range[i].recover_addr;
+	return 0;
+}
+
+bool opal_mce_check_early_recovery(struct pt_regs *regs)
+{
+	uint64_t recover_addr = 0;
+
+	if (!opal.base || !opal.size)
+		goto out;
+
+	if ((regs->nip >= opal.base) &&
+			(regs->nip <= (opal.base + opal.size)))
+		recover_addr = find_recovery_address(regs->nip);
+
+	/*
+	 * Setup regs->nip to rfi into fixup address.
+	 */
+	if (recover_addr)
+		regs->nip = recover_addr;
+
+out:
+	return !!recover_addr;
+}
+
+static irqreturn_t opal_interrupt(int irq, void *data)
+{
+	__be64 events;
+
+	opal_handle_interrupt(virq_to_hw(irq), &events);
+
+	opal_do_notifier(be64_to_cpu(events));
+
+	return IRQ_HANDLED;
+}
+
+static int opal_sysfs_init(void)
+{
+	opal_kobj = kobject_create_and_add("opal", firmware_kobj);
+	if (!opal_kobj) {
+		pr_warn("kobject_create_and_add opal failed\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj,
+			       struct bin_attribute *bin_attr,
+			       char *buf, loff_t off, size_t count)
+{
+	return memory_read_from_buffer(buf, count, &off, bin_attr->private,
+				       bin_attr->size);
+}
+
+static BIN_ATTR_RO(symbol_map, 0);
+
+static void opal_export_symmap(void)
+{
+	const __be64 *syms;
+	unsigned int size;
+	struct device_node *fw;
+	int rc;
+
+	fw = of_find_node_by_path("/ibm,opal/firmware");
+	if (!fw)
+		return;
+	syms = of_get_property(fw, "symbol-map", &size);
+	if (!syms || size != 2 * sizeof(__be64))
+		return;
+
+	/* Setup attributes */
+	bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0]));
+	bin_attr_symbol_map.size = be64_to_cpu(syms[1]);
+
+	rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map);
+	if (rc)
+		pr_warn("Error %d creating OPAL symbols file\n", rc);
+}
+
+static void __init opal_dump_region_init(void)
+{
+	void *addr;
+	uint64_t size;
+	int rc;
+
+	if (!opal_check_token(OPAL_REGISTER_DUMP_REGION))
+		return;
+
+	/* Register kernel log buffer */
+	addr = log_buf_addr_get();
+	if (addr == NULL)
+		return;
+
+	size = log_buf_len_get();
+	if (size == 0)
+		return;
+
+	rc = opal_register_dump_region(OPAL_DUMP_REGION_LOG_BUF,
+				       __pa(addr), size);
+	/* Don't warn if this is just an older OPAL that doesn't
+	 * know about that call
+	 */
+	if (rc && rc != OPAL_UNSUPPORTED)
+		pr_warn("DUMP: Failed to register kernel log buffer. "
+			"rc = %d\n", rc);
+}
+
+static void opal_flash_init(struct device_node *opal_node)
+{
+	struct device_node *np;
+
+	for_each_child_of_node(opal_node, np)
+		if (of_device_is_compatible(np, "ibm,opal-flash"))
+			of_platform_device_create(np, NULL, NULL);
+}
+
+static void opal_ipmi_init(struct device_node *opal_node)
+{
+	struct device_node *np;
+
+	for_each_child_of_node(opal_node, np)
+		if (of_device_is_compatible(np, "ibm,opal-ipmi"))
+			of_platform_device_create(np, NULL, NULL);
+}
+
+static void opal_i2c_create_devs(void)
+{
+	struct device_node *np;
+
+	for_each_compatible_node(np, NULL, "ibm,opal-i2c")
+		of_platform_device_create(np, NULL, NULL);
+}
+
+static void __init opal_irq_init(struct device_node *dn)
+{
+	const __be32 *irqs;
+	int i, irqlen;
+
+	/* Get interrupt property */
+	irqs = of_get_property(opal_node, "opal-interrupts", &irqlen);
+	opal_irq_count = irqs ? (irqlen / 4) : 0;
+	pr_debug("Found %d interrupts reserved for OPAL\n", opal_irq_count);
+	if (!opal_irq_count)
+		return;
+
+	/* Install interrupt handlers */
+	opal_irqs = kzalloc(opal_irq_count * sizeof(unsigned int), GFP_KERNEL);
+	for (i = 0; irqs && i < opal_irq_count; i++, irqs++) {
+		unsigned int irq, virq;
+		int rc;
+
+		/* Get hardware and virtual IRQ */
+		irq = be32_to_cpup(irqs);
+		virq = irq_create_mapping(NULL, irq);
+		if (virq == NO_IRQ) {
+			pr_warn("Failed to map irq 0x%x\n", irq);
+			continue;
+		}
+
+		/* Install interrupt handler */
+		rc = request_irq(virq, opal_interrupt, 0, "opal", NULL);
+		if (rc) {
+			irq_dispose_mapping(virq);
+			pr_warn("Error %d requesting irq %d (0x%x)\n",
+				 rc, virq, irq);
+			continue;
+		}
+
+		/* Cache IRQ */
+		opal_irqs[i] = virq;
+	}
+}
+
+static int kopald(void *unused)
+{
+	set_freezable();
+	do {
+		try_to_freeze();
+		opal_poll_events(NULL);
+		msleep_interruptible(opal_heartbeat);
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
+static void opal_init_heartbeat(void)
+{
+	/* Old firwmware, we assume the HVC heartbeat is sufficient */
+	if (of_property_read_u32(opal_node, "ibm,heartbeat-ms",
+				 &opal_heartbeat) != 0)
+		opal_heartbeat = 0;
+
+	if (opal_heartbeat)
+		kthread_run(kopald, NULL, "kopald");
+}
+
+static int __init opal_init(void)
+{
+	struct device_node *np, *consoles;
+	int rc;
+
+	opal_node = of_find_node_by_path("/ibm,opal");
+	if (!opal_node) {
+		pr_warn("Device node not found\n");
+		return -ENODEV;
+	}
+
+	/* Register OPAL consoles if any ports */
+	if (firmware_has_feature(FW_FEATURE_OPALv2))
+		consoles = of_find_node_by_path("/ibm,opal/consoles");
+	else
+		consoles = of_node_get(opal_node);
+	if (consoles) {
+		for_each_child_of_node(consoles, np) {
+			if (strcmp(np->name, "serial"))
+				continue;
+			of_platform_device_create(np, NULL, NULL);
+		}
+		of_node_put(consoles);
+	}
+
+	/* Create i2c platform devices */
+	opal_i2c_create_devs();
+
+	/* Setup a heatbeat thread if requested by OPAL */
+	opal_init_heartbeat();
+
+	/* Find all OPAL interrupts and request them */
+	opal_irq_init(opal_node);
+
+	/* Create "opal" kobject under /sys/firmware */
+	rc = opal_sysfs_init();
+	if (rc == 0) {
+		/* Export symbol map to userspace */
+		opal_export_symmap();
+		/* Setup dump region interface */
+		opal_dump_region_init();
+		/* Setup error log interface */
+		rc = opal_elog_init();
+		/* Setup code update interface */
+		opal_flash_update_init();
+		/* Setup platform dump extract interface */
+		opal_platform_dump_init();
+		/* Setup system parameters interface */
+		opal_sys_param_init();
+		/* Setup message log interface. */
+		opal_msglog_init();
+	}
+
+	/* Initialize OPAL IPMI backend */
+	opal_ipmi_init(opal_node);
+
+	opal_flash_init(opal_node);
+
+	return 0;
+}
+machine_subsys_initcall(powernv, opal_init);
+
+void opal_shutdown(void)
+{
+	unsigned int i;
+	long rc = OPAL_BUSY;
+
+	/* First free interrupts, which will also mask them */
+	for (i = 0; i < opal_irq_count; i++) {
+		if (opal_irqs[i])
+			free_irq(opal_irqs[i], NULL);
+		opal_irqs[i] = 0;
+	}
+
+	/*
+	 * Then sync with OPAL which ensure anything that can
+	 * potentially write to our memory has completed such
+	 * as an ongoing dump retrieval
+	 */
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_sync_host_reboot();
+		if (rc == OPAL_BUSY)
+			opal_poll_events(NULL);
+		else
+			mdelay(10);
+	}
+
+	/* Unregister memory dump region */
+	if (opal_check_token(OPAL_UNREGISTER_DUMP_REGION))
+		opal_unregister_dump_region(OPAL_DUMP_REGION_LOG_BUF);
+}
+
+/* Export this so that test modules can use it */
+EXPORT_SYMBOL_GPL(opal_invalid_call);
+EXPORT_SYMBOL_GPL(opal_ipmi_send);
+EXPORT_SYMBOL_GPL(opal_ipmi_recv);
+EXPORT_SYMBOL_GPL(opal_flash_read);
+EXPORT_SYMBOL_GPL(opal_flash_write);
+EXPORT_SYMBOL_GPL(opal_flash_erase);
+
+/* Convert a region of vmalloc memory to an opal sg list */
+struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
+					     unsigned long vmalloc_size)
+{
+	struct opal_sg_list *sg, *first = NULL;
+	unsigned long i = 0;
+
+	sg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!sg)
+		goto nomem;
+
+	first = sg;
+
+	while (vmalloc_size > 0) {
+		uint64_t data = vmalloc_to_pfn(vmalloc_addr) << PAGE_SHIFT;
+		uint64_t length = min(vmalloc_size, PAGE_SIZE);
+
+		sg->entry[i].data = cpu_to_be64(data);
+		sg->entry[i].length = cpu_to_be64(length);
+		i++;
+
+		if (i >= SG_ENTRIES_PER_NODE) {
+			struct opal_sg_list *next;
+
+			next = kzalloc(PAGE_SIZE, GFP_KERNEL);
+			if (!next)
+				goto nomem;
+
+			sg->length = cpu_to_be64(
+					i * sizeof(struct opal_sg_entry) + 16);
+			i = 0;
+			sg->next = cpu_to_be64(__pa(next));
+			sg = next;
+		}
+
+		vmalloc_addr += length;
+		vmalloc_size -= length;
+	}
+
+	sg->length = cpu_to_be64(i * sizeof(struct opal_sg_entry) + 16);
+
+	return first;
+
+nomem:
+	pr_err("%s : Failed to allocate memory\n", __func__);
+	opal_free_sg_list(first);
+	return NULL;
+}
+
+void opal_free_sg_list(struct opal_sg_list *sg)
+{
+	while (sg) {
+		uint64_t next = be64_to_cpu(sg->next);
+
+		kfree(sg);
+
+		if (next)
+			sg = __va(next);
+		else
+			sg = NULL;
+	}
+}
+
+int opal_error_code(int rc)
+{
+	switch (rc) {
+	case OPAL_SUCCESS:		return 0;
+
+	case OPAL_PARAMETER:		return -EINVAL;
+	case OPAL_ASYNC_COMPLETION:	return -EINPROGRESS;
+	case OPAL_BUSY_EVENT:		return -EBUSY;
+	case OPAL_NO_MEM:		return -ENOMEM;
+
+	case OPAL_UNSUPPORTED:		return -EIO;
+	case OPAL_HARDWARE:		return -EIO;
+	case OPAL_INTERNAL_ERROR:	return -EIO;
+	default:
+		pr_err("%s: unexpected OPAL error %d\n", __func__, rc);
+		return -EIO;
+	}
+}
+
+EXPORT_SYMBOL_GPL(opal_poll_events);
+EXPORT_SYMBOL_GPL(opal_rtc_read);
+EXPORT_SYMBOL_GPL(opal_rtc_write);
+EXPORT_SYMBOL_GPL(opal_tpo_read);
+EXPORT_SYMBOL_GPL(opal_tpo_write);
+EXPORT_SYMBOL_GPL(opal_i2c_request);
diff --git a/kernel/arch/powerpc/platforms/powernv/pci-ioda.c b/kernel/arch/powerpc/platforms/powernv/pci-ioda.c
new file mode 100644
index 000000000..f8bc950ef
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -0,0 +1,2871 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/crash_dump.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+#include <linux/memblock.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+#include <asm/firmware.h>
+#include <asm/pnv-pci.h>
+
+#include <misc/cxl.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/* 256M DMA window, 4K TCE pages, 8 bytes TCE */
+#define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
+
+static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+			    const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	char pfix[32];
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	if (pe->flags & PNV_IODA_PE_DEV)
+		strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
+	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+		sprintf(pfix, "%04x:%02x     ",
+			pci_domain_nr(pe->pbus), pe->pbus->number);
+#ifdef CONFIG_PCI_IOV
+	else if (pe->flags & PNV_IODA_PE_VF)
+		sprintf(pfix, "%04x:%02x:%2x.%d",
+			pci_domain_nr(pe->parent_dev->bus),
+			(pe->rid & 0xff00) >> 8,
+			PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
+#endif /* CONFIG_PCI_IOV*/
+
+	printk("%spci %s: [PE# %.3d] %pV",
+	       level, pfix, pe->pe_number, &vaf);
+
+	va_end(args);
+}
+
+#define pe_err(pe, fmt, ...)					\
+	pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
+#define pe_warn(pe, fmt, ...)					\
+	pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define pe_info(pe, fmt, ...)					\
+	pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
+
+static bool pnv_iommu_bypass_disabled __read_mostly;
+
+static int __init iommu_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	while (*str) {
+		if (!strncmp(str, "nobypass", 8)) {
+			pnv_iommu_bypass_disabled = true;
+			pr_info("PowerNV: IOMMU bypass window disabled.\n");
+			break;
+		}
+		str += strcspn(str, ",");
+		if (*str == ',')
+			str++;
+	}
+
+	return 0;
+}
+early_param("iommu", iommu_setup);
+
+/*
+ * stdcix is only supposed to be used in hypervisor real mode as per
+ * the architecture spec
+ */
+static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
+{
+	__asm__ __volatile__("stdcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
+static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
+{
+	return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) ==
+		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
+}
+
+static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
+{
+	if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
+		pr_warn("%s: Invalid PE %d on PHB#%x\n",
+			__func__, pe_no, phb->hose->global_number);
+		return;
+	}
+
+	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
+		pr_warn("%s: PE %d was assigned on PHB#%x\n",
+			__func__, pe_no, phb->hose->global_number);
+		return;
+	}
+
+	phb->ioda.pe_array[pe_no].phb = phb;
+	phb->ioda.pe_array[pe_no].pe_number = pe_no;
+}
+
+static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
+{
+	unsigned long pe;
+
+	do {
+		pe = find_next_zero_bit(phb->ioda.pe_alloc,
+					phb->ioda.total_pe, 0);
+		if (pe >= phb->ioda.total_pe)
+			return IODA_INVALID_PE;
+	} while(test_and_set_bit(pe, phb->ioda.pe_alloc));
+
+	phb->ioda.pe_array[pe].phb = phb;
+	phb->ioda.pe_array[pe].pe_number = pe;
+	return pe;
+}
+
+static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
+{
+	WARN_ON(phb->ioda.pe_array[pe].pdev);
+
+	memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
+	clear_bit(pe, phb->ioda.pe_alloc);
+}
+
+/* The default M64 BAR is shared by all PEs */
+static int pnv_ioda2_init_m64(struct pnv_phb *phb)
+{
+	const char *desc;
+	struct resource *r;
+	s64 rc;
+
+	/* Configure the default M64 BAR */
+	rc = opal_pci_set_phb_mem_window(phb->opal_id,
+					 OPAL_M64_WINDOW_TYPE,
+					 phb->ioda.m64_bar_idx,
+					 phb->ioda.m64_base,
+					 0, /* unused */
+					 phb->ioda.m64_size);
+	if (rc != OPAL_SUCCESS) {
+		desc = "configuring";
+		goto fail;
+	}
+
+	/* Enable the default M64 BAR */
+	rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				      OPAL_M64_WINDOW_TYPE,
+				      phb->ioda.m64_bar_idx,
+				      OPAL_ENABLE_M64_SPLIT);
+	if (rc != OPAL_SUCCESS) {
+		desc = "enabling";
+		goto fail;
+	}
+
+	/* Mark the M64 BAR assigned */
+	set_bit(phb->ioda.m64_bar_idx, &phb->ioda.m64_bar_alloc);
+
+	/*
+	 * Strip off the segment used by the reserved PE, which is
+	 * expected to be 0 or last one of PE capabicity.
+	 */
+	r = &phb->hose->mem_resources[1];
+	if (phb->ioda.reserved_pe == 0)
+		r->start += phb->ioda.m64_segsize;
+	else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1))
+		r->end -= phb->ioda.m64_segsize;
+	else
+		pr_warn("  Cannot strip M64 segment for reserved PE#%d\n",
+			phb->ioda.reserved_pe);
+
+	return 0;
+
+fail:
+	pr_warn("  Failure %lld %s M64 BAR#%d\n",
+		rc, desc, phb->ioda.m64_bar_idx);
+	opal_pci_phb_mmio_enable(phb->opal_id,
+				 OPAL_M64_WINDOW_TYPE,
+				 phb->ioda.m64_bar_idx,
+				 OPAL_DISABLE_M64);
+	return -EIO;
+}
+
+static void pnv_ioda2_reserve_m64_pe(struct pnv_phb *phb)
+{
+	resource_size_t sgsz = phb->ioda.m64_segsize;
+	struct pci_dev *pdev;
+	struct resource *r;
+	int base, step, i;
+
+	/*
+	 * Root bus always has full M64 range and root port has
+	 * M64 range used in reality. So we're checking root port
+	 * instead of root bus.
+	 */
+	list_for_each_entry(pdev, &phb->hose->bus->devices, bus_list) {
+		for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
+			r = &pdev->resource[PCI_BRIDGE_RESOURCES + i];
+			if (!r->parent ||
+			    !pnv_pci_is_mem_pref_64(r->flags))
+				continue;
+
+			base = (r->start - phb->ioda.m64_base) / sgsz;
+			for (step = 0; step < resource_size(r) / sgsz; step++)
+				pnv_ioda_reserve_pe(phb, base + step);
+		}
+	}
+}
+
+static int pnv_ioda2_pick_m64_pe(struct pnv_phb *phb,
+				 struct pci_bus *bus, int all)
+{
+	resource_size_t segsz = phb->ioda.m64_segsize;
+	struct pci_dev *pdev;
+	struct resource *r;
+	struct pnv_ioda_pe *master_pe, *pe;
+	unsigned long size, *pe_alloc;
+	bool found;
+	int start, i, j;
+
+	/* Root bus shouldn't use M64 */
+	if (pci_is_root_bus(bus))
+		return IODA_INVALID_PE;
+
+	/* We support only one M64 window on each bus */
+	found = false;
+	pci_bus_for_each_resource(bus, r, i) {
+		if (r && r->parent &&
+		    pnv_pci_is_mem_pref_64(r->flags)) {
+			found = true;
+			break;
+		}
+	}
+
+	/* No M64 window found ? */
+	if (!found)
+		return IODA_INVALID_PE;
+
+	/* Allocate bitmap */
+	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+	pe_alloc = kzalloc(size, GFP_KERNEL);
+	if (!pe_alloc) {
+		pr_warn("%s: Out of memory !\n",
+			__func__);
+		return IODA_INVALID_PE;
+	}
+
+	/*
+	 * Figure out reserved PE numbers by the PE
+	 * the its child PEs.
+	 */
+	start = (r->start - phb->ioda.m64_base) / segsz;
+	for (i = 0; i < resource_size(r) / segsz; i++)
+		set_bit(start + i, pe_alloc);
+
+	if (all)
+		goto done;
+
+	/*
+	 * If the PE doesn't cover all subordinate buses,
+	 * we need subtract from reserved PEs for children.
+	 */
+	list_for_each_entry(pdev, &bus->devices, bus_list) {
+		if (!pdev->subordinate)
+			continue;
+
+		pci_bus_for_each_resource(pdev->subordinate, r, i) {
+			if (!r || !r->parent ||
+			    !pnv_pci_is_mem_pref_64(r->flags))
+				continue;
+
+			start = (r->start - phb->ioda.m64_base) / segsz;
+			for (j = 0; j < resource_size(r) / segsz ; j++)
+				clear_bit(start + j, pe_alloc);
+                }
+        }
+
+	/*
+	 * the current bus might not own M64 window and that's all
+	 * contributed by its child buses. For the case, we needn't
+	 * pick M64 dependent PE#.
+	 */
+	if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) {
+		kfree(pe_alloc);
+		return IODA_INVALID_PE;
+	}
+
+	/*
+	 * Figure out the master PE and put all slave PEs to master
+	 * PE's list to form compound PE.
+	 */
+done:
+	master_pe = NULL;
+	i = -1;
+	while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) <
+		phb->ioda.total_pe) {
+		pe = &phb->ioda.pe_array[i];
+
+		if (!master_pe) {
+			pe->flags |= PNV_IODA_PE_MASTER;
+			INIT_LIST_HEAD(&pe->slaves);
+			master_pe = pe;
+		} else {
+			pe->flags |= PNV_IODA_PE_SLAVE;
+			pe->master = master_pe;
+			list_add_tail(&pe->list, &master_pe->slaves);
+		}
+	}
+
+	kfree(pe_alloc);
+	return master_pe->pe_number;
+}
+
+static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
+{
+	struct pci_controller *hose = phb->hose;
+	struct device_node *dn = hose->dn;
+	struct resource *res;
+	const u32 *r;
+	u64 pci_addr;
+
+	/* FIXME: Support M64 for P7IOC */
+	if (phb->type != PNV_PHB_IODA2) {
+		pr_info("  Not support M64 window\n");
+		return;
+	}
+
+	if (!firmware_has_feature(FW_FEATURE_OPALv3)) {
+		pr_info("  Firmware too old to support M64 window\n");
+		return;
+	}
+
+	r = of_get_property(dn, "ibm,opal-m64-window", NULL);
+	if (!r) {
+		pr_info("  No <ibm,opal-m64-window> on %s\n",
+			dn->full_name);
+		return;
+	}
+
+	res = &hose->mem_resources[1];
+	res->start = of_translate_address(dn, r + 2);
+	res->end = res->start + of_read_number(r + 4, 2) - 1;
+	res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
+	pci_addr = of_read_number(r, 2);
+	hose->mem_offset[1] = res->start - pci_addr;
+
+	phb->ioda.m64_size = resource_size(res);
+	phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe;
+	phb->ioda.m64_base = pci_addr;
+
+	pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n",
+			res->start, res->end, pci_addr);
+
+	/* Use last M64 BAR to cover M64 window */
+	phb->ioda.m64_bar_idx = 15;
+	phb->init_m64 = pnv_ioda2_init_m64;
+	phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe;
+	phb->pick_m64_pe = pnv_ioda2_pick_m64_pe;
+}
+
+static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
+{
+	struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
+	struct pnv_ioda_pe *slave;
+	s64 rc;
+
+	/* Fetch master PE */
+	if (pe->flags & PNV_IODA_PE_SLAVE) {
+		pe = pe->master;
+		if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)))
+			return;
+
+		pe_no = pe->pe_number;
+	}
+
+	/* Freeze master PE */
+	rc = opal_pci_eeh_freeze_set(phb->opal_id,
+				     pe_no,
+				     OPAL_EEH_ACTION_SET_FREEZE_ALL);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
+			__func__, rc, phb->hose->global_number, pe_no);
+		return;
+	}
+
+	/* Freeze slave PEs */
+	if (!(pe->flags & PNV_IODA_PE_MASTER))
+		return;
+
+	list_for_each_entry(slave, &pe->slaves, list) {
+		rc = opal_pci_eeh_freeze_set(phb->opal_id,
+					     slave->pe_number,
+					     OPAL_EEH_ACTION_SET_FREEZE_ALL);
+		if (rc != OPAL_SUCCESS)
+			pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n",
+				__func__, rc, phb->hose->global_number,
+				slave->pe_number);
+	}
+}
+
+static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
+{
+	struct pnv_ioda_pe *pe, *slave;
+	s64 rc;
+
+	/* Find master PE */
+	pe = &phb->ioda.pe_array[pe_no];
+	if (pe->flags & PNV_IODA_PE_SLAVE) {
+		pe = pe->master;
+		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
+		pe_no = pe->pe_number;
+	}
+
+	/* Clear frozen state for master PE */
+	rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
+			__func__, rc, opt, phb->hose->global_number, pe_no);
+		return -EIO;
+	}
+
+	if (!(pe->flags & PNV_IODA_PE_MASTER))
+		return 0;
+
+	/* Clear frozen state for slave PEs */
+	list_for_each_entry(slave, &pe->slaves, list) {
+		rc = opal_pci_eeh_freeze_clear(phb->opal_id,
+					     slave->pe_number,
+					     opt);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n",
+				__func__, rc, opt, phb->hose->global_number,
+				slave->pe_number);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
+{
+	struct pnv_ioda_pe *slave, *pe;
+	u8 fstate, state;
+	__be16 pcierr;
+	s64 rc;
+
+	/* Sanity check on PE number */
+	if (pe_no < 0 || pe_no >= phb->ioda.total_pe)
+		return OPAL_EEH_STOPPED_PERM_UNAVAIL;
+
+	/*
+	 * Fetch the master PE and the PE instance might be
+	 * not initialized yet.
+	 */
+	pe = &phb->ioda.pe_array[pe_no];
+	if (pe->flags & PNV_IODA_PE_SLAVE) {
+		pe = pe->master;
+		WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER));
+		pe_no = pe->pe_number;
+	}
+
+	/* Check the master PE */
+	rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no,
+					&state, &pcierr, NULL);
+	if (rc != OPAL_SUCCESS) {
+		pr_warn("%s: Failure %lld getting "
+			"PHB#%x-PE#%x state\n",
+			__func__, rc,
+			phb->hose->global_number, pe_no);
+		return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
+	}
+
+	/* Check the slave PE */
+	if (!(pe->flags & PNV_IODA_PE_MASTER))
+		return state;
+
+	list_for_each_entry(slave, &pe->slaves, list) {
+		rc = opal_pci_eeh_freeze_status(phb->opal_id,
+						slave->pe_number,
+						&fstate,
+						&pcierr,
+						NULL);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("%s: Failure %lld getting "
+				"PHB#%x-PE#%x state\n",
+				__func__, rc,
+				phb->hose->global_number, slave->pe_number);
+			return OPAL_EEH_STOPPED_TEMP_UNAVAIL;
+		}
+
+		/*
+		 * Override the result based on the ascending
+		 * priority.
+		 */
+		if (fstate > state)
+			state = fstate;
+	}
+
+	return state;
+}
+
+/* Currently those 2 are only used when MSIs are enabled, this will change
+ * but in the meantime, we need to protect them to avoid warnings
+ */
+#ifdef CONFIG_PCI_MSI
+static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dn *pdn = pci_get_pdn(dev);
+
+	if (!pdn)
+		return NULL;
+	if (pdn->pe_number == IODA_INVALID_PE)
+		return NULL;
+	return &phb->ioda.pe_array[pdn->pe_number];
+}
+#endif /* CONFIG_PCI_MSI */
+
+static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
+				  struct pnv_ioda_pe *parent,
+				  struct pnv_ioda_pe *child,
+				  bool is_add)
+{
+	const char *desc = is_add ? "adding" : "removing";
+	uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
+			      OPAL_REMOVE_PE_FROM_DOMAIN;
+	struct pnv_ioda_pe *slave;
+	long rc;
+
+	/* Parent PE affects child PE */
+	rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
+				child->pe_number, op);
+	if (rc != OPAL_SUCCESS) {
+		pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
+			rc, desc);
+		return -ENXIO;
+	}
+
+	if (!(child->flags & PNV_IODA_PE_MASTER))
+		return 0;
+
+	/* Compound case: parent PE affects slave PEs */
+	list_for_each_entry(slave, &child->slaves, list) {
+		rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
+					slave->pe_number, op);
+		if (rc != OPAL_SUCCESS) {
+			pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
+				rc, desc);
+			return -ENXIO;
+		}
+	}
+
+	return 0;
+}
+
+static int pnv_ioda_set_peltv(struct pnv_phb *phb,
+			      struct pnv_ioda_pe *pe,
+			      bool is_add)
+{
+	struct pnv_ioda_pe *slave;
+	struct pci_dev *pdev = NULL;
+	int ret;
+
+	/*
+	 * Clear PE frozen state. If it's master PE, we need
+	 * clear slave PE frozen state as well.
+	 */
+	if (is_add) {
+		opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
+					  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+		if (pe->flags & PNV_IODA_PE_MASTER) {
+			list_for_each_entry(slave, &pe->slaves, list)
+				opal_pci_eeh_freeze_clear(phb->opal_id,
+							  slave->pe_number,
+							  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+		}
+	}
+
+	/*
+	 * Associate PE in PELT. We need add the PE into the
+	 * corresponding PELT-V as well. Otherwise, the error
+	 * originated from the PE might contribute to other
+	 * PEs.
+	 */
+	ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
+	if (ret)
+		return ret;
+
+	/* For compound PEs, any one affects all of them */
+	if (pe->flags & PNV_IODA_PE_MASTER) {
+		list_for_each_entry(slave, &pe->slaves, list) {
+			ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
+			if (ret)
+				return ret;
+		}
+	}
+
+	if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
+		pdev = pe->pbus->self;
+	else if (pe->flags & PNV_IODA_PE_DEV)
+		pdev = pe->pdev->bus->self;
+#ifdef CONFIG_PCI_IOV
+	else if (pe->flags & PNV_IODA_PE_VF)
+		pdev = pe->parent_dev->bus->self;
+#endif /* CONFIG_PCI_IOV */
+	while (pdev) {
+		struct pci_dn *pdn = pci_get_pdn(pdev);
+		struct pnv_ioda_pe *parent;
+
+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+			parent = &phb->ioda.pe_array[pdn->pe_number];
+			ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
+			if (ret)
+				return ret;
+		}
+
+		pdev = pdev->bus->self;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_PCI_IOV
+static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+{
+	struct pci_dev *parent;
+	uint8_t bcomp, dcomp, fcomp;
+	int64_t rc;
+	long rid_end, rid;
+
+	/* Currently, we just deconfigure VF PE. Bus PE will always there.*/
+	if (pe->pbus) {
+		int count;
+
+		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+		parent = pe->pbus->self;
+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
+			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+		else
+			count = 1;
+
+		switch(count) {
+		case  1: bcomp = OpalPciBusAll;         break;
+		case  2: bcomp = OpalPciBus7Bits;       break;
+		case  4: bcomp = OpalPciBus6Bits;       break;
+		case  8: bcomp = OpalPciBus5Bits;       break;
+		case 16: bcomp = OpalPciBus4Bits;       break;
+		case 32: bcomp = OpalPciBus3Bits;       break;
+		default:
+			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+			        count);
+			/* Do an exact match only */
+			bcomp = OpalPciBusAll;
+		}
+		rid_end = pe->rid + (count << 8);
+	} else {
+		if (pe->flags & PNV_IODA_PE_VF)
+			parent = pe->parent_dev;
+		else
+			parent = pe->pdev->bus->self;
+		bcomp = OpalPciBusAll;
+		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+		rid_end = pe->rid + 1;
+	}
+
+	/* Clear the reverse map */
+	for (rid = pe->rid; rid < rid_end; rid++)
+		phb->ioda.pe_rmap[rid] = 0;
+
+	/* Release from all parents PELT-V */
+	while (parent) {
+		struct pci_dn *pdn = pci_get_pdn(parent);
+		if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+			rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+						pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+			/* XXX What to do in case of error ? */
+		}
+		parent = parent->bus->self;
+	}
+
+	opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
+				  OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+	/* Disassociate PE in PELT */
+	rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
+				pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+	if (rc)
+		pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+			     bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
+	if (rc)
+		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+
+	pe->pbus = NULL;
+	pe->pdev = NULL;
+	pe->parent_dev = NULL;
+
+	return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
+static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+{
+	struct pci_dev *parent;
+	uint8_t bcomp, dcomp, fcomp;
+	long rc, rid_end, rid;
+
+	/* Bus validation ? */
+	if (pe->pbus) {
+		int count;
+
+		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
+		parent = pe->pbus->self;
+		if (pe->flags & PNV_IODA_PE_BUS_ALL)
+			count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+		else
+			count = 1;
+
+		switch(count) {
+		case  1: bcomp = OpalPciBusAll;		break;
+		case  2: bcomp = OpalPciBus7Bits;	break;
+		case  4: bcomp = OpalPciBus6Bits;	break;
+		case  8: bcomp = OpalPciBus5Bits;	break;
+		case 16: bcomp = OpalPciBus4Bits;	break;
+		case 32: bcomp = OpalPciBus3Bits;	break;
+		default:
+			dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n",
+			        count);
+			/* Do an exact match only */
+			bcomp = OpalPciBusAll;
+		}
+		rid_end = pe->rid + (count << 8);
+	} else {
+#ifdef CONFIG_PCI_IOV
+		if (pe->flags & PNV_IODA_PE_VF)
+			parent = pe->parent_dev;
+		else
+#endif /* CONFIG_PCI_IOV */
+			parent = pe->pdev->bus->self;
+		bcomp = OpalPciBusAll;
+		dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
+		fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
+		rid_end = pe->rid + 1;
+	}
+
+	/*
+	 * Associate PE in PELT. We need add the PE into the
+	 * corresponding PELT-V as well. Otherwise, the error
+	 * originated from the PE might contribute to other
+	 * PEs.
+	 */
+	rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
+			     bcomp, dcomp, fcomp, OPAL_MAP_PE);
+	if (rc) {
+		pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+		return -ENXIO;
+	}
+
+	/* Configure PELTV */
+	pnv_ioda_set_peltv(phb, pe, true);
+
+	/* Setup reverse map */
+	for (rid = pe->rid; rid < rid_end; rid++)
+		phb->ioda.pe_rmap[rid] = pe->pe_number;
+
+	/* Setup one MVTs on IODA1 */
+	if (phb->type != PNV_PHB_IODA1) {
+		pe->mve_number = 0;
+		goto out;
+	}
+
+	pe->mve_number = pe->pe_number;
+	rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
+	if (rc != OPAL_SUCCESS) {
+		pe_err(pe, "OPAL error %ld setting up MVE %d\n",
+		       rc, pe->mve_number);
+		pe->mve_number = -1;
+	} else {
+		rc = opal_pci_set_mve_enable(phb->opal_id,
+					     pe->mve_number, OPAL_ENABLE_MVE);
+		if (rc) {
+			pe_err(pe, "OPAL error %ld enabling MVE %d\n",
+			       rc, pe->mve_number);
+			pe->mve_number = -1;
+		}
+	}
+
+out:
+	return 0;
+}
+
+static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe)
+{
+	struct pnv_ioda_pe *lpe;
+
+	list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) {
+		if (lpe->dma_weight < pe->dma_weight) {
+			list_add_tail(&pe->dma_link, &lpe->dma_link);
+			return;
+		}
+	}
+	list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list);
+}
+
+static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev)
+{
+	/* This is quite simplistic. The "base" weight of a device
+	 * is 10. 0 means no DMA is to be accounted for it.
+	 */
+
+	/* If it's a bridge, no DMA */
+	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+		return 0;
+
+	/* Reduce the weight of slow USB controllers */
+	if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
+	    dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
+	    dev->class == PCI_CLASS_SERIAL_USB_EHCI)
+		return 3;
+
+	/* Increase the weight of RAID (includes Obsidian) */
+	if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
+		return 15;
+
+	/* Default */
+	return 10;
+}
+
+#ifdef CONFIG_PCI_IOV
+static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
+{
+	struct pci_dn *pdn = pci_get_pdn(dev);
+	int i;
+	struct resource *res, res2;
+	resource_size_t size;
+	u16 num_vfs;
+
+	if (!dev->is_physfn)
+		return -EINVAL;
+
+	/*
+	 * "offset" is in VFs.  The M64 windows are sized so that when they
+	 * are segmented, each segment is the same size as the IOV BAR.
+	 * Each segment is in a separate PE, and the high order bits of the
+	 * address are the PE number.  Therefore, each VF's BAR is in a
+	 * separate PE, and changing the IOV BAR start address changes the
+	 * range of PEs the VFs are in.
+	 */
+	num_vfs = pdn->num_vfs;
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &dev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		if (!pnv_pci_is_mem_pref_64(res->flags))
+			continue;
+
+		/*
+		 * The actual IOV BAR range is determined by the start address
+		 * and the actual size for num_vfs VFs BAR.  This check is to
+		 * make sure that after shifting, the range will not overlap
+		 * with another device.
+		 */
+		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		res2.flags = res->flags;
+		res2.start = res->start + (size * offset);
+		res2.end = res2.start + (size * num_vfs) - 1;
+
+		if (res2.end > res->end) {
+			dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
+				i, &res2, res, num_vfs, offset);
+			return -EBUSY;
+		}
+	}
+
+	/*
+	 * After doing so, there would be a "hole" in the /proc/iomem when
+	 * offset is a positive value. It looks like the device return some
+	 * mmio back to the system, which actually no one could use it.
+	 */
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &dev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		if (!pnv_pci_is_mem_pref_64(res->flags))
+			continue;
+
+		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+		res2 = *res;
+		res->start += size * offset;
+
+		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n",
+			 i, &res2, res, num_vfs, offset);
+		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+	}
+	return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
+#if 0
+static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dn *pdn = pci_get_pdn(dev);
+	struct pnv_ioda_pe *pe;
+	int pe_num;
+
+	if (!pdn) {
+		pr_err("%s: Device tree node not associated properly\n",
+			   pci_name(dev));
+		return NULL;
+	}
+	if (pdn->pe_number != IODA_INVALID_PE)
+		return NULL;
+
+	/* PE#0 has been pre-set */
+	if (dev->bus->number == 0)
+		pe_num = 0;
+	else
+		pe_num = pnv_ioda_alloc_pe(phb);
+	if (pe_num == IODA_INVALID_PE) {
+		pr_warning("%s: Not enough PE# available, disabling device\n",
+			   pci_name(dev));
+		return NULL;
+	}
+
+	/* NOTE: We get only one ref to the pci_dev for the pdn, not for the
+	 * pointer in the PE data structure, both should be destroyed at the
+	 * same time. However, this needs to be looked at more closely again
+	 * once we actually start removing things (Hotplug, SR-IOV, ...)
+	 *
+	 * At some point we want to remove the PDN completely anyways
+	 */
+	pe = &phb->ioda.pe_array[pe_num];
+	pci_dev_get(dev);
+	pdn->pcidev = dev;
+	pdn->pe_number = pe_num;
+	pe->pdev = dev;
+	pe->pbus = NULL;
+	pe->tce32_seg = -1;
+	pe->mve_number = -1;
+	pe->rid = dev->bus->number << 8 | pdn->devfn;
+
+	pe_info(pe, "Associated device to PE\n");
+
+	if (pnv_ioda_configure_pe(phb, pe)) {
+		/* XXX What do we do here ? */
+		if (pe_num)
+			pnv_ioda_free_pe(phb, pe_num);
+		pdn->pe_number = IODA_INVALID_PE;
+		pe->pdev = NULL;
+		pci_dev_put(dev);
+		return NULL;
+	}
+
+	/* Assign a DMA weight to the device */
+	pe->dma_weight = pnv_ioda_dma_weight(dev);
+	if (pe->dma_weight != 0) {
+		phb->ioda.dma_weight += pe->dma_weight;
+		phb->ioda.dma_pe_count++;
+	}
+
+	/* Link the PE */
+	pnv_ioda_link_pe_by_weight(phb, pe);
+
+	return pe;
+}
+#endif /* Useful for SRIOV case */
+
+static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		struct pci_dn *pdn = pci_get_pdn(dev);
+
+		if (pdn == NULL) {
+			pr_warn("%s: No device node associated with device !\n",
+				pci_name(dev));
+			continue;
+		}
+		pdn->pe_number = pe->pe_number;
+		pe->dma_weight += pnv_ioda_dma_weight(dev);
+		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+			pnv_ioda_setup_same_PE(dev->subordinate, pe);
+	}
+}
+
+/*
+ * There're 2 types of PCI bus sensitive PEs: One that is compromised of
+ * single PCI bus. Another one that contains the primary PCI bus and its
+ * subordinate PCI devices and buses. The second type of PE is normally
+ * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports.
+ */
+static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
+{
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pe;
+	int pe_num = IODA_INVALID_PE;
+
+	/* Check if PE is determined by M64 */
+	if (phb->pick_m64_pe)
+		pe_num = phb->pick_m64_pe(phb, bus, all);
+
+	/* The PE number isn't pinned by M64 */
+	if (pe_num == IODA_INVALID_PE)
+		pe_num = pnv_ioda_alloc_pe(phb);
+
+	if (pe_num == IODA_INVALID_PE) {
+		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
+			__func__, pci_domain_nr(bus), bus->number);
+		return;
+	}
+
+	pe = &phb->ioda.pe_array[pe_num];
+	pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
+	pe->pbus = bus;
+	pe->pdev = NULL;
+	pe->tce32_seg = -1;
+	pe->mve_number = -1;
+	pe->rid = bus->busn_res.start << 8;
+	pe->dma_weight = 0;
+
+	if (all)
+		pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
+			bus->busn_res.start, bus->busn_res.end, pe_num);
+	else
+		pe_info(pe, "Secondary bus %d associated with PE#%d\n",
+			bus->busn_res.start, pe_num);
+
+	if (pnv_ioda_configure_pe(phb, pe)) {
+		/* XXX What do we do here ? */
+		if (pe_num)
+			pnv_ioda_free_pe(phb, pe_num);
+		pe->pbus = NULL;
+		return;
+	}
+
+	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
+			GFP_KERNEL, hose->node);
+	pe->tce32_table->data = pe;
+
+	/* Associate it with all child devices */
+	pnv_ioda_setup_same_PE(bus, pe);
+
+	/* Put PE to the list */
+	list_add_tail(&pe->list, &phb->ioda.pe_list);
+
+	/* Account for one DMA PE if at least one DMA capable device exist
+	 * below the bridge
+	 */
+	if (pe->dma_weight != 0) {
+		phb->ioda.dma_weight += pe->dma_weight;
+		phb->ioda.dma_pe_count++;
+	}
+
+	/* Link the PE */
+	pnv_ioda_link_pe_by_weight(phb, pe);
+}
+
+static void pnv_ioda_setup_PEs(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	pnv_ioda_setup_bus_PE(bus, 0);
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		if (dev->subordinate) {
+			if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE)
+				pnv_ioda_setup_bus_PE(dev->subordinate, 1);
+			else
+				pnv_ioda_setup_PEs(dev->subordinate);
+		}
+	}
+}
+
+/*
+ * Configure PEs so that the downstream PCI buses and devices
+ * could have their associated PE#. Unfortunately, we didn't
+ * figure out the way to identify the PLX bridge yet. So we
+ * simply put the PCI bus and the subordinate behind the root
+ * port to PE# here. The game rule here is expected to be changed
+ * as soon as we can detected PLX bridge correctly.
+ */
+static void pnv_pci_ioda_setup_PEs(void)
+{
+	struct pci_controller *hose, *tmp;
+	struct pnv_phb *phb;
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		/* M64 layout might affect PE allocation */
+		if (phb->reserve_m64_pe)
+			phb->reserve_m64_pe(phb);
+
+		pnv_ioda_setup_PEs(hose->bus);
+	}
+}
+
+#ifdef CONFIG_PCI_IOV
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	int                    i, j;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+		for (j = 0; j < M64_PER_IOV; j++) {
+			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
+				continue;
+			opal_pci_phb_mmio_enable(phb->opal_id,
+				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
+			clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
+			pdn->m64_wins[i][j] = IODA_INVALID_M64;
+		}
+
+	return 0;
+}
+
+static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	unsigned int           win;
+	struct resource       *res;
+	int                    i, j;
+	int64_t                rc;
+	int                    total_vfs;
+	resource_size_t        size, start;
+	int                    pe_num;
+	int                    vf_groups;
+	int                    vf_per_group;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+	total_vfs = pci_sriov_get_totalvfs(pdev);
+
+	/* Initialize the m64_wins to IODA_INVALID_M64 */
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+		for (j = 0; j < M64_PER_IOV; j++)
+			pdn->m64_wins[i][j] = IODA_INVALID_M64;
+
+	if (pdn->m64_per_iov == M64_PER_IOV) {
+		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
+		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
+			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
+	} else {
+		vf_groups = 1;
+		vf_per_group = 1;
+	}
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || !res->parent)
+			continue;
+
+		if (!pnv_pci_is_mem_pref_64(res->flags))
+			continue;
+
+		for (j = 0; j < vf_groups; j++) {
+			do {
+				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+						phb->ioda.m64_bar_idx + 1, 0);
+
+				if (win >= phb->ioda.m64_bar_idx + 1)
+					goto m64_failed;
+			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+			pdn->m64_wins[i][j] = win;
+
+			if (pdn->m64_per_iov == M64_PER_IOV) {
+				size = pci_iov_resource_size(pdev,
+							PCI_IOV_RESOURCES + i);
+				size = size * vf_per_group;
+				start = res->start + size * j;
+			} else {
+				size = resource_size(res);
+				start = res->start;
+			}
+
+			/* Map the M64 here */
+			if (pdn->m64_per_iov == M64_PER_IOV) {
+				pe_num = pdn->offset + j;
+				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+						pe_num, OPAL_M64_WINDOW_TYPE,
+						pdn->m64_wins[i][j], 0);
+			}
+
+			rc = opal_pci_set_phb_mem_window(phb->opal_id,
+						 OPAL_M64_WINDOW_TYPE,
+						 pdn->m64_wins[i][j],
+						 start,
+						 0, /* unused */
+						 size);
+
+
+			if (rc != OPAL_SUCCESS) {
+				dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
+					win, rc);
+				goto m64_failed;
+			}
+
+			if (pdn->m64_per_iov == M64_PER_IOV)
+				rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
+			else
+				rc = opal_pci_phb_mmio_enable(phb->opal_id,
+				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
+
+			if (rc != OPAL_SUCCESS) {
+				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
+					win, rc);
+				goto m64_failed;
+			}
+		}
+	}
+	return 0;
+
+m64_failed:
+	pnv_pci_vf_release_m64(pdev);
+	return -EBUSY;
+}
+
+static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct iommu_table    *tbl;
+	unsigned long         addr;
+	int64_t               rc;
+
+	bus = dev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	tbl = pe->tce32_table;
+	addr = tbl->it_base;
+
+	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+				   pe->pe_number << 1, 1, __pa(addr),
+				   0, 0x1000);
+
+	rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+				        pe->pe_number,
+				        (pe->pe_number << 1) + 1,
+				        pe->tce_bypass_base,
+				        0);
+	if (rc)
+		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
+
+	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
+	free_pages(addr, get_order(TCE32_TABLE_SIZE));
+	pe->tce32_table = NULL;
+}
+
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe, *pe_n;
+	struct pci_dn         *pdn;
+	u16                    vf_index;
+	int64_t                rc;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	if (!pdev->is_physfn)
+		return;
+
+	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
+		int   vf_group;
+		int   vf_per_group;
+		int   vf_index1;
+
+		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
+
+		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
+			for (vf_index = vf_group * vf_per_group;
+				vf_index < (vf_group + 1) * vf_per_group &&
+				vf_index < num_vfs;
+				vf_index++)
+				for (vf_index1 = vf_group * vf_per_group;
+					vf_index1 < (vf_group + 1) * vf_per_group &&
+					vf_index1 < num_vfs;
+					vf_index1++){
+
+					rc = opal_pci_set_peltv(phb->opal_id,
+						pdn->offset + vf_index,
+						pdn->offset + vf_index1,
+						OPAL_REMOVE_PE_FROM_DOMAIN);
+
+					if (rc)
+					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
+						__func__,
+						pdn->offset + vf_index1, rc);
+				}
+	}
+
+	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
+		if (pe->parent_dev != pdev)
+			continue;
+
+		pnv_pci_ioda2_release_dma_pe(pdev, pe);
+
+		/* Remove from list */
+		mutex_lock(&phb->ioda.pe_list_mutex);
+		list_del(&pe->list);
+		mutex_unlock(&phb->ioda.pe_list_mutex);
+
+		pnv_ioda_deconfigure_pe(phb, pe);
+
+		pnv_ioda_free_pe(phb, pe->pe_number);
+	}
+}
+
+void pnv_pci_sriov_disable(struct pci_dev *pdev)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	struct pci_sriov      *iov;
+	u16 num_vfs;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+	iov = pdev->sriov;
+	num_vfs = pdn->num_vfs;
+
+	/* Release VF PEs */
+	pnv_ioda_release_vf_PE(pdev, num_vfs);
+
+	if (phb->type == PNV_PHB_IODA2) {
+		if (pdn->m64_per_iov == 1)
+			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
+
+		/* Release M64 windows */
+		pnv_pci_vf_release_m64(pdev);
+
+		/* Release PE numbers */
+		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
+		pdn->offset = 0;
+	}
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe);
+static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pnv_ioda_pe    *pe;
+	int                    pe_num;
+	u16                    vf_index;
+	struct pci_dn         *pdn;
+	int64_t                rc;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	if (!pdev->is_physfn)
+		return;
+
+	/* Reserve PE for each VF */
+	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+		pe_num = pdn->offset + vf_index;
+
+		pe = &phb->ioda.pe_array[pe_num];
+		pe->pe_number = pe_num;
+		pe->phb = phb;
+		pe->flags = PNV_IODA_PE_VF;
+		pe->pbus = NULL;
+		pe->parent_dev = pdev;
+		pe->tce32_seg = -1;
+		pe->mve_number = -1;
+		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
+			   pci_iov_virtfn_devfn(pdev, vf_index);
+
+		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n",
+			hose->global_number, pdev->bus->number,
+			PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
+			PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
+
+		if (pnv_ioda_configure_pe(phb, pe)) {
+			/* XXX What do we do here ? */
+			if (pe_num)
+				pnv_ioda_free_pe(phb, pe_num);
+			pe->pdev = NULL;
+			continue;
+		}
+
+		pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
+				GFP_KERNEL, hose->node);
+		pe->tce32_table->data = pe;
+
+		/* Put PE to the list */
+		mutex_lock(&phb->ioda.pe_list_mutex);
+		list_add_tail(&pe->list, &phb->ioda.pe_list);
+		mutex_unlock(&phb->ioda.pe_list_mutex);
+
+		pnv_pci_ioda2_setup_dma_pe(phb, pe);
+	}
+
+	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
+		int   vf_group;
+		int   vf_per_group;
+		int   vf_index1;
+
+		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
+
+		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
+			for (vf_index = vf_group * vf_per_group;
+			     vf_index < (vf_group + 1) * vf_per_group &&
+			     vf_index < num_vfs;
+			     vf_index++) {
+				for (vf_index1 = vf_group * vf_per_group;
+				     vf_index1 < (vf_group + 1) * vf_per_group &&
+				     vf_index1 < num_vfs;
+				     vf_index1++) {
+
+					rc = opal_pci_set_peltv(phb->opal_id,
+						pdn->offset + vf_index,
+						pdn->offset + vf_index1,
+						OPAL_ADD_PE_TO_DOMAIN);
+
+					if (rc)
+					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
+						__func__,
+						pdn->offset + vf_index1, rc);
+				}
+			}
+		}
+	}
+}
+
+int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+	struct pci_bus        *bus;
+	struct pci_controller *hose;
+	struct pnv_phb        *phb;
+	struct pci_dn         *pdn;
+	int                    ret;
+
+	bus = pdev->bus;
+	hose = pci_bus_to_host(bus);
+	phb = hose->private_data;
+	pdn = pci_get_pdn(pdev);
+
+	if (phb->type == PNV_PHB_IODA2) {
+		/* Calculate available PE for required VFs */
+		mutex_lock(&phb->ioda.pe_alloc_mutex);
+		pdn->offset = bitmap_find_next_zero_area(
+			phb->ioda.pe_alloc, phb->ioda.total_pe,
+			0, num_vfs, 0);
+		if (pdn->offset >= phb->ioda.total_pe) {
+			mutex_unlock(&phb->ioda.pe_alloc_mutex);
+			dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
+			pdn->offset = 0;
+			return -EBUSY;
+		}
+		bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
+		pdn->num_vfs = num_vfs;
+		mutex_unlock(&phb->ioda.pe_alloc_mutex);
+
+		/* Assign M64 window accordingly */
+		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
+		if (ret) {
+			dev_info(&pdev->dev, "Not enough M64 window resources\n");
+			goto m64_failed;
+		}
+
+		/*
+		 * When using one M64 BAR to map one IOV BAR, we need to shift
+		 * the IOV BAR according to the PE# allocated to the VFs.
+		 * Otherwise, the PE# for the VF will conflict with others.
+		 */
+		if (pdn->m64_per_iov == 1) {
+			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
+			if (ret)
+				goto m64_failed;
+		}
+	}
+
+	/* Setup VF PEs */
+	pnv_ioda_setup_vf_PE(pdev, num_vfs);
+
+	return 0;
+
+m64_failed:
+	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
+	pdn->offset = 0;
+
+	return ret;
+}
+
+int pcibios_sriov_disable(struct pci_dev *pdev)
+{
+	pnv_pci_sriov_disable(pdev);
+
+	/* Release PCI data */
+	remove_dev_pci_data(pdev);
+	return 0;
+}
+
+int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+	/* Allocate PCI data */
+	add_dev_pci_data(pdev);
+
+	pnv_pci_sriov_enable(pdev, num_vfs);
+	return 0;
+}
+#endif /* CONFIG_PCI_IOV */
+
+static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
+{
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	struct pnv_ioda_pe *pe;
+
+	/*
+	 * The function can be called while the PE#
+	 * hasn't been assigned. Do nothing for the
+	 * case.
+	 */
+	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+		return;
+
+	pe = &phb->ioda.pe_array[pdn->pe_number];
+	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
+	set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
+}
+
+static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
+				     struct pci_dev *pdev, u64 dma_mask)
+{
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	struct pnv_ioda_pe *pe;
+	uint64_t top;
+	bool bypass = false;
+
+	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+		return -ENODEV;;
+
+	pe = &phb->ioda.pe_array[pdn->pe_number];
+	if (pe->tce_bypass_enabled) {
+		top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+		bypass = (dma_mask >= top);
+	}
+
+	if (bypass) {
+		dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
+		set_dma_ops(&pdev->dev, &dma_direct_ops);
+		set_dma_offset(&pdev->dev, pe->tce_bypass_base);
+	} else {
+		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
+		set_dma_ops(&pdev->dev, &dma_iommu_ops);
+		set_iommu_table_base(&pdev->dev, pe->tce32_table);
+	}
+	*pdev->dev.dma_mask = dma_mask;
+	return 0;
+}
+
+static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb,
+					      struct pci_dev *pdev)
+{
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	struct pnv_ioda_pe *pe;
+	u64 end, mask;
+
+	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+		return 0;
+
+	pe = &phb->ioda.pe_array[pdn->pe_number];
+	if (!pe->tce_bypass_enabled)
+		return __dma_get_required_mask(&pdev->dev);
+
+
+	end = pe->tce_bypass_base + memblock_end_of_DRAM();
+	mask = 1ULL << (fls64(end) - 1);
+	mask += mask - 1;
+
+	return mask;
+}
+
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
+				   struct pci_bus *bus,
+				   bool add_to_iommu_group)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		if (add_to_iommu_group)
+			set_iommu_table_base_and_group(&dev->dev,
+						       pe->tce32_table);
+		else
+			set_iommu_table_base(&dev->dev, pe->tce32_table);
+
+		if (dev->subordinate)
+			pnv_ioda_setup_bus_dma(pe, dev->subordinate,
+					       add_to_iommu_group);
+	}
+}
+
+static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
+					 struct iommu_table *tbl,
+					 __be64 *startp, __be64 *endp, bool rm)
+{
+	__be64 __iomem *invalidate = rm ?
+		(__be64 __iomem *)pe->tce_inval_reg_phys :
+		(__be64 __iomem *)tbl->it_index;
+	unsigned long start, end, inc;
+	const unsigned shift = tbl->it_page_shift;
+
+	start = __pa(startp);
+	end = __pa(endp);
+
+	/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
+	if (tbl->it_busno) {
+		start <<= shift;
+		end <<= shift;
+		inc = 128ull << shift;
+		start |= tbl->it_busno;
+		end |= tbl->it_busno;
+	} else if (tbl->it_type & TCE_PCI_SWINV_PAIR) {
+		/* p7ioc-style invalidation, 2 TCEs per write */
+		start |= (1ull << 63);
+		end |= (1ull << 63);
+		inc = 16;
+        } else {
+		/* Default (older HW) */
+                inc = 128;
+	}
+
+        end |= inc - 1;	/* round up end to be different than start */
+
+        mb(); /* Ensure above stores are visible */
+        while (start <= end) {
+		if (rm)
+			__raw_rm_writeq(cpu_to_be64(start), invalidate);
+		else
+			__raw_writeq(cpu_to_be64(start), invalidate);
+                start += inc;
+        }
+
+	/*
+	 * The iommu layer will do another mb() for us on build()
+	 * and we don't care on free()
+	 */
+}
+
+static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
+					 struct iommu_table *tbl,
+					 __be64 *startp, __be64 *endp, bool rm)
+{
+	unsigned long start, end, inc;
+	__be64 __iomem *invalidate = rm ?
+		(__be64 __iomem *)pe->tce_inval_reg_phys :
+		(__be64 __iomem *)tbl->it_index;
+	const unsigned shift = tbl->it_page_shift;
+
+	/* We'll invalidate DMA address in PE scope */
+	start = 0x2ull << 60;
+	start |= (pe->pe_number & 0xFF);
+	end = start;
+
+	/* Figure out the start, end and step */
+	inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
+	start |= (inc << shift);
+	inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
+	end |= (inc << shift);
+	inc = (0x1ull << shift);
+	mb();
+
+	while (start <= end) {
+		if (rm)
+			__raw_rm_writeq(cpu_to_be64(start), invalidate);
+		else
+			__raw_writeq(cpu_to_be64(start), invalidate);
+		start += inc;
+	}
+}
+
+void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
+				 __be64 *startp, __be64 *endp, bool rm)
+{
+	struct pnv_ioda_pe *pe = tbl->data;
+	struct pnv_phb *phb = pe->phb;
+
+	if (phb->type == PNV_PHB_IODA1)
+		pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm);
+	else
+		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
+}
+
+static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
+				      struct pnv_ioda_pe *pe, unsigned int base,
+				      unsigned int segs)
+{
+
+	struct page *tce_mem = NULL;
+	const __be64 *swinvp;
+	struct iommu_table *tbl;
+	unsigned int i;
+	int64_t rc;
+	void *addr;
+
+	/* XXX FIXME: Handle 64-bit only DMA devices */
+	/* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
+	/* XXX FIXME: Allocate multi-level tables on PHB3 */
+
+	/* We shouldn't already have a 32-bit DMA associated */
+	if (WARN_ON(pe->tce32_seg >= 0))
+		return;
+
+	/* Grab a 32-bit TCE table */
+	pe->tce32_seg = base;
+	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
+		(base << 28), ((base + segs) << 28) - 1);
+
+	/* XXX Currently, we allocate one big contiguous table for the
+	 * TCEs. We only really need one chunk per 256M of TCE space
+	 * (ie per segment) but that's an optimization for later, it
+	 * requires some added smarts with our get/put_tce implementation
+	 */
+	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
+				   get_order(TCE32_TABLE_SIZE * segs));
+	if (!tce_mem) {
+		pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
+		goto fail;
+	}
+	addr = page_address(tce_mem);
+	memset(addr, 0, TCE32_TABLE_SIZE * segs);
+
+	/* Configure HW */
+	for (i = 0; i < segs; i++) {
+		rc = opal_pci_map_pe_dma_window(phb->opal_id,
+					      pe->pe_number,
+					      base + i, 1,
+					      __pa(addr) + TCE32_TABLE_SIZE * i,
+					      TCE32_TABLE_SIZE, 0x1000);
+		if (rc) {
+			pe_err(pe, " Failed to configure 32-bit TCE table,"
+			       " err %ld\n", rc);
+			goto fail;
+		}
+	}
+
+	/* Setup linux iommu table */
+	tbl = pe->tce32_table;
+	pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
+				  base << 28, IOMMU_PAGE_SHIFT_4K);
+
+	/* OPAL variant of P7IOC SW invalidated TCEs */
+	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+	if (swinvp) {
+		/* We need a couple more fields -- an address and a data
+		 * to or.  Since the bus is only printed out on table free
+		 * errors, and on the first pass the data will be a relative
+		 * bus number, print that out instead.
+		 */
+		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
+		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
+				8);
+		tbl->it_type |= (TCE_PCI_SWINV_CREATE |
+				 TCE_PCI_SWINV_FREE   |
+				 TCE_PCI_SWINV_PAIR);
+	}
+	iommu_init_table(tbl, phb->hose->node);
+
+	if (pe->flags & PNV_IODA_PE_DEV) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
+		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
+		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+	} else if (pe->flags & PNV_IODA_PE_VF) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
+	}
+
+	return;
+ fail:
+	/* XXX Failure: Try to fallback to 64-bit only ? */
+	if (pe->tce32_seg >= 0)
+		pe->tce32_seg = -1;
+	if (tce_mem)
+		__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
+}
+
+static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
+{
+	struct pnv_ioda_pe *pe = tbl->data;
+	uint16_t window_id = (pe->pe_number << 1 ) + 1;
+	int64_t rc;
+
+	pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
+	if (enable) {
+		phys_addr_t top = memblock_end_of_DRAM();
+
+		top = roundup_pow_of_two(top);
+		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+						     pe->pe_number,
+						     window_id,
+						     pe->tce_bypass_base,
+						     top);
+	} else {
+		rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+						     pe->pe_number,
+						     window_id,
+						     pe->tce_bypass_base,
+						     0);
+
+		/*
+		 * EEH needs the mapping between IOMMU table and group
+		 * of those VFIO/KVM pass-through devices. We can postpone
+		 * resetting DMA ops until the DMA mask is configured in
+		 * host side.
+		 */
+		if (pe->pdev)
+			set_iommu_table_base(&pe->pdev->dev, tbl);
+		else
+			pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
+	}
+	if (rc)
+		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
+	else
+		pe->tce_bypass_enabled = enable;
+}
+
+static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
+					  struct pnv_ioda_pe *pe)
+{
+	/* TVE #1 is selected by PCI address bit 59 */
+	pe->tce_bypass_base = 1ull << 59;
+
+	/* Install set_bypass callback for VFIO */
+	pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass;
+
+	/* Enable bypass by default */
+	pnv_pci_ioda2_set_bypass(pe->tce32_table, true);
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe)
+{
+	struct page *tce_mem = NULL;
+	void *addr;
+	const __be64 *swinvp;
+	struct iommu_table *tbl;
+	unsigned int tce_table_size, end;
+	int64_t rc;
+
+	/* We shouldn't already have a 32-bit DMA associated */
+	if (WARN_ON(pe->tce32_seg >= 0))
+		return;
+
+	/* The PE will reserve all possible 32-bits space */
+	pe->tce32_seg = 0;
+	end = (1 << ilog2(phb->ioda.m32_pci_base));
+	tce_table_size = (end / 0x1000) * 8;
+	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
+		end);
+
+	/* Allocate TCE table */
+	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
+				   get_order(tce_table_size));
+	if (!tce_mem) {
+		pe_err(pe, "Failed to allocate a 32-bit TCE memory\n");
+		goto fail;
+	}
+	addr = page_address(tce_mem);
+	memset(addr, 0, tce_table_size);
+
+	/*
+	 * Map TCE table through TVT. The TVE index is the PE number
+	 * shifted by 1 bit for 32-bits DMA space.
+	 */
+	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+					pe->pe_number << 1, 1, __pa(addr),
+					tce_table_size, 0x1000);
+	if (rc) {
+		pe_err(pe, "Failed to configure 32-bit TCE table,"
+		       " err %ld\n", rc);
+		goto fail;
+	}
+
+	/* Setup linux iommu table */
+	tbl = pe->tce32_table;
+	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
+			IOMMU_PAGE_SHIFT_4K);
+
+	/* OPAL variant of PHB3 invalidated TCEs */
+	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+	if (swinvp) {
+		/* We need a couple more fields -- an address and a data
+		 * to or.  Since the bus is only printed out on table free
+		 * errors, and on the first pass the data will be a relative
+		 * bus number, print that out instead.
+		 */
+		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
+		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
+				8);
+		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+	}
+	iommu_init_table(tbl, phb->hose->node);
+
+	if (pe->flags & PNV_IODA_PE_DEV) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
+		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
+		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+	} else if (pe->flags & PNV_IODA_PE_VF) {
+		iommu_register_group(tbl, phb->hose->global_number,
+				     pe->pe_number);
+	}
+
+	/* Also create a bypass window */
+	if (!pnv_iommu_bypass_disabled)
+		pnv_pci_ioda2_setup_bypass_pe(phb, pe);
+
+	return;
+fail:
+	if (pe->tce32_seg >= 0)
+		pe->tce32_seg = -1;
+	if (tce_mem)
+		__free_pages(tce_mem, get_order(tce_table_size));
+}
+
+static void pnv_ioda_setup_dma(struct pnv_phb *phb)
+{
+	struct pci_controller *hose = phb->hose;
+	unsigned int residual, remaining, segs, tw, base;
+	struct pnv_ioda_pe *pe;
+
+	/* If we have more PE# than segments available, hand out one
+	 * per PE until we run out and let the rest fail. If not,
+	 * then we assign at least one segment per PE, plus more based
+	 * on the amount of devices under that PE
+	 */
+	if (phb->ioda.dma_pe_count > phb->ioda.tce32_count)
+		residual = 0;
+	else
+		residual = phb->ioda.tce32_count -
+			phb->ioda.dma_pe_count;
+
+	pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n",
+		hose->global_number, phb->ioda.tce32_count);
+	pr_info("PCI: %d PE# for a total weight of %d\n",
+		phb->ioda.dma_pe_count, phb->ioda.dma_weight);
+
+	/* Walk our PE list and configure their DMA segments, hand them
+	 * out one base segment plus any residual segments based on
+	 * weight
+	 */
+	remaining = phb->ioda.tce32_count;
+	tw = phb->ioda.dma_weight;
+	base = 0;
+	list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
+		if (!pe->dma_weight)
+			continue;
+		if (!remaining) {
+			pe_warn(pe, "No DMA32 resources available\n");
+			continue;
+		}
+		segs = 1;
+		if (residual) {
+			segs += ((pe->dma_weight * residual)  + (tw / 2)) / tw;
+			if (segs > remaining)
+				segs = remaining;
+		}
+
+		/*
+		 * For IODA2 compliant PHB3, we needn't care about the weight.
+		 * The all available 32-bits DMA space will be assigned to
+		 * the specific PE.
+		 */
+		if (phb->type == PNV_PHB_IODA1) {
+			pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
+				pe->dma_weight, segs);
+			pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
+		} else {
+			pe_info(pe, "Assign DMA32 space\n");
+			segs = 0;
+			pnv_pci_ioda2_setup_dma_pe(phb, pe);
+		}
+
+		remaining -= segs;
+		base += segs;
+	}
+}
+
+#ifdef CONFIG_PCI_MSI
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+	struct irq_chip *chip = irq_data_get_irq_chip(d);
+	struct pnv_phb *phb = container_of(chip, struct pnv_phb,
+					   ioda.irq_chip);
+	int64_t rc;
+
+	rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+	WARN_ON_ONCE(rc);
+
+	icp_native_eoi(d);
+}
+
+
+static void set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
+{
+	struct irq_data *idata;
+	struct irq_chip *ichip;
+
+	if (phb->type != PNV_PHB_IODA2)
+		return;
+
+	if (!phb->ioda.irq_chip_init) {
+		/*
+		 * First time we setup an MSI IRQ, we need to setup the
+		 * corresponding IRQ chip to route correctly.
+		 */
+		idata = irq_get_irq_data(virq);
+		ichip = irq_data_get_irq_chip(idata);
+		phb->ioda.irq_chip_init = 1;
+		phb->ioda.irq_chip = *ichip;
+		phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
+	}
+	irq_set_chip(virq, &phb->ioda.irq_chip);
+}
+
+#ifdef CONFIG_CXL_BASE
+
+struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+
+	return of_node_get(hose->dn);
+}
+EXPORT_SYMBOL(pnv_pci_get_phb_node);
+
+int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pe;
+	int rc;
+
+	pe = pnv_ioda_get_pe(dev);
+	if (!pe)
+		return -ENODEV;
+
+	pe_info(pe, "Switching PHB to CXL\n");
+
+	rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number);
+	if (rc)
+		dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc);
+
+	return rc;
+}
+EXPORT_SYMBOL(pnv_phb_to_cxl_mode);
+
+/* Find PHB for cxl dev and allocate MSI hwirqs?
+ * Returns the absolute hardware IRQ number
+ */
+int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num);
+
+	if (hwirq < 0) {
+		dev_warn(&dev->dev, "Failed to find a free MSI\n");
+		return -ENOSPC;
+	}
+
+	return phb->msi_base + hwirq;
+}
+EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs);
+
+void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+
+	msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num);
+}
+EXPORT_SYMBOL(pnv_cxl_release_hwirqs);
+
+void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs,
+				  struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	int i, hwirq;
+
+	for (i = 1; i < CXL_IRQ_RANGES; i++) {
+		if (!irqs->range[i])
+			continue;
+		pr_devel("cxl release irq range 0x%x: offset: 0x%lx  limit: %ld\n",
+			 i, irqs->offset[i],
+			 irqs->range[i]);
+		hwirq = irqs->offset[i] - phb->msi_base;
+		msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq,
+				       irqs->range[i]);
+	}
+}
+EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges);
+
+int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
+			       struct pci_dev *dev, int num)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	int i, hwirq, try;
+
+	memset(irqs, 0, sizeof(struct cxl_irq_ranges));
+
+	/* 0 is reserved for the multiplexed PSL DSI interrupt */
+	for (i = 1; i < CXL_IRQ_RANGES && num; i++) {
+		try = num;
+		while (try) {
+			hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try);
+			if (hwirq >= 0)
+				break;
+			try /= 2;
+		}
+		if (!try)
+			goto fail;
+
+		irqs->offset[i] = phb->msi_base + hwirq;
+		irqs->range[i] = try;
+		pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx  limit: %li\n",
+			 i, irqs->offset[i], irqs->range[i]);
+		num -= try;
+	}
+	if (num)
+		goto fail;
+
+	return 0;
+fail:
+	pnv_cxl_release_hwirq_ranges(irqs, dev);
+	return -ENOSPC;
+}
+EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges);
+
+int pnv_cxl_get_irq_count(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+
+	return phb->msi_bmp.irq_count;
+}
+EXPORT_SYMBOL(pnv_cxl_get_irq_count);
+
+int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
+			   unsigned int virq)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	unsigned int xive_num = hwirq - phb->msi_base;
+	struct pnv_ioda_pe *pe;
+	int rc;
+
+	if (!(pe = pnv_ioda_get_pe(dev)))
+		return -ENODEV;
+
+	/* Assign XIVE to PE */
+	rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
+	if (rc) {
+		pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x "
+			"hwirq 0x%x XIVE 0x%x PE\n",
+			pci_name(dev), rc, phb->msi_base, hwirq, xive_num);
+		return -EIO;
+	}
+	set_msi_irq_chip(phb, virq);
+
+	return 0;
+}
+EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup);
+#endif
+
+static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+				  unsigned int hwirq, unsigned int virq,
+				  unsigned int is_64, struct msi_msg *msg)
+{
+	struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
+	unsigned int xive_num = hwirq - phb->msi_base;
+	__be32 data;
+	int rc;
+
+	/* No PE assigned ? bail out ... no MSI for you ! */
+	if (pe == NULL)
+		return -ENXIO;
+
+	/* Check if we have an MVE */
+	if (pe->mve_number < 0)
+		return -ENXIO;
+
+	/* Force 32-bit MSI on some broken devices */
+	if (dev->no_64bit_msi)
+		is_64 = 0;
+
+	/* Assign XIVE to PE */
+	rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
+	if (rc) {
+		pr_warn("%s: OPAL error %d setting XIVE %d PE\n",
+			pci_name(dev), rc, xive_num);
+		return -EIO;
+	}
+
+	if (is_64) {
+		__be64 addr64;
+
+		rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1,
+				     &addr64, &data);
+		if (rc) {
+			pr_warn("%s: OPAL error %d getting 64-bit MSI data\n",
+				pci_name(dev), rc);
+			return -EIO;
+		}
+		msg->address_hi = be64_to_cpu(addr64) >> 32;
+		msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful;
+	} else {
+		__be32 addr32;
+
+		rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1,
+				     &addr32, &data);
+		if (rc) {
+			pr_warn("%s: OPAL error %d getting 32-bit MSI data\n",
+				pci_name(dev), rc);
+			return -EIO;
+		}
+		msg->address_hi = 0;
+		msg->address_lo = be32_to_cpu(addr32);
+	}
+	msg->data = be32_to_cpu(data);
+
+	set_msi_irq_chip(phb, virq);
+
+	pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
+		 " address=%x_%08x data=%x PE# %d\n",
+		 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
+		 msg->address_hi, msg->address_lo, data, pe->pe_number);
+
+	return 0;
+}
+
+static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
+{
+	unsigned int count;
+	const __be32 *prop = of_get_property(phb->hose->dn,
+					     "ibm,opal-msi-ranges", NULL);
+	if (!prop) {
+		/* BML Fallback */
+		prop = of_get_property(phb->hose->dn, "msi-ranges", NULL);
+	}
+	if (!prop)
+		return;
+
+	phb->msi_base = be32_to_cpup(prop);
+	count = be32_to_cpup(prop + 1);
+	if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
+		pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
+		       phb->hose->global_number);
+		return;
+	}
+
+	phb->msi_setup = pnv_pci_ioda_msi_setup;
+	phb->msi32_support = 1;
+	pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
+		count, phb->msi_base);
+}
+#else
+static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
+#endif /* CONFIG_PCI_MSI */
+
+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct resource *res;
+	int i;
+	resource_size_t size;
+	struct pci_dn *pdn;
+	int mul, total_vfs;
+
+	if (!pdev->is_physfn || pdev->is_added)
+		return;
+
+	hose = pci_bus_to_host(pdev->bus);
+	phb = hose->private_data;
+
+	pdn = pci_get_pdn(pdev);
+	pdn->vfs_expanded = 0;
+
+	total_vfs = pci_sriov_get_totalvfs(pdev);
+	pdn->m64_per_iov = 1;
+	mul = phb->ioda.total_pe;
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || res->parent)
+			continue;
+		if (!pnv_pci_is_mem_pref_64(res->flags)) {
+			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
+				 i, res);
+			continue;
+		}
+
+		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+
+		/* bigger than 64M */
+		if (size > (1 << 26)) {
+			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
+				 i, res);
+			pdn->m64_per_iov = M64_PER_IOV;
+			mul = roundup_pow_of_two(total_vfs);
+			break;
+		}
+	}
+
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		if (!res->flags || res->parent)
+			continue;
+		if (!pnv_pci_is_mem_pref_64(res->flags)) {
+			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
+				 i, res);
+			continue;
+		}
+
+		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
+		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+		res->end = res->start + size * mul - 1;
+		dev_dbg(&pdev->dev, "                       %pR\n", res);
+		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
+			 i, res, mul);
+	}
+	pdn->vfs_expanded = mul;
+}
+#endif /* CONFIG_PCI_IOV */
+
+/*
+ * This function is supposed to be called on basis of PE from top
+ * to bottom style. So the the I/O or MMIO segment assigned to
+ * parent PE could be overrided by its child PEs if necessary.
+ */
+static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
+				  struct pnv_ioda_pe *pe)
+{
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_bus_region region;
+	struct resource *res;
+	int i, index;
+	int rc;
+
+	/*
+	 * NOTE: We only care PCI bus based PE for now. For PCI
+	 * device based PE, for example SRIOV sensitive VF should
+	 * be figured out later.
+	 */
+	BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)));
+
+	pci_bus_for_each_resource(pe->pbus, res, i) {
+		if (!res || !res->flags ||
+		    res->start > res->end)
+			continue;
+
+		if (res->flags & IORESOURCE_IO) {
+			region.start = res->start - phb->ioda.io_pci_base;
+			region.end   = res->end - phb->ioda.io_pci_base;
+			index = region.start / phb->ioda.io_segsize;
+
+			while (index < phb->ioda.total_pe &&
+			       region.start <= region.end) {
+				phb->ioda.io_segmap[index] = pe->pe_number;
+				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+					pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
+				if (rc != OPAL_SUCCESS) {
+					pr_err("%s: OPAL error %d when mapping IO "
+					       "segment #%d to PE#%d\n",
+					       __func__, rc, index, pe->pe_number);
+					break;
+				}
+
+				region.start += phb->ioda.io_segsize;
+				index++;
+			}
+		} else if ((res->flags & IORESOURCE_MEM) &&
+			   !pnv_pci_is_mem_pref_64(res->flags)) {
+			region.start = res->start -
+				       hose->mem_offset[0] -
+				       phb->ioda.m32_pci_base;
+			region.end   = res->end -
+				       hose->mem_offset[0] -
+				       phb->ioda.m32_pci_base;
+			index = region.start / phb->ioda.m32_segsize;
+
+			while (index < phb->ioda.total_pe &&
+			       region.start <= region.end) {
+				phb->ioda.m32_segmap[index] = pe->pe_number;
+				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+					pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
+				if (rc != OPAL_SUCCESS) {
+					pr_err("%s: OPAL error %d when mapping M32 "
+					       "segment#%d to PE#%d",
+					       __func__, rc, index, pe->pe_number);
+					break;
+				}
+
+				region.start += phb->ioda.m32_segsize;
+				index++;
+			}
+		}
+	}
+}
+
+static void pnv_pci_ioda_setup_seg(void)
+{
+	struct pci_controller *tmp, *hose;
+	struct pnv_phb *phb;
+	struct pnv_ioda_pe *pe;
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		phb = hose->private_data;
+		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+			pnv_ioda_setup_pe_seg(hose, pe);
+		}
+	}
+}
+
+static void pnv_pci_ioda_setup_DMA(void)
+{
+	struct pci_controller *hose, *tmp;
+	struct pnv_phb *phb;
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		pnv_ioda_setup_dma(hose->private_data);
+
+		/* Mark the PHB initialization done */
+		phb = hose->private_data;
+		phb->initialized = 1;
+	}
+}
+
+static void pnv_pci_ioda_create_dbgfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+	struct pci_controller *hose, *tmp;
+	struct pnv_phb *phb;
+	char name[16];
+
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		sprintf(name, "PCI%04x", hose->global_number);
+		phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
+		if (!phb->dbgfs)
+			pr_warning("%s: Error on creating debugfs on PHB#%x\n",
+				__func__, hose->global_number);
+	}
+#endif /* CONFIG_DEBUG_FS */
+}
+
+static void pnv_pci_ioda_fixup(void)
+{
+	pnv_pci_ioda_setup_PEs();
+	pnv_pci_ioda_setup_seg();
+	pnv_pci_ioda_setup_DMA();
+
+	pnv_pci_ioda_create_dbgfs();
+
+#ifdef CONFIG_EEH
+	eeh_init();
+	eeh_addr_cache_build();
+#endif
+}
+
+/*
+ * Returns the alignment for I/O or memory windows for P2P
+ * bridges. That actually depends on how PEs are segmented.
+ * For now, we return I/O or M32 segment size for PE sensitive
+ * P2P bridges. Otherwise, the default values (4KiB for I/O,
+ * 1MiB for memory) will be returned.
+ *
+ * The current PCI bus might be put into one PE, which was
+ * create against the parent PCI bridge. For that case, we
+ * needn't enlarge the alignment so that we can save some
+ * resources.
+ */
+static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
+						unsigned long type)
+{
+	struct pci_dev *bridge;
+	struct pci_controller *hose = pci_bus_to_host(bus);
+	struct pnv_phb *phb = hose->private_data;
+	int num_pci_bridges = 0;
+
+	bridge = bus->self;
+	while (bridge) {
+		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) {
+			num_pci_bridges++;
+			if (num_pci_bridges >= 2)
+				return 1;
+		}
+
+		bridge = bridge->bus->self;
+	}
+
+	/* We fail back to M32 if M64 isn't supported */
+	if (phb->ioda.m64_segsize &&
+	    pnv_pci_is_mem_pref_64(type))
+		return phb->ioda.m64_segsize;
+	if (type & IORESOURCE_MEM)
+		return phb->ioda.m32_segsize;
+
+	return phb->ioda.io_segsize;
+}
+
+#ifdef CONFIG_PCI_IOV
+static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
+						      int resno)
+{
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	resource_size_t align, iov_align;
+
+	iov_align = resource_size(&pdev->resource[resno]);
+	if (iov_align)
+		return iov_align;
+
+	align = pci_iov_resource_size(pdev, resno);
+	if (pdn->vfs_expanded)
+		return pdn->vfs_expanded * align;
+
+	return align;
+}
+#endif /* CONFIG_PCI_IOV */
+
+/* Prevent enabling devices for which we couldn't properly
+ * assign a PE
+ */
+static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct pci_dn *pdn;
+
+	/* The function is probably called while the PEs have
+	 * not be created yet. For example, resource reassignment
+	 * during PCI probe period. We just skip the check if
+	 * PEs isn't ready.
+	 */
+	if (!phb->initialized)
+		return true;
+
+	pdn = pci_get_pdn(dev);
+	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+		return false;
+
+	return true;
+}
+
+static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
+			       u32 devfn)
+{
+	return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
+}
+
+static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
+{
+	opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
+		       OPAL_ASSERT_RESET);
+}
+
+static void __init pnv_pci_init_ioda_phb(struct device_node *np,
+					 u64 hub_id, int ioda_type)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	unsigned long size, m32map_off, pemap_off, iomap_off = 0;
+	const __be64 *prop64;
+	const __be32 *prop32;
+	int len;
+	u64 phb_id;
+	void *aux;
+	long rc;
+
+	pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name);
+
+	prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
+	if (!prop64) {
+		pr_err("  Missing \"ibm,opal-phbid\" property !\n");
+		return;
+	}
+	phb_id = be64_to_cpup(prop64);
+	pr_debug("  PHB-ID  : 0x%016llx\n", phb_id);
+
+	phb = memblock_virt_alloc(sizeof(struct pnv_phb), 0);
+
+	/* Allocate PCI controller */
+	phb->hose = hose = pcibios_alloc_controller(np);
+	if (!phb->hose) {
+		pr_err("  Can't allocate PCI controller for %s\n",
+		       np->full_name);
+		memblock_free(__pa(phb), sizeof(struct pnv_phb));
+		return;
+	}
+
+	spin_lock_init(&phb->lock);
+	prop32 = of_get_property(np, "bus-range", &len);
+	if (prop32 && len == 8) {
+		hose->first_busno = be32_to_cpu(prop32[0]);
+		hose->last_busno = be32_to_cpu(prop32[1]);
+	} else {
+		pr_warn("  Broken <bus-range> on %s\n", np->full_name);
+		hose->first_busno = 0;
+		hose->last_busno = 0xff;
+	}
+	hose->private_data = phb;
+	phb->hub_id = hub_id;
+	phb->opal_id = phb_id;
+	phb->type = ioda_type;
+	mutex_init(&phb->ioda.pe_alloc_mutex);
+
+	/* Detect specific models for error handling */
+	if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
+		phb->model = PNV_PHB_MODEL_P7IOC;
+	else if (of_device_is_compatible(np, "ibm,power8-pciex"))
+		phb->model = PNV_PHB_MODEL_PHB3;
+	else
+		phb->model = PNV_PHB_MODEL_UNKNOWN;
+
+	/* Parse 32-bit and IO ranges (if any) */
+	pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
+
+	/* Get registers */
+	phb->regs = of_iomap(np, 0);
+	if (phb->regs == NULL)
+		pr_err("  Failed to map registers !\n");
+
+	/* Initialize more IODA stuff */
+	phb->ioda.total_pe = 1;
+	prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
+	if (prop32)
+		phb->ioda.total_pe = be32_to_cpup(prop32);
+	prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
+	if (prop32)
+		phb->ioda.reserved_pe = be32_to_cpup(prop32);
+
+	/* Parse 64-bit MMIO range */
+	pnv_ioda_parse_m64_window(phb);
+
+	phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
+	/* FW Has already off top 64k of M32 space (MSI space) */
+	phb->ioda.m32_size += 0x10000;
+
+	phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
+	phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
+	phb->ioda.io_size = hose->pci_io_size;
+	phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
+	phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
+
+	/* Allocate aux data & arrays. We don't have IO ports on PHB3 */
+	size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+	m32map_off = size;
+	size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]);
+	if (phb->type == PNV_PHB_IODA1) {
+		iomap_off = size;
+		size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]);
+	}
+	pemap_off = size;
+	size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe);
+	aux = memblock_virt_alloc(size, 0);
+	phb->ioda.pe_alloc = aux;
+	phb->ioda.m32_segmap = aux + m32map_off;
+	if (phb->type == PNV_PHB_IODA1)
+		phb->ioda.io_segmap = aux + iomap_off;
+	phb->ioda.pe_array = aux + pemap_off;
+	set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc);
+
+	INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
+	INIT_LIST_HEAD(&phb->ioda.pe_list);
+	mutex_init(&phb->ioda.pe_list_mutex);
+
+	/* Calculate how many 32-bit TCE segments we have */
+	phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
+
+#if 0 /* We should really do that ... */
+	rc = opal_pci_set_phb_mem_window(opal->phb_id,
+					 window_type,
+					 window_num,
+					 starting_real_address,
+					 starting_pci_address,
+					 segment_size);
+#endif
+
+	pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
+		phb->ioda.total_pe, phb->ioda.reserved_pe,
+		phb->ioda.m32_size, phb->ioda.m32_segsize);
+	if (phb->ioda.m64_size)
+		pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
+			phb->ioda.m64_size, phb->ioda.m64_segsize);
+	if (phb->ioda.io_size)
+		pr_info("                  IO: 0x%x [segment=0x%x]\n",
+			phb->ioda.io_size, phb->ioda.io_segsize);
+
+
+	phb->hose->ops = &pnv_pci_ops;
+	phb->get_pe_state = pnv_ioda_get_pe_state;
+	phb->freeze_pe = pnv_ioda_freeze_pe;
+	phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
+
+	/* Setup RID -> PE mapping function */
+	phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
+
+	/* Setup TCEs */
+	phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
+	phb->dma_set_mask = pnv_pci_ioda_dma_set_mask;
+	phb->dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask;
+
+	/* Setup shutdown function for kexec */
+	phb->shutdown = pnv_pci_ioda_shutdown;
+
+	/* Setup MSI support */
+	pnv_pci_init_ioda_msis(phb);
+
+	/*
+	 * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here
+	 * to let the PCI core do resource assignment. It's supposed
+	 * that the PCI core will do correct I/O and MMIO alignment
+	 * for the P2P bridge bars so that each PCI bus (excluding
+	 * the child P2P bridges) can form individual PE.
+	 */
+	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
+	pnv_pci_controller_ops.enable_device_hook = pnv_pci_enable_device_hook;
+	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
+	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
+	hose->controller_ops = pnv_pci_controller_ops;
+
+#ifdef CONFIG_PCI_IOV
+	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
+	ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
+#endif
+
+	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
+
+	/* Reset IODA tables to a clean state */
+	rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
+	if (rc)
+		pr_warning("  OPAL Error %ld performing IODA table reset !\n", rc);
+
+	/* If we're running in kdump kerenl, the previous kerenl never
+	 * shutdown PCI devices correctly. We already got IODA table
+	 * cleaned out. So we have to issue PHB reset to stop all PCI
+	 * transactions from previous kerenl.
+	 */
+	if (is_kdump_kernel()) {
+		pr_info("  Issue PHB reset ...\n");
+		pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
+		pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
+	}
+
+	/* Remove M64 resource if we can't configure it successfully */
+	if (!phb->init_m64 || phb->init_m64(phb))
+		hose->mem_resources[1].flags = 0;
+}
+
+void __init pnv_pci_init_ioda2_phb(struct device_node *np)
+{
+	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
+}
+
+void __init pnv_pci_init_ioda_hub(struct device_node *np)
+{
+	struct device_node *phbn;
+	const __be64 *prop64;
+	u64 hub_id;
+
+	pr_info("Probing IODA IO-Hub %s\n", np->full_name);
+
+	prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
+	if (!prop64) {
+		pr_err(" Missing \"ibm,opal-hubid\" property !\n");
+		return;
+	}
+	hub_id = be64_to_cpup(prop64);
+	pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
+
+	/* Count child PHBs */
+	for_each_child_of_node(np, phbn) {
+		/* Look for IODA1 PHBs */
+		if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
+			pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
+	}
+}
diff --git a/kernel/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/kernel/arch/powerpc/platforms/powernv/pci-p5ioc2.c
new file mode 100644
index 000000000..4729ca793
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -0,0 +1,236 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Currently supports only P5IOC2
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/* For now, use a fixed amount of TCE memory for each p5ioc2
+ * hub, 16M will do
+ */
+#define P5IOC2_TCE_MEMORY	0x01000000
+
+#ifdef CONFIG_PCI_MSI
+static int pnv_pci_p5ioc2_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+				    unsigned int hwirq, unsigned int virq,
+				    unsigned int is_64, struct msi_msg *msg)
+{
+	if (WARN_ON(!is_64))
+		return -ENXIO;
+	msg->data = hwirq - phb->msi_base;
+	msg->address_hi = 0x10000000;
+	msg->address_lo = 0;
+
+	return 0;
+}
+
+static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb)
+{
+	unsigned int count;
+	const __be32 *prop = of_get_property(phb->hose->dn,
+					     "ibm,opal-msi-ranges", NULL);
+	if (!prop)
+		return;
+
+	/* Don't do MSI's on p5ioc2 PCI-X are they are not properly
+	 * verified in HW
+	 */
+	if (of_device_is_compatible(phb->hose->dn, "ibm,p5ioc2-pcix"))
+		return;
+	phb->msi_base = be32_to_cpup(prop);
+	count = be32_to_cpup(prop + 1);
+	if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
+		pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
+		       phb->hose->global_number);
+		return;
+	}
+	phb->msi_setup = pnv_pci_p5ioc2_msi_setup;
+	phb->msi32_support = 0;
+	pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
+		count, phb->msi_base);
+}
+#else
+static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
+#endif /* CONFIG_PCI_MSI */
+
+static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
+					 struct pci_dev *pdev)
+{
+	if (phb->p5ioc2.iommu_table.it_map == NULL) {
+		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
+		iommu_register_group(&phb->p5ioc2.iommu_table,
+				pci_domain_nr(phb->hose->bus), phb->opal_id);
+	}
+
+	set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table);
+}
+
+static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
+					   void *tce_mem, u64 tce_size)
+{
+	struct pnv_phb *phb;
+	const __be64 *prop64;
+	u64 phb_id;
+	int64_t rc;
+	static int primary = 1;
+
+	pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name);
+
+	prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
+	if (!prop64) {
+		pr_err("  Missing \"ibm,opal-phbid\" property !\n");
+		return;
+	}
+	phb_id = be64_to_cpup(prop64);
+	pr_devel("  PHB-ID  : 0x%016llx\n", phb_id);
+	pr_devel("  TCE AT  : 0x%016lx\n", __pa(tce_mem));
+	pr_devel("  TCE SZ  : 0x%016llx\n", tce_size);
+
+	rc = opal_pci_set_phb_tce_memory(phb_id, __pa(tce_mem), tce_size);
+	if (rc != OPAL_SUCCESS) {
+		pr_err("  Failed to set TCE memory, OPAL error %lld\n", rc);
+		return;
+	}
+
+	phb = memblock_virt_alloc(sizeof(struct pnv_phb), 0);
+	phb->hose = pcibios_alloc_controller(np);
+	if (!phb->hose) {
+		pr_err("  Failed to allocate PCI controller\n");
+		return;
+	}
+
+	spin_lock_init(&phb->lock);
+	phb->hose->first_busno = 0;
+	phb->hose->last_busno = 0xff;
+	phb->hose->private_data = phb;
+	phb->hose->controller_ops = pnv_pci_controller_ops;
+	phb->hub_id = hub_id;
+	phb->opal_id = phb_id;
+	phb->type = PNV_PHB_P5IOC2;
+	phb->model = PNV_PHB_MODEL_P5IOC2;
+
+	phb->regs = of_iomap(np, 0);
+
+	if (phb->regs == NULL)
+		pr_err("  Failed to map registers !\n");
+	else {
+		pr_devel("  P_BUID     = 0x%08x\n", in_be32(phb->regs + 0x100));
+		pr_devel("  P_IOSZ     = 0x%08x\n", in_be32(phb->regs + 0x1b0));
+		pr_devel("  P_IO_ST    = 0x%08x\n", in_be32(phb->regs + 0x1e0));
+		pr_devel("  P_MEM1_H   = 0x%08x\n", in_be32(phb->regs + 0x1a0));
+		pr_devel("  P_MEM1_L   = 0x%08x\n", in_be32(phb->regs + 0x190));
+		pr_devel("  P_MSZ1_L   = 0x%08x\n", in_be32(phb->regs + 0x1c0));
+		pr_devel("  P_MEM_ST   = 0x%08x\n", in_be32(phb->regs + 0x1d0));
+		pr_devel("  P_MEM2_H   = 0x%08x\n", in_be32(phb->regs + 0x2c0));
+		pr_devel("  P_MEM2_L   = 0x%08x\n", in_be32(phb->regs + 0x2b0));
+		pr_devel("  P_MSZ2_H   = 0x%08x\n", in_be32(phb->regs + 0x2d0));
+		pr_devel("  P_MSZ2_L   = 0x%08x\n", in_be32(phb->regs + 0x2e0));
+	}
+
+	/* Interpret the "ranges" property */
+	/* This also maps the I/O region and sets isa_io/mem_base */
+	pci_process_bridge_OF_ranges(phb->hose, np, primary);
+	primary = 0;
+
+	phb->hose->ops = &pnv_pci_ops;
+
+	/* Setup MSI support */
+	pnv_pci_init_p5ioc2_msis(phb);
+
+	/* Setup TCEs */
+	phb->dma_dev_setup = pnv_pci_p5ioc2_dma_dev_setup;
+	pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table,
+				  tce_mem, tce_size, 0,
+				  IOMMU_PAGE_SHIFT_4K);
+}
+
+void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
+{
+	struct device_node *phbn;
+	const __be64 *prop64;
+	u64 hub_id;
+	void *tce_mem;
+	uint64_t tce_per_phb;
+	int64_t rc;
+	int phb_count = 0;
+
+	pr_info("Probing p5ioc2 IO-Hub %s\n", np->full_name);
+
+	prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
+	if (!prop64) {
+		pr_err(" Missing \"ibm,opal-hubid\" property !\n");
+		return;
+	}
+	hub_id = be64_to_cpup(prop64);
+	pr_info(" HUB-ID : 0x%016llx\n", hub_id);
+
+	/* Count child PHBs and calculate TCE space per PHB */
+	for_each_child_of_node(np, phbn) {
+		if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") ||
+		    of_device_is_compatible(phbn, "ibm,p5ioc2-pciex"))
+			phb_count++;
+	}
+
+	if (phb_count <= 0) {
+		pr_info(" No PHBs for Hub %s\n", np->full_name);
+		return;
+	}
+
+	tce_per_phb = __rounddown_pow_of_two(P5IOC2_TCE_MEMORY / phb_count);
+	pr_info(" Allocating %lld MB of TCE memory per PHB\n",
+		tce_per_phb >> 20);
+
+	/* Currently allocate 16M of TCE memory for every Hub
+	 *
+	 * XXX TODO: Make it chip local if possible
+	 */
+	tce_mem = memblock_virt_alloc(P5IOC2_TCE_MEMORY, P5IOC2_TCE_MEMORY);
+	pr_debug(" TCE    : 0x%016lx..0x%016lx\n",
+		__pa(tce_mem), __pa(tce_mem) + P5IOC2_TCE_MEMORY - 1);
+	rc = opal_pci_set_hub_tce_memory(hub_id, __pa(tce_mem),
+					P5IOC2_TCE_MEMORY);
+	if (rc != OPAL_SUCCESS) {
+		pr_err(" Failed to allocate TCE memory, OPAL error %lld\n", rc);
+		return;
+	}
+
+	/* Initialize PHBs */
+	for_each_child_of_node(np, phbn) {
+		if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") ||
+		    of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) {
+			pnv_pci_init_p5ioc2_phb(phbn, hub_id,
+					tce_mem, tce_per_phb);
+			tce_mem += tce_per_phb;
+		}
+	}
+}
diff --git a/kernel/arch/powerpc/platforms/powernv/pci.c b/kernel/arch/powerpc/platforms/powernv/pci.c
new file mode 100644
index 000000000..bca2aeb6e
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/pci.c
@@ -0,0 +1,783 @@
+/*
+ * Support PCI/PCIe on PowerNV platforms
+ *
+ * Currently supports only P5IOC2
+ *
+ * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/io.h>
+#include <linux/msi.h>
+#include <linux/iommu.h>
+
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
+#include <asm/ppc-pci.h>
+#include <asm/opal.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
+#include <asm/firmware.h>
+#include <asm/eeh_event.h>
+#include <asm/eeh.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/* Delay in usec */
+#define PCI_RESET_DELAY_US	3000000
+
+#define cfg_dbg(fmt...)	do { } while(0)
+//#define cfg_dbg(fmt...)	printk(fmt)
+
+#ifdef CONFIG_PCI_MSI
+static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct msi_desc *entry;
+	struct msi_msg msg;
+	int hwirq;
+	unsigned int virq;
+	int rc;
+
+	if (WARN_ON(!phb) || !phb->msi_bmp.bitmap)
+		return -ENODEV;
+
+	if (pdev->no_64bit_msi && !phb->msi32_support)
+		return -ENODEV;
+
+	list_for_each_entry(entry, &pdev->msi_list, list) {
+		if (!entry->msi_attrib.is_64 && !phb->msi32_support) {
+			pr_warn("%s: Supports only 64-bit MSIs\n",
+				pci_name(pdev));
+			return -ENXIO;
+		}
+		hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1);
+		if (hwirq < 0) {
+			pr_warn("%s: Failed to find a free MSI\n",
+				pci_name(pdev));
+			return -ENOSPC;
+		}
+		virq = irq_create_mapping(NULL, phb->msi_base + hwirq);
+		if (virq == NO_IRQ) {
+			pr_warn("%s: Failed to map MSI to linux irq\n",
+				pci_name(pdev));
+			msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
+			return -ENOMEM;
+		}
+		rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq,
+				    virq, entry->msi_attrib.is_64, &msg);
+		if (rc) {
+			pr_warn("%s: Failed to setup MSI\n", pci_name(pdev));
+			irq_dispose_mapping(virq);
+			msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
+			return rc;
+		}
+		irq_set_msi_desc(virq, entry);
+		pci_write_msi_msg(virq, &msg);
+	}
+	return 0;
+}
+
+static void pnv_teardown_msi_irqs(struct pci_dev *pdev)
+{
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct msi_desc *entry;
+
+	if (WARN_ON(!phb))
+		return;
+
+	list_for_each_entry(entry, &pdev->msi_list, list) {
+		if (entry->irq == NO_IRQ)
+			continue;
+		irq_set_msi_desc(entry->irq, NULL);
+		msi_bitmap_free_hwirqs(&phb->msi_bmp,
+			virq_to_hw(entry->irq) - phb->msi_base, 1);
+		irq_dispose_mapping(entry->irq);
+	}
+}
+#endif /* CONFIG_PCI_MSI */
+
+static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
+					 struct OpalIoPhbErrorCommon *common)
+{
+	struct OpalIoP7IOCPhbErrorData *data;
+	int i;
+
+	data = (struct OpalIoP7IOCPhbErrorData *)common;
+	pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n",
+		hose->global_number, be32_to_cpu(common->version));
+
+	if (data->brdgCtl)
+		pr_info("brdgCtl:     %08x\n",
+			be32_to_cpu(data->brdgCtl));
+	if (data->portStatusReg || data->rootCmplxStatus ||
+	    data->busAgentStatus)
+		pr_info("UtlSts:      %08x %08x %08x\n",
+			be32_to_cpu(data->portStatusReg),
+			be32_to_cpu(data->rootCmplxStatus),
+			be32_to_cpu(data->busAgentStatus));
+	if (data->deviceStatus || data->slotStatus   ||
+	    data->linkStatus   || data->devCmdStatus ||
+	    data->devSecStatus)
+		pr_info("RootSts:     %08x %08x %08x %08x %08x\n",
+			be32_to_cpu(data->deviceStatus),
+			be32_to_cpu(data->slotStatus),
+			be32_to_cpu(data->linkStatus),
+			be32_to_cpu(data->devCmdStatus),
+			be32_to_cpu(data->devSecStatus));
+	if (data->rootErrorStatus   || data->uncorrErrorStatus ||
+	    data->corrErrorStatus)
+		pr_info("RootErrSts:  %08x %08x %08x\n",
+			be32_to_cpu(data->rootErrorStatus),
+			be32_to_cpu(data->uncorrErrorStatus),
+			be32_to_cpu(data->corrErrorStatus));
+	if (data->tlpHdr1 || data->tlpHdr2 ||
+	    data->tlpHdr3 || data->tlpHdr4)
+		pr_info("RootErrLog:  %08x %08x %08x %08x\n",
+			be32_to_cpu(data->tlpHdr1),
+			be32_to_cpu(data->tlpHdr2),
+			be32_to_cpu(data->tlpHdr3),
+			be32_to_cpu(data->tlpHdr4));
+	if (data->sourceId || data->errorClass ||
+	    data->correlator)
+		pr_info("RootErrLog1: %08x %016llx %016llx\n",
+			be32_to_cpu(data->sourceId),
+			be64_to_cpu(data->errorClass),
+			be64_to_cpu(data->correlator));
+	if (data->p7iocPlssr || data->p7iocCsr)
+		pr_info("PhbSts:      %016llx %016llx\n",
+			be64_to_cpu(data->p7iocPlssr),
+			be64_to_cpu(data->p7iocCsr));
+	if (data->lemFir)
+		pr_info("Lem:         %016llx %016llx %016llx\n",
+			be64_to_cpu(data->lemFir),
+			be64_to_cpu(data->lemErrorMask),
+			be64_to_cpu(data->lemWOF));
+	if (data->phbErrorStatus)
+		pr_info("PhbErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbErrorStatus),
+			be64_to_cpu(data->phbFirstErrorStatus),
+			be64_to_cpu(data->phbErrorLog0),
+			be64_to_cpu(data->phbErrorLog1));
+	if (data->mmioErrorStatus)
+		pr_info("OutErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->mmioErrorStatus),
+			be64_to_cpu(data->mmioFirstErrorStatus),
+			be64_to_cpu(data->mmioErrorLog0),
+			be64_to_cpu(data->mmioErrorLog1));
+	if (data->dma0ErrorStatus)
+		pr_info("InAErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->dma0ErrorStatus),
+			be64_to_cpu(data->dma0FirstErrorStatus),
+			be64_to_cpu(data->dma0ErrorLog0),
+			be64_to_cpu(data->dma0ErrorLog1));
+	if (data->dma1ErrorStatus)
+		pr_info("InBErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->dma1ErrorStatus),
+			be64_to_cpu(data->dma1FirstErrorStatus),
+			be64_to_cpu(data->dma1ErrorLog0),
+			be64_to_cpu(data->dma1ErrorLog1));
+
+	for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
+		if ((data->pestA[i] >> 63) == 0 &&
+		    (data->pestB[i] >> 63) == 0)
+			continue;
+
+		pr_info("PE[%3d] A/B: %016llx %016llx\n",
+			i, be64_to_cpu(data->pestA[i]),
+			be64_to_cpu(data->pestB[i]));
+	}
+}
+
+static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
+					struct OpalIoPhbErrorCommon *common)
+{
+	struct OpalIoPhb3ErrorData *data;
+	int i;
+
+	data = (struct OpalIoPhb3ErrorData*)common;
+	pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n",
+		hose->global_number, be32_to_cpu(common->version));
+	if (data->brdgCtl)
+		pr_info("brdgCtl:     %08x\n",
+			be32_to_cpu(data->brdgCtl));
+	if (data->portStatusReg || data->rootCmplxStatus ||
+	    data->busAgentStatus)
+		pr_info("UtlSts:      %08x %08x %08x\n",
+			be32_to_cpu(data->portStatusReg),
+			be32_to_cpu(data->rootCmplxStatus),
+			be32_to_cpu(data->busAgentStatus));
+	if (data->deviceStatus || data->slotStatus   ||
+	    data->linkStatus   || data->devCmdStatus ||
+	    data->devSecStatus)
+		pr_info("RootSts:     %08x %08x %08x %08x %08x\n",
+			be32_to_cpu(data->deviceStatus),
+			be32_to_cpu(data->slotStatus),
+			be32_to_cpu(data->linkStatus),
+			be32_to_cpu(data->devCmdStatus),
+			be32_to_cpu(data->devSecStatus));
+	if (data->rootErrorStatus || data->uncorrErrorStatus ||
+	    data->corrErrorStatus)
+		pr_info("RootErrSts:  %08x %08x %08x\n",
+			be32_to_cpu(data->rootErrorStatus),
+			be32_to_cpu(data->uncorrErrorStatus),
+			be32_to_cpu(data->corrErrorStatus));
+	if (data->tlpHdr1 || data->tlpHdr2 ||
+	    data->tlpHdr3 || data->tlpHdr4)
+		pr_info("RootErrLog:  %08x %08x %08x %08x\n",
+			be32_to_cpu(data->tlpHdr1),
+			be32_to_cpu(data->tlpHdr2),
+			be32_to_cpu(data->tlpHdr3),
+			be32_to_cpu(data->tlpHdr4));
+	if (data->sourceId || data->errorClass ||
+	    data->correlator)
+		pr_info("RootErrLog1: %08x %016llx %016llx\n",
+			be32_to_cpu(data->sourceId),
+			be64_to_cpu(data->errorClass),
+			be64_to_cpu(data->correlator));
+	if (data->nFir)
+		pr_info("nFir:        %016llx %016llx %016llx\n",
+			be64_to_cpu(data->nFir),
+			be64_to_cpu(data->nFirMask),
+			be64_to_cpu(data->nFirWOF));
+	if (data->phbPlssr || data->phbCsr)
+		pr_info("PhbSts:      %016llx %016llx\n",
+			be64_to_cpu(data->phbPlssr),
+			be64_to_cpu(data->phbCsr));
+	if (data->lemFir)
+		pr_info("Lem:         %016llx %016llx %016llx\n",
+			be64_to_cpu(data->lemFir),
+			be64_to_cpu(data->lemErrorMask),
+			be64_to_cpu(data->lemWOF));
+	if (data->phbErrorStatus)
+		pr_info("PhbErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->phbErrorStatus),
+			be64_to_cpu(data->phbFirstErrorStatus),
+			be64_to_cpu(data->phbErrorLog0),
+			be64_to_cpu(data->phbErrorLog1));
+	if (data->mmioErrorStatus)
+		pr_info("OutErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->mmioErrorStatus),
+			be64_to_cpu(data->mmioFirstErrorStatus),
+			be64_to_cpu(data->mmioErrorLog0),
+			be64_to_cpu(data->mmioErrorLog1));
+	if (data->dma0ErrorStatus)
+		pr_info("InAErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->dma0ErrorStatus),
+			be64_to_cpu(data->dma0FirstErrorStatus),
+			be64_to_cpu(data->dma0ErrorLog0),
+			be64_to_cpu(data->dma0ErrorLog1));
+	if (data->dma1ErrorStatus)
+		pr_info("InBErr:      %016llx %016llx %016llx %016llx\n",
+			be64_to_cpu(data->dma1ErrorStatus),
+			be64_to_cpu(data->dma1FirstErrorStatus),
+			be64_to_cpu(data->dma1ErrorLog0),
+			be64_to_cpu(data->dma1ErrorLog1));
+
+	for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) {
+		if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 &&
+		    (be64_to_cpu(data->pestB[i]) >> 63) == 0)
+			continue;
+
+		pr_info("PE[%3d] A/B: %016llx %016llx\n",
+				i, be64_to_cpu(data->pestA[i]),
+				be64_to_cpu(data->pestB[i]));
+	}
+}
+
+void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
+				unsigned char *log_buff)
+{
+	struct OpalIoPhbErrorCommon *common;
+
+	if (!hose || !log_buff)
+		return;
+
+	common = (struct OpalIoPhbErrorCommon *)log_buff;
+	switch (be32_to_cpu(common->ioType)) {
+	case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
+		pnv_pci_dump_p7ioc_diag_data(hose, common);
+		break;
+	case OPAL_PHB_ERROR_DATA_TYPE_PHB3:
+		pnv_pci_dump_phb3_diag_data(hose, common);
+		break;
+	default:
+		pr_warn("%s: Unrecognized ioType %d\n",
+			__func__, be32_to_cpu(common->ioType));
+	}
+}
+
+static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
+{
+	unsigned long flags, rc;
+	int has_diag, ret = 0;
+
+	spin_lock_irqsave(&phb->lock, flags);
+
+	/* Fetch PHB diag-data */
+	rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
+					 PNV_PCI_DIAG_BUF_SIZE);
+	has_diag = (rc == OPAL_SUCCESS);
+
+	/* If PHB supports compound PE, to handle it */
+	if (phb->unfreeze_pe) {
+		ret = phb->unfreeze_pe(phb,
+				       pe_no,
+				       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+	} else {
+		rc = opal_pci_eeh_freeze_clear(phb->opal_id,
+					     pe_no,
+					     OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+		if (rc) {
+			pr_warn("%s: Failure %ld clearing frozen "
+				"PHB#%x-PE#%x\n",
+				__func__, rc, phb->hose->global_number,
+				pe_no);
+			ret = -EIO;
+		}
+	}
+
+	/*
+	 * For now, let's only display the diag buffer when we fail to clear
+	 * the EEH status. We'll do more sensible things later when we have
+	 * proper EEH support. We need to make sure we don't pollute ourselves
+	 * with the normal errors generated when probing empty slots
+	 */
+	if (has_diag && ret)
+		pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob);
+
+	spin_unlock_irqrestore(&phb->lock, flags);
+}
+
+static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
+{
+	struct pnv_phb *phb = pdn->phb->private_data;
+	u8	fstate;
+	__be16	pcierr;
+	int	pe_no;
+	s64	rc;
+
+	/*
+	 * Get the PE#. During the PCI probe stage, we might not
+	 * setup that yet. So all ER errors should be mapped to
+	 * reserved PE.
+	 */
+	pe_no = pdn->pe_number;
+	if (pe_no == IODA_INVALID_PE) {
+		if (phb->type == PNV_PHB_P5IOC2)
+			pe_no = 0;
+		else
+			pe_no = phb->ioda.reserved_pe;
+	}
+
+	/*
+	 * Fetch frozen state. If the PHB support compound PE,
+	 * we need handle that case.
+	 */
+	if (phb->get_pe_state) {
+		fstate = phb->get_pe_state(phb, pe_no);
+	} else {
+		rc = opal_pci_eeh_freeze_status(phb->opal_id,
+						pe_no,
+						&fstate,
+						&pcierr,
+						NULL);
+		if (rc) {
+			pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n",
+				__func__, rc, phb->hose->global_number, pe_no);
+			return;
+		}
+	}
+
+	cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
+		(pdn->busno << 8) | (pdn->devfn), pe_no, fstate);
+
+	/* Clear the frozen state if applicable */
+	if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE ||
+	    fstate == OPAL_EEH_STOPPED_DMA_FREEZE  ||
+	    fstate == OPAL_EEH_STOPPED_MMIO_DMA_FREEZE) {
+		/*
+		 * If PHB supports compound PE, freeze it for
+		 * consistency.
+		 */
+		if (phb->freeze_pe)
+			phb->freeze_pe(phb, pe_no);
+
+		pnv_pci_handle_eeh_config(phb, pe_no);
+	}
+}
+
+int pnv_pci_cfg_read(struct pci_dn *pdn,
+		     int where, int size, u32 *val)
+{
+	struct pnv_phb *phb = pdn->phb->private_data;
+	u32 bdfn = (pdn->busno << 8) | pdn->devfn;
+	s64 rc;
+
+	switch (size) {
+	case 1: {
+		u8 v8;
+		rc = opal_pci_config_read_byte(phb->opal_id, bdfn, where, &v8);
+		*val = (rc == OPAL_SUCCESS) ? v8 : 0xff;
+		break;
+	}
+	case 2: {
+		__be16 v16;
+		rc = opal_pci_config_read_half_word(phb->opal_id, bdfn, where,
+						   &v16);
+		*val = (rc == OPAL_SUCCESS) ? be16_to_cpu(v16) : 0xffff;
+		break;
+	}
+	case 4: {
+		__be32 v32;
+		rc = opal_pci_config_read_word(phb->opal_id, bdfn, where, &v32);
+		*val = (rc == OPAL_SUCCESS) ? be32_to_cpu(v32) : 0xffffffff;
+		break;
+	}
+	default:
+		return PCIBIOS_FUNC_NOT_SUPPORTED;
+	}
+
+	cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+		__func__, pdn->busno, pdn->devfn, where, size, *val);
+	return PCIBIOS_SUCCESSFUL;
+}
+
+int pnv_pci_cfg_write(struct pci_dn *pdn,
+		      int where, int size, u32 val)
+{
+	struct pnv_phb *phb = pdn->phb->private_data;
+	u32 bdfn = (pdn->busno << 8) | pdn->devfn;
+
+	cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+		pdn->busno, pdn->devfn, where, size, val);
+	switch (size) {
+	case 1:
+		opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
+		break;
+	case 2:
+		opal_pci_config_write_half_word(phb->opal_id, bdfn, where, val);
+		break;
+	case 4:
+		opal_pci_config_write_word(phb->opal_id, bdfn, where, val);
+		break;
+	default:
+		return PCIBIOS_FUNC_NOT_SUPPORTED;
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+#if CONFIG_EEH
+static bool pnv_pci_cfg_check(struct pci_dn *pdn)
+{
+	struct eeh_dev *edev = NULL;
+	struct pnv_phb *phb = pdn->phb->private_data;
+
+	/* EEH not enabled ? */
+	if (!(phb->flags & PNV_PHB_FLAG_EEH))
+		return true;
+
+	/* PE reset or device removed ? */
+	edev = pdn->edev;
+	if (edev) {
+		if (edev->pe &&
+		    (edev->pe->state & EEH_PE_CFG_BLOCKED))
+			return false;
+
+		if (edev->mode & EEH_DEV_REMOVED)
+			return false;
+	}
+
+	return true;
+}
+#else
+static inline pnv_pci_cfg_check(struct pci_dn *pdn)
+{
+	return true;
+}
+#endif /* CONFIG_EEH */
+
+static int pnv_pci_read_config(struct pci_bus *bus,
+			       unsigned int devfn,
+			       int where, int size, u32 *val)
+{
+	struct pci_dn *pdn;
+	struct pnv_phb *phb;
+	int ret;
+
+	*val = 0xFFFFFFFF;
+	pdn = pci_get_pdn_by_devfn(bus, devfn);
+	if (!pdn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (!pnv_pci_cfg_check(pdn))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	ret = pnv_pci_cfg_read(pdn, where, size, val);
+	phb = pdn->phb->private_data;
+	if (phb->flags & PNV_PHB_FLAG_EEH && pdn->edev) {
+		if (*val == EEH_IO_ERROR_VALUE(size) &&
+		    eeh_dev_check_failure(pdn->edev))
+                        return PCIBIOS_DEVICE_NOT_FOUND;
+	} else {
+		pnv_pci_config_check_eeh(pdn);
+	}
+
+	return ret;
+}
+
+static int pnv_pci_write_config(struct pci_bus *bus,
+				unsigned int devfn,
+				int where, int size, u32 val)
+{
+	struct pci_dn *pdn;
+	struct pnv_phb *phb;
+	int ret;
+
+	pdn = pci_get_pdn_by_devfn(bus, devfn);
+	if (!pdn)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	if (!pnv_pci_cfg_check(pdn))
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	ret = pnv_pci_cfg_write(pdn, where, size, val);
+	phb = pdn->phb->private_data;
+	if (!(phb->flags & PNV_PHB_FLAG_EEH))
+		pnv_pci_config_check_eeh(pdn);
+
+	return ret;
+}
+
+struct pci_ops pnv_pci_ops = {
+	.read  = pnv_pci_read_config,
+	.write = pnv_pci_write_config,
+};
+
+static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+			 unsigned long uaddr, enum dma_data_direction direction,
+			 struct dma_attrs *attrs, bool rm)
+{
+	u64 proto_tce;
+	__be64 *tcep, *tces;
+	u64 rpn;
+
+	proto_tce = TCE_PCI_READ; // Read allowed
+
+	if (direction != DMA_TO_DEVICE)
+		proto_tce |= TCE_PCI_WRITE;
+
+	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+	rpn = __pa(uaddr) >> tbl->it_page_shift;
+
+	while (npages--)
+		*(tcep++) = cpu_to_be64(proto_tce |
+				(rpn++ << tbl->it_page_shift));
+
+	/* Some implementations won't cache invalid TCEs and thus may not
+	 * need that flush. We'll probably turn it_type into a bit mask
+	 * of flags if that becomes the case
+	 */
+	if (tbl->it_type & TCE_PCI_SWINV_CREATE)
+		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
+
+	return 0;
+}
+
+static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages,
+			    unsigned long uaddr,
+			    enum dma_data_direction direction,
+			    struct dma_attrs *attrs)
+{
+	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs,
+			false);
+}
+
+static void pnv_tce_free(struct iommu_table *tbl, long index, long npages,
+		bool rm)
+{
+	__be64 *tcep, *tces;
+
+	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+
+	while (npages--)
+		*(tcep++) = cpu_to_be64(0);
+
+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
+		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
+}
+
+static void pnv_tce_free_vm(struct iommu_table *tbl, long index, long npages)
+{
+	pnv_tce_free(tbl, index, npages, false);
+}
+
+static unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
+{
+	return ((u64 *)tbl->it_base)[index - tbl->it_offset];
+}
+
+static int pnv_tce_build_rm(struct iommu_table *tbl, long index, long npages,
+			    unsigned long uaddr,
+			    enum dma_data_direction direction,
+			    struct dma_attrs *attrs)
+{
+	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, true);
+}
+
+static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages)
+{
+	pnv_tce_free(tbl, index, npages, true);
+}
+
+void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
+			       void *tce_mem, u64 tce_size,
+			       u64 dma_offset, unsigned page_shift)
+{
+	tbl->it_blocksize = 16;
+	tbl->it_base = (unsigned long)tce_mem;
+	tbl->it_page_shift = page_shift;
+	tbl->it_offset = dma_offset >> tbl->it_page_shift;
+	tbl->it_index = 0;
+	tbl->it_size = tce_size >> 3;
+	tbl->it_busno = 0;
+	tbl->it_type = TCE_PCI;
+}
+
+static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
+{
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+#ifdef CONFIG_PCI_IOV
+	struct pnv_ioda_pe *pe;
+	struct pci_dn *pdn;
+
+	/* Fix the VF pdn PE number */
+	if (pdev->is_virtfn) {
+		pdn = pci_get_pdn(pdev);
+		WARN_ON(pdn->pe_number != IODA_INVALID_PE);
+		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+			if (pe->rid == ((pdev->bus->number << 8) |
+			    (pdev->devfn & 0xff))) {
+				pdn->pe_number = pe->pe_number;
+				pe->pdev = pdev;
+				break;
+			}
+		}
+	}
+#endif /* CONFIG_PCI_IOV */
+
+	if (phb && phb->dma_dev_setup)
+		phb->dma_dev_setup(phb, pdev);
+}
+
+int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+{
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+
+	if (phb && phb->dma_set_mask)
+		return phb->dma_set_mask(phb, pdev, dma_mask);
+	return __dma_set_mask(&pdev->dev, dma_mask);
+}
+
+u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev)
+{
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+
+	if (phb && phb->dma_get_required_mask)
+		return phb->dma_get_required_mask(phb, pdev);
+
+	return __dma_get_required_mask(&pdev->dev);
+}
+
+void pnv_pci_shutdown(void)
+{
+	struct pci_controller *hose;
+
+	list_for_each_entry(hose, &hose_list, list_node) {
+		struct pnv_phb *phb = hose->private_data;
+
+		if (phb && phb->shutdown)
+			phb->shutdown(phb);
+	}
+}
+
+/* Fixup wrong class code in p7ioc and p8 root complex */
+static void pnv_p7ioc_rc_quirk(struct pci_dev *dev)
+{
+	dev->class = PCI_CLASS_BRIDGE_PCI << 8;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk);
+
+void __init pnv_pci_init(void)
+{
+	struct device_node *np;
+	bool found_ioda = false;
+
+	pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
+
+	/* If we don't have OPAL, eg. in sim, just skip PCI probe */
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return;
+
+	/* Look for IODA IO-Hubs. We don't support mixing IODA
+	 * and p5ioc2 due to the need to change some global
+	 * probing flags
+	 */
+	for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
+		pnv_pci_init_ioda_hub(np);
+		found_ioda = true;
+	}
+
+	/* Look for p5ioc2 IO-Hubs */
+	if (!found_ioda)
+		for_each_compatible_node(np, NULL, "ibm,p5ioc2")
+			pnv_pci_init_p5ioc2_hub(np);
+
+	/* Look for ioda2 built-in PHB3's */
+	for_each_compatible_node(np, NULL, "ibm,ioda2-phb")
+		pnv_pci_init_ioda2_phb(np);
+
+	/* Setup the linkage between OF nodes and PHBs */
+	pci_devs_phb_init();
+
+	/* Configure IOMMU DMA hooks */
+	ppc_md.tce_build = pnv_tce_build_vm;
+	ppc_md.tce_free = pnv_tce_free_vm;
+	ppc_md.tce_build_rm = pnv_tce_build_rm;
+	ppc_md.tce_free_rm = pnv_tce_free_rm;
+	ppc_md.tce_get = pnv_tce_get;
+	set_pci_dma_ops(&dma_iommu_ops);
+
+	/* Configure MSIs */
+#ifdef CONFIG_PCI_MSI
+	ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
+	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
+#endif
+}
+
+machine_subsys_initcall_sync(powernv, tce_iommu_bus_notifier_init);
+
+struct pci_controller_ops pnv_pci_controller_ops = {
+	.dma_dev_setup = pnv_pci_dma_dev_setup,
+};
diff --git a/kernel/arch/powerpc/platforms/powernv/pci.h b/kernel/arch/powerpc/platforms/powernv/pci.h
new file mode 100644
index 000000000..070ee888f
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/pci.h
@@ -0,0 +1,221 @@
+#ifndef __POWERNV_PCI_H
+#define __POWERNV_PCI_H
+
+struct pci_dn;
+
+enum pnv_phb_type {
+	PNV_PHB_P5IOC2	= 0,
+	PNV_PHB_IODA1	= 1,
+	PNV_PHB_IODA2	= 2,
+};
+
+/* Precise PHB model for error management */
+enum pnv_phb_model {
+	PNV_PHB_MODEL_UNKNOWN,
+	PNV_PHB_MODEL_P5IOC2,
+	PNV_PHB_MODEL_P7IOC,
+	PNV_PHB_MODEL_PHB3,
+};
+
+#define PNV_PCI_DIAG_BUF_SIZE	8192
+#define PNV_IODA_PE_DEV		(1 << 0)	/* PE has single PCI device	*/
+#define PNV_IODA_PE_BUS		(1 << 1)	/* PE has primary PCI bus	*/
+#define PNV_IODA_PE_BUS_ALL	(1 << 2)	/* PE has subordinate buses	*/
+#define PNV_IODA_PE_MASTER	(1 << 3)	/* Master PE in compound case	*/
+#define PNV_IODA_PE_SLAVE	(1 << 4)	/* Slave PE in compound case	*/
+#define PNV_IODA_PE_VF		(1 << 5)	/* PE for one VF 		*/
+
+/* Data associated with a PE, including IOMMU tracking etc.. */
+struct pnv_phb;
+struct pnv_ioda_pe {
+	unsigned long		flags;
+	struct pnv_phb		*phb;
+
+	/* A PE can be associated with a single device or an
+	 * entire bus (& children). In the former case, pdev
+	 * is populated, in the later case, pbus is.
+	 */
+#ifdef CONFIG_PCI_IOV
+	struct pci_dev          *parent_dev;
+#endif
+	struct pci_dev		*pdev;
+	struct pci_bus		*pbus;
+
+	/* Effective RID (device RID for a device PE and base bus
+	 * RID with devfn 0 for a bus PE)
+	 */
+	unsigned int		rid;
+
+	/* PE number */
+	unsigned int		pe_number;
+
+	/* "Weight" assigned to the PE for the sake of DMA resource
+	 * allocations
+	 */
+	unsigned int		dma_weight;
+
+	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
+	int			tce32_seg;
+	int			tce32_segcount;
+	struct iommu_table	*tce32_table;
+	phys_addr_t		tce_inval_reg_phys;
+
+	/* 64-bit TCE bypass region */
+	bool			tce_bypass_enabled;
+	uint64_t		tce_bypass_base;
+
+	/* MSIs. MVE index is identical for for 32 and 64 bit MSI
+	 * and -1 if not supported. (It's actually identical to the
+	 * PE number)
+	 */
+	int			mve_number;
+
+	/* PEs in compound case */
+	struct pnv_ioda_pe	*master;
+	struct list_head	slaves;
+
+	/* Link in list of PE#s */
+	struct list_head	dma_link;
+	struct list_head	list;
+};
+
+#define PNV_PHB_FLAG_EEH	(1 << 0)
+
+struct pnv_phb {
+	struct pci_controller	*hose;
+	enum pnv_phb_type	type;
+	enum pnv_phb_model	model;
+	u64			hub_id;
+	u64			opal_id;
+	int			flags;
+	void __iomem		*regs;
+	int			initialized;
+	spinlock_t		lock;
+
+#ifdef CONFIG_DEBUG_FS
+	int			has_dbgfs;
+	struct dentry		*dbgfs;
+#endif
+
+#ifdef CONFIG_PCI_MSI
+	unsigned int		msi_base;
+	unsigned int		msi32_support;
+	struct msi_bitmap	msi_bmp;
+#endif
+	int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
+			 unsigned int hwirq, unsigned int virq,
+			 unsigned int is_64, struct msi_msg *msg);
+	void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
+	int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev,
+			    u64 dma_mask);
+	u64 (*dma_get_required_mask)(struct pnv_phb *phb,
+				     struct pci_dev *pdev);
+	void (*fixup_phb)(struct pci_controller *hose);
+	u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn);
+	void (*shutdown)(struct pnv_phb *phb);
+	int (*init_m64)(struct pnv_phb *phb);
+	void (*reserve_m64_pe)(struct pnv_phb *phb);
+	int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
+	int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
+	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
+	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
+
+	union {
+		struct {
+			struct iommu_table iommu_table;
+		} p5ioc2;
+
+		struct {
+			/* Global bridge info */
+			unsigned int		total_pe;
+			unsigned int		reserved_pe;
+
+			/* 32-bit MMIO window */
+			unsigned int		m32_size;
+			unsigned int		m32_segsize;
+			unsigned int		m32_pci_base;
+
+			/* 64-bit MMIO window */
+			unsigned int		m64_bar_idx;
+			unsigned long		m64_size;
+			unsigned long		m64_segsize;
+			unsigned long		m64_base;
+			unsigned long		m64_bar_alloc;
+
+			/* IO ports */
+			unsigned int		io_size;
+			unsigned int		io_segsize;
+			unsigned int		io_pci_base;
+
+			/* PE allocation bitmap */
+			unsigned long		*pe_alloc;
+			/* PE allocation mutex */
+			struct mutex		pe_alloc_mutex;
+
+			/* M32 & IO segment maps */
+			unsigned int		*m32_segmap;
+			unsigned int		*io_segmap;
+			struct pnv_ioda_pe	*pe_array;
+
+			/* IRQ chip */
+			int			irq_chip_init;
+			struct irq_chip		irq_chip;
+
+			/* Sorted list of used PE's based
+			 * on the sequence of creation
+			 */
+			struct list_head	pe_list;
+			struct mutex            pe_list_mutex;
+
+			/* Reverse map of PEs, will have to extend if
+			 * we are to support more than 256 PEs, indexed
+			 * bus { bus, devfn }
+			 */
+			unsigned char		pe_rmap[0x10000];
+
+			/* 32-bit TCE tables allocation */
+			unsigned long		tce32_count;
+
+			/* Total "weight" for the sake of DMA resources
+			 * allocation
+			 */
+			unsigned int		dma_weight;
+			unsigned int		dma_pe_count;
+
+			/* Sorted list of used PE's, sorted at
+			 * boot for resource allocation purposes
+			 */
+			struct list_head	pe_dma_list;
+		} ioda;
+	};
+
+	/* PHB and hub status structure */
+	union {
+		unsigned char			blob[PNV_PCI_DIAG_BUF_SIZE];
+		struct OpalIoP7IOCPhbErrorData	p7ioc;
+		struct OpalIoPhb3ErrorData	phb3;
+		struct OpalIoP7IOCErrorData 	hub_diag;
+	} diag;
+
+};
+
+extern struct pci_ops pnv_pci_ops;
+
+void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
+				unsigned char *log_buff);
+int pnv_pci_cfg_read(struct pci_dn *pdn,
+		     int where, int size, u32 *val);
+int pnv_pci_cfg_write(struct pci_dn *pdn,
+		      int where, int size, u32 val);
+extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
+				      void *tce_mem, u64 tce_size,
+				      u64 dma_offset, unsigned page_shift);
+extern void pnv_pci_init_p5ioc2_hub(struct device_node *np);
+extern void pnv_pci_init_ioda_hub(struct device_node *np);
+extern void pnv_pci_init_ioda2_phb(struct device_node *np);
+extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
+					__be64 *startp, __be64 *endp, bool rm);
+extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
+extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
+
+#endif /* __POWERNV_PCI_H */
diff --git a/kernel/arch/powerpc/platforms/powernv/powernv.h b/kernel/arch/powerpc/platforms/powernv/powernv.h
new file mode 100644
index 000000000..826d2c9be
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/powernv.h
@@ -0,0 +1,40 @@
+#ifndef _POWERNV_H
+#define _POWERNV_H
+
+#ifdef CONFIG_SMP
+extern void pnv_smp_init(void);
+#else
+static inline void pnv_smp_init(void) { }
+#endif
+
+struct pci_dev;
+
+#ifdef CONFIG_PCI
+extern void pnv_pci_init(void);
+extern void pnv_pci_shutdown(void);
+extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask);
+extern u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev);
+#else
+static inline void pnv_pci_init(void) { }
+static inline void pnv_pci_shutdown(void) { }
+
+static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+{
+	return -ENODEV;
+}
+
+static inline u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev)
+{
+	return 0;
+}
+#endif
+
+extern struct pci_controller_ops pnv_pci_controller_ops;
+
+extern u32 pnv_get_supported_cpuidle_states(void);
+
+extern void pnv_lpc_init(void);
+
+bool cpu_core_split_required(void);
+
+#endif /* _POWERNV_H */
diff --git a/kernel/arch/powerpc/platforms/powernv/rng.c b/kernel/arch/powerpc/platforms/powernv/rng.c
new file mode 100644
index 000000000..6eb808ff6
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/rng.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt)	"powernv-rng: " fmt
+
+#include <linux/kernel.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <asm/archrandom.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/smp.h>
+
+
+struct powernv_rng {
+	void __iomem *regs;
+	void __iomem *regs_real;
+	unsigned long mask;
+};
+
+static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
+
+
+int powernv_hwrng_present(void)
+{
+	struct powernv_rng *rng;
+
+	rng = get_cpu_var(powernv_rng);
+	put_cpu_var(rng);
+	return rng != NULL;
+}
+
+static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
+{
+	unsigned long parity;
+
+	/* Calculate the parity of the value */
+	asm ("popcntd %0,%1" : "=r" (parity) : "r" (val));
+
+	/* xor our value with the previous mask */
+	val ^= rng->mask;
+
+	/* update the mask based on the parity of this value */
+	rng->mask = (rng->mask << 1) | (parity & 1);
+
+	return val;
+}
+
+int powernv_get_random_real_mode(unsigned long *v)
+{
+	struct powernv_rng *rng;
+
+	rng = raw_cpu_read(powernv_rng);
+
+	*v = rng_whiten(rng, in_rm64(rng->regs_real));
+
+	return 1;
+}
+
+int powernv_get_random_long(unsigned long *v)
+{
+	struct powernv_rng *rng;
+
+	rng = get_cpu_var(powernv_rng);
+
+	*v = rng_whiten(rng, in_be64(rng->regs));
+
+	put_cpu_var(rng);
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(powernv_get_random_long);
+
+static __init void rng_init_per_cpu(struct powernv_rng *rng,
+				    struct device_node *dn)
+{
+	int chip_id, cpu;
+
+	chip_id = of_get_ibm_chip_id(dn);
+	if (chip_id == -1)
+		pr_warn("No ibm,chip-id found for %s.\n", dn->full_name);
+
+	for_each_possible_cpu(cpu) {
+		if (per_cpu(powernv_rng, cpu) == NULL ||
+		    cpu_to_chip_id(cpu) == chip_id) {
+			per_cpu(powernv_rng, cpu) = rng;
+		}
+	}
+}
+
+static __init int rng_create(struct device_node *dn)
+{
+	struct powernv_rng *rng;
+	struct resource res;
+	unsigned long val;
+
+	rng = kzalloc(sizeof(*rng), GFP_KERNEL);
+	if (!rng)
+		return -ENOMEM;
+
+	if (of_address_to_resource(dn, 0, &res)) {
+		kfree(rng);
+		return -ENXIO;
+	}
+
+	rng->regs_real = (void __iomem *)res.start;
+
+	rng->regs = of_iomap(dn, 0);
+	if (!rng->regs) {
+		kfree(rng);
+		return -ENXIO;
+	}
+
+	val = in_be64(rng->regs);
+	rng->mask = val;
+
+	rng_init_per_cpu(rng, dn);
+
+	pr_info_once("Registering arch random hook.\n");
+
+	ppc_md.get_random_long = powernv_get_random_long;
+
+	return 0;
+}
+
+static __init int rng_init(void)
+{
+	struct device_node *dn;
+	int rc;
+
+	for_each_compatible_node(dn, NULL, "ibm,power-rng") {
+		rc = rng_create(dn);
+		if (rc) {
+			pr_err("Failed creating rng for %s (%d).\n",
+				dn->full_name, rc);
+			continue;
+		}
+
+		/* Create devices for hwrng driver */
+		of_platform_device_create(dn, NULL, NULL);
+	}
+
+	return 0;
+}
+machine_subsys_initcall(powernv, rng_init);
diff --git a/kernel/arch/powerpc/platforms/powernv/setup.c b/kernel/arch/powerpc/platforms/powernv/setup.c
new file mode 100644
index 000000000..16fdcb23f
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/setup.c
@@ -0,0 +1,503 @@
+/*
+ * PowerNV setup code.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/tty.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/interrupt.h>
+#include <linux/bug.h>
+#include <linux/pci.h>
+#include <linux/cpufreq.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/xics.h>
+#include <asm/opal.h>
+#include <asm/kexec.h>
+#include <asm/smp.h>
+#include <asm/cputhreads.h>
+#include <asm/cpuidle.h>
+#include <asm/code-patching.h>
+
+#include "powernv.h"
+#include "subcore.h"
+
+static void __init pnv_setup_arch(void)
+{
+	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
+
+	/* Initialize SMP */
+	pnv_smp_init();
+
+	/* Setup PCI */
+	pnv_pci_init();
+
+	/* Setup RTC and NVRAM callbacks */
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		opal_nvram_init();
+
+	/* Enable NAP mode */
+	powersave_nap = 1;
+
+	/* XXX PMCS */
+}
+
+static void __init pnv_init_early(void)
+{
+	/*
+	 * Initialize the LPC bus now so that legacy serial
+	 * ports can be found on it
+	 */
+	opal_lpc_init();
+
+#ifdef CONFIG_HVC_OPAL
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		hvc_opal_init_early();
+	else
+#endif
+		add_preferred_console("hvc", 0, NULL);
+}
+
+static void __init pnv_init_IRQ(void)
+{
+	xics_init();
+
+	WARN_ON(!ppc_md.get_irq);
+}
+
+static void pnv_show_cpuinfo(struct seq_file *m)
+{
+	struct device_node *root;
+	const char *model = "";
+
+	root = of_find_node_by_path("/");
+	if (root)
+		model = of_get_property(root, "model", NULL);
+	seq_printf(m, "machine\t\t: PowerNV %s\n", model);
+	if (firmware_has_feature(FW_FEATURE_OPALv3))
+		seq_printf(m, "firmware\t: OPAL v3\n");
+	else if (firmware_has_feature(FW_FEATURE_OPALv2))
+		seq_printf(m, "firmware\t: OPAL v2\n");
+	else if (firmware_has_feature(FW_FEATURE_OPAL))
+		seq_printf(m, "firmware\t: OPAL v1\n");
+	else
+		seq_printf(m, "firmware\t: BML\n");
+	of_node_put(root);
+}
+
+static void pnv_prepare_going_down(void)
+{
+	/*
+	 * Disable all notifiers from OPAL, we can't
+	 * service interrupts anymore anyway
+	 */
+	opal_notifier_disable();
+
+	/* Soft disable interrupts */
+	local_irq_disable();
+
+	/*
+	 * Return secondary CPUs to firwmare if a flash update
+	 * is pending otherwise we will get all sort of error
+	 * messages about CPU being stuck etc.. This will also
+	 * have the side effect of hard disabling interrupts so
+	 * past this point, the kernel is effectively dead.
+	 */
+	opal_flash_term_callback();
+}
+
+static void  __noreturn pnv_restart(char *cmd)
+{
+	long rc = OPAL_BUSY;
+
+	pnv_prepare_going_down();
+
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_cec_reboot();
+		if (rc == OPAL_BUSY_EVENT)
+			opal_poll_events(NULL);
+		else
+			mdelay(10);
+	}
+	for (;;)
+		opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_power_off(void)
+{
+	long rc = OPAL_BUSY;
+
+	pnv_prepare_going_down();
+
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_cec_power_down(0);
+		if (rc == OPAL_BUSY_EVENT)
+			opal_poll_events(NULL);
+		else
+			mdelay(10);
+	}
+	for (;;)
+		opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_halt(void)
+{
+	pnv_power_off();
+}
+
+static void pnv_progress(char *s, unsigned short hex)
+{
+}
+
+static int pnv_dma_set_mask(struct device *dev, u64 dma_mask)
+{
+	if (dev_is_pci(dev))
+		return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask);
+	return __dma_set_mask(dev, dma_mask);
+}
+
+static u64 pnv_dma_get_required_mask(struct device *dev)
+{
+	if (dev_is_pci(dev))
+		return pnv_pci_dma_get_required_mask(to_pci_dev(dev));
+
+	return __dma_get_required_mask(dev);
+}
+
+static void pnv_shutdown(void)
+{
+	/* Let the PCI code clear up IODA tables */
+	pnv_pci_shutdown();
+
+	/*
+	 * Stop OPAL activity: Unregister all OPAL interrupts so they
+	 * don't fire up while we kexec and make sure all potentially
+	 * DMA'ing ops are complete (such as dump retrieval).
+	 */
+	opal_shutdown();
+}
+
+#ifdef CONFIG_KEXEC
+static void pnv_kexec_wait_secondaries_down(void)
+{
+	int my_cpu, i, notified = -1;
+
+	my_cpu = get_cpu();
+
+	for_each_online_cpu(i) {
+		uint8_t status;
+		int64_t rc;
+
+		if (i == my_cpu)
+			continue;
+
+		for (;;) {
+			rc = opal_query_cpu_status(get_hard_smp_processor_id(i),
+						   &status);
+			if (rc != OPAL_SUCCESS || status != OPAL_THREAD_STARTED)
+				break;
+			barrier();
+			if (i != notified) {
+				printk(KERN_INFO "kexec: waiting for cpu %d "
+				       "(physical %d) to enter OPAL\n",
+				       i, paca[i].hw_cpu_id);
+				notified = i;
+			}
+		}
+	}
+}
+
+static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
+{
+	xics_kexec_teardown_cpu(secondary);
+
+	/* On OPAL v3, we return all CPUs to firmware */
+
+	if (!firmware_has_feature(FW_FEATURE_OPALv3))
+		return;
+
+	if (secondary) {
+		/* Return secondary CPUs to firmware on OPAL v3 */
+		mb();
+		get_paca()->kexec_state = KEXEC_STATE_REAL_MODE;
+		mb();
+
+		/* Return the CPU to OPAL */
+		opal_return_cpu();
+	} else if (crash_shutdown) {
+		/*
+		 * On crash, we don't wait for secondaries to go
+		 * down as they might be unreachable or hung, so
+		 * instead we just wait a bit and move on.
+		 */
+		mdelay(1);
+	} else {
+		/* Primary waits for the secondaries to have reached OPAL */
+		pnv_kexec_wait_secondaries_down();
+	}
+}
+#endif /* CONFIG_KEXEC */
+
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+static unsigned long pnv_memory_block_size(void)
+{
+	return 256UL * 1024 * 1024;
+}
+#endif
+
+static void __init pnv_setup_machdep_opal(void)
+{
+	ppc_md.get_boot_time = opal_get_boot_time;
+	ppc_md.restart = pnv_restart;
+	pm_power_off = pnv_power_off;
+	ppc_md.halt = pnv_halt;
+	ppc_md.machine_check_exception = opal_machine_check;
+	ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
+	ppc_md.hmi_exception_early = opal_hmi_exception_early;
+	ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
+}
+
+static u32 supported_cpuidle_states;
+
+int pnv_save_sprs_for_winkle(void)
+{
+	int cpu;
+	int rc;
+
+	/*
+	 * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric accross
+	 * all cpus at boot. Get these reg values of current cpu and use the
+	 * same accross all cpus.
+	 */
+	uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
+	uint64_t hid0_val = mfspr(SPRN_HID0);
+	uint64_t hid1_val = mfspr(SPRN_HID1);
+	uint64_t hid4_val = mfspr(SPRN_HID4);
+	uint64_t hid5_val = mfspr(SPRN_HID5);
+	uint64_t hmeer_val = mfspr(SPRN_HMEER);
+
+	for_each_possible_cpu(cpu) {
+		uint64_t pir = get_hard_smp_processor_id(cpu);
+		uint64_t hsprg0_val = (uint64_t)&paca[cpu];
+
+		/*
+		 * HSPRG0 is used to store the cpu's pointer to paca. Hence last
+		 * 3 bits are guaranteed to be 0. Program slw to restore HSPRG0
+		 * with 63rd bit set, so that when a thread wakes up at 0x100 we
+		 * can use this bit to distinguish between fastsleep and
+		 * deep winkle.
+		 */
+		hsprg0_val |= 1;
+
+		rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
+		if (rc != 0)
+			return rc;
+
+		rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
+		if (rc != 0)
+			return rc;
+
+		/* HIDs are per core registers */
+		if (cpu_thread_in_core(cpu) == 0) {
+
+			rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val);
+			if (rc != 0)
+				return rc;
+		}
+	}
+
+	return 0;
+}
+
+static void pnv_alloc_idle_core_states(void)
+{
+	int i, j;
+	int nr_cores = cpu_nr_cores();
+	u32 *core_idle_state;
+
+	/*
+	 * core_idle_state - First 8 bits track the idle state of each thread
+	 * of the core. The 8th bit is the lock bit. Initially all thread bits
+	 * are set. They are cleared when the thread enters deep idle state
+	 * like sleep and winkle. Initially the lock bit is cleared.
+	 * The lock bit has 2 purposes
+	 * a. While the first thread is restoring core state, it prevents
+	 * other threads in the core from switching to process context.
+	 * b. While the last thread in the core is saving the core state, it
+	 * prevents a different thread from waking up.
+	 */
+	for (i = 0; i < nr_cores; i++) {
+		int first_cpu = i * threads_per_core;
+		int node = cpu_to_node(first_cpu);
+
+		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
+		*core_idle_state = PNV_CORE_IDLE_THREAD_BITS;
+
+		for (j = 0; j < threads_per_core; j++) {
+			int cpu = first_cpu + j;
+
+			paca[cpu].core_idle_state_ptr = core_idle_state;
+			paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
+			paca[cpu].thread_mask = 1 << j;
+		}
+	}
+
+	update_subcore_sibling_mask();
+
+	if (supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED)
+		pnv_save_sprs_for_winkle();
+}
+
+u32 pnv_get_supported_cpuidle_states(void)
+{
+	return supported_cpuidle_states;
+}
+EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
+
+static int __init pnv_init_idle_states(void)
+{
+	struct device_node *power_mgt;
+	int dt_idle_states;
+	u32 *flags;
+	int i;
+
+	supported_cpuidle_states = 0;
+
+	if (cpuidle_disable != IDLE_NO_OVERRIDE)
+		goto out;
+
+	if (!firmware_has_feature(FW_FEATURE_OPALv3))
+		goto out;
+
+	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
+	if (!power_mgt) {
+		pr_warn("opal: PowerMgmt Node not found\n");
+		goto out;
+	}
+	dt_idle_states = of_property_count_u32_elems(power_mgt,
+			"ibm,cpu-idle-state-flags");
+	if (dt_idle_states < 0) {
+		pr_warn("cpuidle-powernv: no idle states found in the DT\n");
+		goto out;
+	}
+
+	flags = kzalloc(sizeof(*flags) * dt_idle_states, GFP_KERNEL);
+	if (of_property_read_u32_array(power_mgt,
+			"ibm,cpu-idle-state-flags", flags, dt_idle_states)) {
+		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n");
+		goto out_free;
+	}
+
+	for (i = 0; i < dt_idle_states; i++)
+		supported_cpuidle_states |= flags[i];
+
+	if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+		patch_instruction(
+			(unsigned int *)pnv_fastsleep_workaround_at_entry,
+			PPC_INST_NOP);
+		patch_instruction(
+			(unsigned int *)pnv_fastsleep_workaround_at_exit,
+			PPC_INST_NOP);
+	}
+	pnv_alloc_idle_core_states();
+out_free:
+	kfree(flags);
+out:
+	return 0;
+}
+
+subsys_initcall(pnv_init_idle_states);
+
+static int __init pnv_probe(void)
+{
+	unsigned long root = of_get_flat_dt_root();
+
+	if (!of_flat_dt_is_compatible(root, "ibm,powernv"))
+		return 0;
+
+	hpte_init_native();
+
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		pnv_setup_machdep_opal();
+
+	pr_debug("PowerNV detected !\n");
+
+	return 1;
+}
+
+/*
+ * Returns the cpu frequency for 'cpu' in Hz. This is used by
+ * /proc/cpuinfo
+ */
+static unsigned long pnv_get_proc_freq(unsigned int cpu)
+{
+	unsigned long ret_freq;
+
+	ret_freq = cpufreq_quick_get(cpu) * 1000ul;
+
+	/*
+	 * If the backend cpufreq driver does not exist,
+         * then fallback to old way of reporting the clockrate.
+	 */
+	if (!ret_freq)
+		ret_freq = ppc_proc_freq;
+	return ret_freq;
+}
+
+define_machine(powernv) {
+	.name			= "PowerNV",
+	.probe			= pnv_probe,
+	.init_early		= pnv_init_early,
+	.setup_arch		= pnv_setup_arch,
+	.init_IRQ		= pnv_init_IRQ,
+	.show_cpuinfo		= pnv_show_cpuinfo,
+	.get_proc_freq          = pnv_get_proc_freq,
+	.progress		= pnv_progress,
+	.machine_shutdown	= pnv_shutdown,
+	.power_save             = power7_idle,
+	.calibrate_decr		= generic_calibrate_decr,
+	.dma_set_mask		= pnv_dma_set_mask,
+	.dma_get_required_mask	= pnv_dma_get_required_mask,
+#ifdef CONFIG_KEXEC
+	.kexec_cpu_down		= pnv_kexec_cpu_down,
+#endif
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+	.memory_block_size	= pnv_memory_block_size,
+#endif
+};
diff --git a/kernel/arch/powerpc/platforms/powernv/smp.c b/kernel/arch/powerpc/platforms/powernv/smp.c
new file mode 100644
index 000000000..8f70ba681
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/smp.c
@@ -0,0 +1,256 @@
+/*
+ * SMP support for PowerNV machines.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+
+#include <asm/irq.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/vdso_datapage.h>
+#include <asm/cputhreads.h>
+#include <asm/xics.h>
+#include <asm/opal.h>
+#include <asm/runlatch.h>
+#include <asm/code-patching.h>
+#include <asm/dbell.h>
+#include <asm/kvm_ppc.h>
+#include <asm/ppc-opcode.h>
+
+#include "powernv.h"
+
+#ifdef DEBUG
+#include <asm/udbg.h>
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+static void pnv_smp_setup_cpu(int cpu)
+{
+	if (cpu != boot_cpuid)
+		xics_setup_cpu();
+
+#ifdef CONFIG_PPC_DOORBELL
+	if (cpu_has_feature(CPU_FTR_DBELL))
+		doorbell_setup_this_cpu();
+#endif
+}
+
+static int pnv_smp_kick_cpu(int nr)
+{
+	unsigned int pcpu = get_hard_smp_processor_id(nr);
+	unsigned long start_here =
+			__pa(ppc_function_entry(generic_secondary_smp_init));
+	long rc;
+
+	BUG_ON(nr < 0 || nr >= NR_CPUS);
+
+	/*
+	 * If we already started or OPALv2 is not supported, we just
+	 * kick the CPU via the PACA
+	 */
+	if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPALv2))
+		goto kick;
+
+	/*
+	 * At this point, the CPU can either be spinning on the way in
+	 * from kexec or be inside OPAL waiting to be started for the
+	 * first time. OPAL v3 allows us to query OPAL to know if it
+	 * has the CPUs, so we do that
+	 */
+	if (firmware_has_feature(FW_FEATURE_OPALv3)) {
+		uint8_t status;
+
+		rc = opal_query_cpu_status(pcpu, &status);
+		if (rc != OPAL_SUCCESS) {
+			pr_warn("OPAL Error %ld querying CPU %d state\n",
+				rc, nr);
+			return -ENODEV;
+		}
+
+		/*
+		 * Already started, just kick it, probably coming from
+		 * kexec and spinning
+		 */
+		if (status == OPAL_THREAD_STARTED)
+			goto kick;
+
+		/*
+		 * Available/inactive, let's kick it
+		 */
+		if (status == OPAL_THREAD_INACTIVE) {
+			pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n",
+				 nr, pcpu);
+			rc = opal_start_cpu(pcpu, start_here);
+			if (rc != OPAL_SUCCESS) {
+				pr_warn("OPAL Error %ld starting CPU %d\n",
+					rc, nr);
+				return -ENODEV;
+			}
+		} else {
+			/*
+			 * An unavailable CPU (or any other unknown status)
+			 * shouldn't be started. It should also
+			 * not be in the possible map but currently it can
+			 * happen
+			 */
+			pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable"
+				 " (status %d)...\n", nr, pcpu, status);
+			return -ENODEV;
+		}
+	} else {
+		/*
+		 * On OPAL v2, we just kick it and hope for the best,
+		 * we must not test the error from opal_start_cpu() or
+		 * we would fail to get CPUs from kexec.
+		 */
+		opal_start_cpu(pcpu, start_here);
+	}
+ kick:
+	return smp_generic_kick_cpu(nr);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static int pnv_smp_cpu_disable(void)
+{
+	int cpu = smp_processor_id();
+
+	/* This is identical to pSeries... might consolidate by
+	 * moving migrate_irqs_away to a ppc_md with default to
+	 * the generic fixup_irqs. --BenH.
+	 */
+	set_cpu_online(cpu, false);
+	vdso_data->processorCount--;
+	if (cpu == boot_cpuid)
+		boot_cpuid = cpumask_any(cpu_online_mask);
+	xics_migrate_irqs_away();
+	return 0;
+}
+
+static void pnv_smp_cpu_kill_self(void)
+{
+	unsigned int cpu;
+	unsigned long srr1, wmask;
+	u32 idle_states;
+
+	/* Standard hot unplug procedure */
+	local_irq_disable();
+	idle_task_exit();
+	current->active_mm = NULL; /* for sanity */
+	cpu = smp_processor_id();
+	DBG("CPU%d offline\n", cpu);
+	generic_set_cpu_dead(cpu);
+	smp_wmb();
+
+	wmask = SRR1_WAKEMASK;
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		wmask = SRR1_WAKEMASK_P8;
+
+	idle_states = pnv_get_supported_cpuidle_states();
+	/* We don't want to take decrementer interrupts while we are offline,
+	 * so clear LPCR:PECE1. We keep PECE2 enabled.
+	 */
+	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1);
+	while (!generic_check_cpu_restart(cpu)) {
+
+		ppc64_runlatch_off();
+
+		if (idle_states & OPAL_PM_WINKLE_ENABLED)
+			srr1 = power7_winkle();
+		else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
+				(idle_states & OPAL_PM_SLEEP_ENABLED_ER1))
+			srr1 = power7_sleep();
+		else
+			srr1 = power7_nap(1);
+
+		ppc64_runlatch_on();
+
+		/*
+		 * If the SRR1 value indicates that we woke up due to
+		 * an external interrupt, then clear the interrupt.
+		 * We clear the interrupt before checking for the
+		 * reason, so as to avoid a race where we wake up for
+		 * some other reason, find nothing and clear the interrupt
+		 * just as some other cpu is sending us an interrupt.
+		 * If we returned from power7_nap as a result of
+		 * having finished executing in a KVM guest, then srr1
+		 * contains 0.
+		 */
+		if ((srr1 & wmask) == SRR1_WAKEEE) {
+			icp_native_flush_interrupt();
+			local_paca->irq_happened &= PACA_IRQ_HARD_DIS;
+			smp_mb();
+		} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
+			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+			asm volatile(PPC_MSGCLR(%0) : : "r" (msg));
+			kvmppc_set_host_ipi(cpu, 0);
+		}
+
+		if (cpu_core_split_required())
+			continue;
+
+		if (!generic_check_cpu_restart(cpu))
+			DBG("CPU%d Unexpected exit while offline !\n", cpu);
+	}
+	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1);
+	DBG("CPU%d coming online...\n", cpu);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int pnv_cpu_bootable(unsigned int nr)
+{
+	/*
+	 * Starting with POWER8, the subcore logic relies on all threads of a
+	 * core being booted so that they can participate in split mode
+	 * switches. So on those machines we ignore the smt_enabled_at_boot
+	 * setting (smt-enabled on the kernel command line).
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		return 1;
+
+	return smp_generic_cpu_bootable(nr);
+}
+
+static struct smp_ops_t pnv_smp_ops = {
+	.message_pass	= smp_muxed_ipi_message_pass,
+	.cause_ipi	= NULL,	/* Filled at runtime by xics_smp_probe() */
+	.probe		= xics_smp_probe,
+	.kick_cpu	= pnv_smp_kick_cpu,
+	.setup_cpu	= pnv_smp_setup_cpu,
+	.cpu_bootable	= pnv_cpu_bootable,
+#ifdef CONFIG_HOTPLUG_CPU
+	.cpu_disable	= pnv_smp_cpu_disable,
+	.cpu_die	= generic_cpu_die,
+#endif /* CONFIG_HOTPLUG_CPU */
+};
+
+/* This is called very early during platform setup_arch */
+void __init pnv_smp_init(void)
+{
+	smp_ops = &pnv_smp_ops;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	ppc_md.cpu_die	= pnv_smp_cpu_kill_self;
+#endif
+}
diff --git a/kernel/arch/powerpc/platforms/powernv/subcore-asm.S b/kernel/arch/powerpc/platforms/powernv/subcore-asm.S
new file mode 100644
index 000000000..39bb24aa8
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/subcore-asm.S
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/ppc_asm.h>
+#include <asm/reg.h>
+
+#include "subcore.h"
+
+
+_GLOBAL(split_core_secondary_loop)
+	/*
+	 * r3 = u8 *state, used throughout the routine
+	 * r4 = temp
+	 * r5 = temp
+	 * ..
+	 * r12 = MSR
+	 */
+	mfmsr	r12
+
+	/* Disable interrupts so SRR0/1 don't get trashed */
+	li	r4,0
+	ori	r4,r4,MSR_EE|MSR_SE|MSR_BE|MSR_RI
+	andc	r4,r12,r4
+	sync
+	mtmsrd	r4
+
+	/* Switch to real mode and leave interrupts off */
+	li	r5, MSR_IR|MSR_DR
+	andc	r5, r4, r5
+
+	LOAD_REG_ADDR(r4, real_mode)
+
+	mtspr	SPRN_SRR0,r4
+	mtspr	SPRN_SRR1,r5
+	rfid
+	b	.	/* prevent speculative execution */
+
+real_mode:
+	/* Grab values from unsplit SPRs */
+	mfspr	r6,  SPRN_LDBAR
+	mfspr	r7,  SPRN_PMMAR
+	mfspr	r8,  SPRN_PMCR
+	mfspr	r9,  SPRN_RPR
+	mfspr	r10, SPRN_SDR1
+
+	/* Order reading the SPRs vs telling the primary we are ready to split */
+	sync
+
+	/* Tell thread 0 we are in real mode */
+	li	r4, SYNC_STEP_REAL_MODE
+	stb	r4, 0(r3)
+
+	li	r5, (HID0_POWER8_4LPARMODE | HID0_POWER8_2LPARMODE)@highest
+	sldi	r5, r5, 48
+
+	/* Loop until we see the split happen in HID0 */
+1:	mfspr	r4, SPRN_HID0
+	and.	r4, r4, r5
+	beq	1b
+
+	/*
+	 * We only need to initialise the below regs once for each subcore,
+	 * but it's simpler and harmless to do it on each thread.
+	 */
+
+	/* Make sure various SPRS have sane values */
+	li	r4, 0
+	mtspr	SPRN_LPID, r4
+	mtspr	SPRN_PCR, r4
+	mtspr	SPRN_HDEC, r4
+
+	/* Restore SPR values now we are split */
+	mtspr	SPRN_LDBAR, r6
+	mtspr	SPRN_PMMAR, r7
+	mtspr	SPRN_PMCR, r8
+	mtspr	SPRN_RPR, r9
+	mtspr	SPRN_SDR1, r10
+
+	LOAD_REG_ADDR(r5, virtual_mode)
+
+	/* Get out of real mode */
+	mtspr	SPRN_SRR0,r5
+	mtspr	SPRN_SRR1,r12
+	rfid
+	b	.	/* prevent speculative execution */
+
+virtual_mode:
+	blr
diff --git a/kernel/arch/powerpc/platforms/powernv/subcore.c b/kernel/arch/powerpc/platforms/powernv/subcore.c
new file mode 100644
index 000000000..f60f80ada
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/subcore.c
@@ -0,0 +1,427 @@
+/*
+ * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt)	"powernv: " fmt
+
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/gfp.h>
+#include <linux/smp.h>
+#include <linux/stop_machine.h>
+
+#include <asm/cputhreads.h>
+#include <asm/kvm_ppc.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/smp.h>
+
+#include "subcore.h"
+#include "powernv.h"
+
+
+/*
+ * Split/unsplit procedure:
+ *
+ * A core can be in one of three states, unsplit, 2-way split, and 4-way split.
+ *
+ * The mapping to subcores_per_core is simple:
+ *
+ *  State       | subcores_per_core
+ *  ------------|------------------
+ *  Unsplit     |        1
+ *  2-way split |        2
+ *  4-way split |        4
+ *
+ * The core is split along thread boundaries, the mapping between subcores and
+ * threads is as follows:
+ *
+ *  Unsplit:
+ *          ----------------------------
+ *  Subcore |            0             |
+ *          ----------------------------
+ *  Thread  |  0  1  2  3  4  5  6  7  |
+ *          ----------------------------
+ *
+ *  2-way split:
+ *          -------------------------------------
+ *  Subcore |        0        |        1        |
+ *          -------------------------------------
+ *  Thread  |  0   1   2   3  |  4   5   6   7  |
+ *          -------------------------------------
+ *
+ *  4-way split:
+ *          -----------------------------------------
+ *  Subcore |    0    |    1    |    2    |    3    |
+ *          -----------------------------------------
+ *  Thread  |  0   1  |  2   3  |  4   5  |  6   7  |
+ *          -----------------------------------------
+ *
+ *
+ * Transitions
+ * -----------
+ *
+ * It is not possible to transition between either of the split states, the
+ * core must first be unsplit. The legal transitions are:
+ *
+ *  -----------          ---------------
+ *  |         |  <---->  | 2-way split |
+ *  |         |          ---------------
+ *  | Unsplit |
+ *  |         |          ---------------
+ *  |         |  <---->  | 4-way split |
+ *  -----------          ---------------
+ *
+ * Unsplitting
+ * -----------
+ *
+ * Unsplitting is the simpler procedure. It requires thread 0 to request the
+ * unsplit while all other threads NAP.
+ *
+ * Thread 0 clears HID0_POWER8_DYNLPARDIS (Dynamic LPAR Disable). This tells
+ * the hardware that if all threads except 0 are napping, the hardware should
+ * unsplit the core.
+ *
+ * Non-zero threads are sent to a NAP loop, they don't exit the loop until they
+ * see the core unsplit.
+ *
+ * Core 0 spins waiting for the hardware to see all the other threads napping
+ * and perform the unsplit.
+ *
+ * Once thread 0 sees the unsplit, it IPIs the secondary threads to wake them
+ * out of NAP. They will then see the core unsplit and exit the NAP loop.
+ *
+ * Splitting
+ * ---------
+ *
+ * The basic splitting procedure is fairly straight forward. However it is
+ * complicated by the fact that after the split occurs, the newly created
+ * subcores are not in a fully initialised state.
+ *
+ * Most notably the subcores do not have the correct value for SDR1, which
+ * means they must not be running in virtual mode when the split occurs. The
+ * subcores have separate timebases SPRs but these are pre-synchronised by
+ * opal.
+ *
+ * To begin with secondary threads are sent to an assembly routine. There they
+ * switch to real mode, so they are immune to the uninitialised SDR1 value.
+ * Once in real mode they indicate that they are in real mode, and spin waiting
+ * to see the core split.
+ *
+ * Thread 0 waits to see that all secondaries are in real mode, and then begins
+ * the splitting procedure. It firstly sets HID0_POWER8_DYNLPARDIS, which
+ * prevents the hardware from unsplitting. Then it sets the appropriate HID bit
+ * to request the split, and spins waiting to see that the split has happened.
+ *
+ * Concurrently the secondaries will notice the split. When they do they set up
+ * their SPRs, notably SDR1, and then they can return to virtual mode and exit
+ * the procedure.
+ */
+
+/* Initialised at boot by subcore_init() */
+static int subcores_per_core;
+
+/*
+ * Used to communicate to offline cpus that we want them to pop out of the
+ * offline loop and do a split or unsplit.
+ *
+ * 0 - no split happening
+ * 1 - unsplit in progress
+ * 2 - split to 2 in progress
+ * 4 - split to 4 in progress
+ */
+static int new_split_mode;
+
+static cpumask_var_t cpu_offline_mask;
+
+struct split_state {
+	u8 step;
+	u8 master;
+};
+
+static DEFINE_PER_CPU(struct split_state, split_state);
+
+static void wait_for_sync_step(int step)
+{
+	int i, cpu = smp_processor_id();
+
+	for (i = cpu + 1; i < cpu + threads_per_core; i++)
+		while(per_cpu(split_state, i).step < step)
+			barrier();
+
+	/* Order the wait loop vs any subsequent loads/stores. */
+	mb();
+}
+
+static void update_hid_in_slw(u64 hid0)
+{
+	u64 idle_states = pnv_get_supported_cpuidle_states();
+
+	if (idle_states & OPAL_PM_WINKLE_ENABLED) {
+		/* OPAL call to patch slw with the new HID0 value */
+		u64 cpu_pir = hard_smp_processor_id();
+
+		opal_slw_set_reg(cpu_pir, SPRN_HID0, hid0);
+	}
+}
+
+static void unsplit_core(void)
+{
+	u64 hid0, mask;
+	int i, cpu;
+
+	mask = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
+
+	cpu = smp_processor_id();
+	if (cpu_thread_in_core(cpu) != 0) {
+		while (mfspr(SPRN_HID0) & mask)
+			power7_nap(0);
+
+		per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT;
+		return;
+	}
+
+	hid0 = mfspr(SPRN_HID0);
+	hid0 &= ~HID0_POWER8_DYNLPARDIS;
+	mtspr(SPRN_HID0, hid0);
+	update_hid_in_slw(hid0);
+
+	while (mfspr(SPRN_HID0) & mask)
+		cpu_relax();
+
+	/* Wake secondaries out of NAP */
+	for (i = cpu + 1; i < cpu + threads_per_core; i++)
+		smp_send_reschedule(i);
+
+	wait_for_sync_step(SYNC_STEP_UNSPLIT);
+}
+
+static void split_core(int new_mode)
+{
+	struct {  u64 value; u64 mask; } split_parms[2] = {
+		{ HID0_POWER8_1TO2LPAR, HID0_POWER8_2LPARMODE },
+		{ HID0_POWER8_1TO4LPAR, HID0_POWER8_4LPARMODE }
+	};
+	int i, cpu;
+	u64 hid0;
+
+	/* Convert new_mode (2 or 4) into an index into our parms array */
+	i = (new_mode >> 1) - 1;
+	BUG_ON(i < 0 || i > 1);
+
+	cpu = smp_processor_id();
+	if (cpu_thread_in_core(cpu) != 0) {
+		split_core_secondary_loop(&per_cpu(split_state, cpu).step);
+		return;
+	}
+
+	wait_for_sync_step(SYNC_STEP_REAL_MODE);
+
+	/* Write new mode */
+	hid0  = mfspr(SPRN_HID0);
+	hid0 |= HID0_POWER8_DYNLPARDIS | split_parms[i].value;
+	mtspr(SPRN_HID0, hid0);
+	update_hid_in_slw(hid0);
+
+	/* Wait for it to happen */
+	while (!(mfspr(SPRN_HID0) & split_parms[i].mask))
+		cpu_relax();
+}
+
+static void cpu_do_split(int new_mode)
+{
+	/*
+	 * At boot subcores_per_core will be 0, so we will always unsplit at
+	 * boot. In the usual case where the core is already unsplit it's a
+	 * nop, and this just ensures the kernel's notion of the mode is
+	 * consistent with the hardware.
+	 */
+	if (subcores_per_core != 1)
+		unsplit_core();
+
+	if (new_mode != 1)
+		split_core(new_mode);
+
+	mb();
+	per_cpu(split_state, smp_processor_id()).step = SYNC_STEP_FINISHED;
+}
+
+bool cpu_core_split_required(void)
+{
+	smp_rmb();
+
+	if (!new_split_mode)
+		return false;
+
+	cpu_do_split(new_split_mode);
+
+	return true;
+}
+
+void update_subcore_sibling_mask(void)
+{
+	int cpu;
+	/*
+	 * sibling mask for the first cpu. Left shift this by required bits
+	 * to get sibling mask for the rest of the cpus.
+	 */
+	int sibling_mask_first_cpu =  (1 << threads_per_subcore) - 1;
+
+	for_each_possible_cpu(cpu) {
+		int tid = cpu_thread_in_core(cpu);
+		int offset = (tid / threads_per_subcore) * threads_per_subcore;
+		int mask = sibling_mask_first_cpu << offset;
+
+		paca[cpu].subcore_sibling_mask = mask;
+
+	}
+}
+
+static int cpu_update_split_mode(void *data)
+{
+	int cpu, new_mode = *(int *)data;
+
+	if (this_cpu_ptr(&split_state)->master) {
+		new_split_mode = new_mode;
+		smp_wmb();
+
+		cpumask_andnot(cpu_offline_mask, cpu_present_mask,
+			       cpu_online_mask);
+
+		/* This should work even though the cpu is offline */
+		for_each_cpu(cpu, cpu_offline_mask)
+			smp_send_reschedule(cpu);
+	}
+
+	cpu_do_split(new_mode);
+
+	if (this_cpu_ptr(&split_state)->master) {
+		/* Wait for all cpus to finish before we touch subcores_per_core */
+		for_each_present_cpu(cpu) {
+			if (cpu >= setup_max_cpus)
+				break;
+
+			while(per_cpu(split_state, cpu).step < SYNC_STEP_FINISHED)
+				barrier();
+		}
+
+		new_split_mode = 0;
+
+		/* Make the new mode public */
+		subcores_per_core = new_mode;
+		threads_per_subcore = threads_per_core / subcores_per_core;
+		update_subcore_sibling_mask();
+
+		/* Make sure the new mode is written before we exit */
+		mb();
+	}
+
+	return 0;
+}
+
+static int set_subcores_per_core(int new_mode)
+{
+	struct split_state *state;
+	int cpu;
+
+	if (kvm_hv_mode_active()) {
+		pr_err("Unable to change split core mode while KVM active.\n");
+		return -EBUSY;
+	}
+
+	/*
+	 * We are only called at boot, or from the sysfs write. If that ever
+	 * changes we'll need a lock here.
+	 */
+	BUG_ON(new_mode < 1 || new_mode > 4 || new_mode == 3);
+
+	for_each_present_cpu(cpu) {
+		state = &per_cpu(split_state, cpu);
+		state->step = SYNC_STEP_INITIAL;
+		state->master = 0;
+	}
+
+	get_online_cpus();
+
+	/* This cpu will update the globals before exiting stop machine */
+	this_cpu_ptr(&split_state)->master = 1;
+
+	/* Ensure state is consistent before we call the other cpus */
+	mb();
+
+	stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask);
+
+	put_online_cpus();
+
+	return 0;
+}
+
+static ssize_t __used store_subcores_per_core(struct device *dev,
+		struct device_attribute *attr, const char *buf,
+		size_t count)
+{
+	unsigned long val;
+	int rc;
+
+	/* We are serialised by the attribute lock */
+
+	rc = sscanf(buf, "%lx", &val);
+	if (rc != 1)
+		return -EINVAL;
+
+	switch (val) {
+	case 1:
+	case 2:
+	case 4:
+		if (subcores_per_core == val)
+			/* Nothing to do */
+			goto out;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	rc = set_subcores_per_core(val);
+	if (rc)
+		return rc;
+
+out:
+	return count;
+}
+
+static ssize_t show_subcores_per_core(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%x\n", subcores_per_core);
+}
+
+static DEVICE_ATTR(subcores_per_core, 0644,
+		show_subcores_per_core, store_subcores_per_core);
+
+static int subcore_init(void)
+{
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return 0;
+
+	/*
+	 * We need all threads in a core to be present to split/unsplit so
+         * continue only if max_cpus are aligned to threads_per_core.
+	 */
+	if (setup_max_cpus % threads_per_core)
+		return 0;
+
+	BUG_ON(!alloc_cpumask_var(&cpu_offline_mask, GFP_KERNEL));
+
+	set_subcores_per_core(1);
+
+	return device_create_file(cpu_subsys.dev_root,
+				  &dev_attr_subcores_per_core);
+}
+machine_device_initcall(powernv, subcore_init);
diff --git a/kernel/arch/powerpc/platforms/powernv/subcore.h b/kernel/arch/powerpc/platforms/powernv/subcore.h
new file mode 100644
index 000000000..84e02ae52
--- /dev/null
+++ b/kernel/arch/powerpc/platforms/powernv/subcore.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/* These are ordered and tested with <= */
+#define SYNC_STEP_INITIAL	0
+#define SYNC_STEP_UNSPLIT	1	/* Set by secondary when it sees unsplit */
+#define SYNC_STEP_REAL_MODE	2	/* Set by secondary when in real mode  */
+#define SYNC_STEP_FINISHED	3	/* Set by secondary when split/unsplit is done */
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_SMP
+void split_core_secondary_loop(u8 *state);
+extern void update_subcore_sibling_mask(void);
+#else
+static inline void update_subcore_sibling_mask(void) { };
+#endif /* CONFIG_SMP */
+
+#endif /* __ASSEMBLY__ */