diff options
Diffstat (limited to 'kernel/arch/powerpc/platforms/powernv')
31 files changed, 11774 insertions, 0 deletions
diff --git a/kernel/arch/powerpc/platforms/powernv/Kconfig b/kernel/arch/powerpc/platforms/powernv/Kconfig new file mode 100644 index 000000000..4b044d8cb --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/Kconfig @@ -0,0 +1,21 @@ +config PPC_POWERNV + depends on PPC64 && PPC_BOOK3S + bool "IBM PowerNV (Non-Virtualized) platform support" + select PPC_NATIVE + select PPC_XICS + select PPC_ICP_NATIVE + select PPC_P7_NAP + select PPC_PCI_CHOICE if EMBEDDED + select EPAPR_BOOT + select PPC_INDIRECT_PIO + select PPC_UDBG_16550 + select PPC_SCOM + select ARCH_RANDOM + select CPU_FREQ + select CPU_FREQ_GOV_PERFORMANCE + select CPU_FREQ_GOV_POWERSAVE + select CPU_FREQ_GOV_USERSPACE + select CPU_FREQ_GOV_ONDEMAND + select CPU_FREQ_GOV_CONSERVATIVE + select PPC_DOORBELL + default y diff --git a/kernel/arch/powerpc/platforms/powernv/Makefile b/kernel/arch/powerpc/platforms/powernv/Makefile new file mode 100644 index 000000000..33e44f372 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/Makefile @@ -0,0 +1,11 @@ +obj-y += setup.o opal-wrappers.o opal.o opal-async.o +obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o +obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o +obj-y += opal-msglog.o opal-hmi.o opal-power.o + +obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o +obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o +obj-$(CONFIG_EEH) += eeh-powernv.o +obj-$(CONFIG_PPC_SCOM) += opal-xscom.o +obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o +obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o diff --git a/kernel/arch/powerpc/platforms/powernv/eeh-powernv.c b/kernel/arch/powerpc/platforms/powernv/eeh-powernv.c new file mode 100644 index 000000000..ce738ab3d --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -0,0 +1,1542 @@ +/* + * The file intends to implement the platform dependent EEH operations on + * powernv platform. Actually, the powernv was created in order to fully + * hypervisor support. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/atomic.h> +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/msi.h> +#include <linux/of.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> +#include <linux/rbtree.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> + +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/firmware.h> +#include <asm/io.h> +#include <asm/iommu.h> +#include <asm/machdep.h> +#include <asm/msi_bitmap.h> +#include <asm/opal.h> +#include <asm/ppc-pci.h> + +#include "powernv.h" +#include "pci.h" + +static bool pnv_eeh_nb_init = false; + +/** + * pnv_eeh_init - EEH platform dependent initialization + * + * EEH platform dependent initialization on powernv + */ +static int pnv_eeh_init(void) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + + /* We require OPALv3 */ + if (!firmware_has_feature(FW_FEATURE_OPALv3)) { + pr_warn("%s: OPALv3 is required !\n", + __func__); + return -EINVAL; + } + + /* Set probe mode */ + eeh_add_flag(EEH_PROBE_MODE_DEV); + + /* + * P7IOC blocks PCI config access to frozen PE, but PHB3 + * doesn't do that. So we have to selectively enable I/O + * prior to collecting error log. + */ + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->model == PNV_PHB_MODEL_P7IOC) + eeh_add_flag(EEH_ENABLE_IO_FOR_LOG); + + /* + * PE#0 should be regarded as valid by EEH core + * if it's not the reserved one. Currently, we + * have the reserved PE#0 and PE#127 for PHB3 + * and P7IOC separately. So we should regard + * PE#0 as valid for P7IOC. + */ + if (phb->ioda.reserved_pe != 0) + eeh_add_flag(EEH_VALID_PE_ZERO); + + break; + } + + return 0; +} + +static int pnv_eeh_event(struct notifier_block *nb, + unsigned long events, void *change) +{ + uint64_t changed_evts = (uint64_t)change; + + /* + * We simply send special EEH event if EEH has + * been enabled, or clear pending events in + * case that we enable EEH soon + */ + if (!(changed_evts & OPAL_EVENT_PCI_ERROR) || + !(events & OPAL_EVENT_PCI_ERROR)) + return 0; + + if (eeh_enabled()) + eeh_send_failure_event(NULL); + else + opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); + + return 0; +} + +static struct notifier_block pnv_eeh_nb = { + .notifier_call = pnv_eeh_event, + .next = NULL, + .priority = 0 +}; + +#ifdef CONFIG_DEBUG_FS +static ssize_t pnv_eeh_ei_write(struct file *filp, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct pci_controller *hose = filp->private_data; + struct eeh_dev *edev; + struct eeh_pe *pe; + int pe_no, type, func; + unsigned long addr, mask; + char buf[50]; + int ret; + + if (!eeh_ops || !eeh_ops->err_inject) + return -ENXIO; + + /* Copy over argument buffer */ + ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count); + if (!ret) + return -EFAULT; + + /* Retrieve parameters */ + ret = sscanf(buf, "%x:%x:%x:%lx:%lx", + &pe_no, &type, &func, &addr, &mask); + if (ret != 5) + return -EINVAL; + + /* Retrieve PE */ + edev = kzalloc(sizeof(*edev), GFP_KERNEL); + if (!edev) + return -ENOMEM; + edev->phb = hose; + edev->pe_config_addr = pe_no; + pe = eeh_pe_get(edev); + kfree(edev); + if (!pe) + return -ENODEV; + + /* Do error injection */ + ret = eeh_ops->err_inject(pe, type, func, addr, mask); + return ret < 0 ? ret : count; +} + +static const struct file_operations pnv_eeh_ei_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = pnv_eeh_ei_write, +}; + +static int pnv_eeh_dbgfs_set(void *data, int offset, u64 val) +{ + struct pci_controller *hose = data; + struct pnv_phb *phb = hose->private_data; + + out_be64(phb->regs + offset, val); + return 0; +} + +static int pnv_eeh_dbgfs_get(void *data, int offset, u64 *val) +{ + struct pci_controller *hose = data; + struct pnv_phb *phb = hose->private_data; + + *val = in_be64(phb->regs + offset); + return 0; +} + +static int pnv_eeh_outb_dbgfs_set(void *data, u64 val) +{ + return pnv_eeh_dbgfs_set(data, 0xD10, val); +} + +static int pnv_eeh_outb_dbgfs_get(void *data, u64 *val) +{ + return pnv_eeh_dbgfs_get(data, 0xD10, val); +} + +static int pnv_eeh_inbA_dbgfs_set(void *data, u64 val) +{ + return pnv_eeh_dbgfs_set(data, 0xD90, val); +} + +static int pnv_eeh_inbA_dbgfs_get(void *data, u64 *val) +{ + return pnv_eeh_dbgfs_get(data, 0xD90, val); +} + +static int pnv_eeh_inbB_dbgfs_set(void *data, u64 val) +{ + return pnv_eeh_dbgfs_set(data, 0xE10, val); +} + +static int pnv_eeh_inbB_dbgfs_get(void *data, u64 *val) +{ + return pnv_eeh_dbgfs_get(data, 0xE10, val); +} + +DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_outb_dbgfs_ops, pnv_eeh_outb_dbgfs_get, + pnv_eeh_outb_dbgfs_set, "0x%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbA_dbgfs_ops, pnv_eeh_inbA_dbgfs_get, + pnv_eeh_inbA_dbgfs_set, "0x%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbB_dbgfs_ops, pnv_eeh_inbB_dbgfs_get, + pnv_eeh_inbB_dbgfs_set, "0x%llx\n"); +#endif /* CONFIG_DEBUG_FS */ + +/** + * pnv_eeh_post_init - EEH platform dependent post initialization + * + * EEH platform dependent post initialization on powernv. When + * the function is called, the EEH PEs and devices should have + * been built. If the I/O cache staff has been built, EEH is + * ready to supply service. + */ +static int pnv_eeh_post_init(void) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + int ret = 0; + + /* Register OPAL event notifier */ + if (!pnv_eeh_nb_init) { + ret = opal_notifier_register(&pnv_eeh_nb); + if (ret) { + pr_warn("%s: Can't register OPAL event notifier (%d)\n", + __func__, ret); + return ret; + } + + pnv_eeh_nb_init = true; + } + + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + + /* + * If EEH is enabled, we're going to rely on that. + * Otherwise, we restore to conventional mechanism + * to clear frozen PE during PCI config access. + */ + if (eeh_enabled()) + phb->flags |= PNV_PHB_FLAG_EEH; + else + phb->flags &= ~PNV_PHB_FLAG_EEH; + + /* Create debugfs entries */ +#ifdef CONFIG_DEBUG_FS + if (phb->has_dbgfs || !phb->dbgfs) + continue; + + phb->has_dbgfs = 1; + debugfs_create_file("err_injct", 0200, + phb->dbgfs, hose, + &pnv_eeh_ei_fops); + + debugfs_create_file("err_injct_outbound", 0600, + phb->dbgfs, hose, + &pnv_eeh_outb_dbgfs_ops); + debugfs_create_file("err_injct_inboundA", 0600, + phb->dbgfs, hose, + &pnv_eeh_inbA_dbgfs_ops); + debugfs_create_file("err_injct_inboundB", 0600, + phb->dbgfs, hose, + &pnv_eeh_inbB_dbgfs_ops); +#endif /* CONFIG_DEBUG_FS */ + } + + + return ret; +} + +static int pnv_eeh_cap_start(struct pci_dn *pdn) +{ + u32 status; + + if (!pdn) + return 0; + + pnv_pci_cfg_read(pdn, PCI_STATUS, 2, &status); + if (!(status & PCI_STATUS_CAP_LIST)) + return 0; + + return PCI_CAPABILITY_LIST; +} + +static int pnv_eeh_find_cap(struct pci_dn *pdn, int cap) +{ + int pos = pnv_eeh_cap_start(pdn); + int cnt = 48; /* Maximal number of capabilities */ + u32 id; + + if (!pos) + return 0; + + while (cnt--) { + pnv_pci_cfg_read(pdn, pos, 1, &pos); + if (pos < 0x40) + break; + + pos &= ~3; + pnv_pci_cfg_read(pdn, pos + PCI_CAP_LIST_ID, 1, &id); + if (id == 0xff) + break; + + /* Found */ + if (id == cap) + return pos; + + /* Next one */ + pos += PCI_CAP_LIST_NEXT; + } + + return 0; +} + +static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap) +{ + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + u32 header; + int pos = 256, ttl = (4096 - 256) / 8; + + if (!edev || !edev->pcie_cap) + return 0; + if (pnv_pci_cfg_read(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + return 0; + else if (!header) + return 0; + + while (ttl-- > 0) { + if (PCI_EXT_CAP_ID(header) == cap && pos) + return pos; + + pos = PCI_EXT_CAP_NEXT(header); + if (pos < 256) + break; + + if (pnv_pci_cfg_read(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + break; + } + + return 0; +} + +/** + * pnv_eeh_probe - Do probe on PCI device + * @pdn: PCI device node + * @data: unused + * + * When EEH module is installed during system boot, all PCI devices + * are checked one by one to see if it supports EEH. The function + * is introduced for the purpose. By default, EEH has been enabled + * on all PCI devices. That's to say, we only need do necessary + * initialization on the corresponding eeh device and create PE + * accordingly. + * + * It's notable that's unsafe to retrieve the EEH device through + * the corresponding PCI device. During the PCI device hotplug, which + * was possiblly triggered by EEH core, the binding between EEH device + * and the PCI device isn't built yet. + */ +static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) +{ + struct pci_controller *hose = pdn->phb; + struct pnv_phb *phb = hose->private_data; + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + uint32_t pcie_flags; + int ret; + + /* + * When probing the root bridge, which doesn't have any + * subordinate PCI devices. We don't have OF node for + * the root bridge. So it's not reasonable to continue + * the probing. + */ + if (!edev || edev->pe) + return NULL; + + /* Skip for PCI-ISA bridge */ + if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) + return NULL; + + /* Initialize eeh device */ + edev->class_code = pdn->class_code; + edev->mode &= 0xFFFFFF00; + edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX); + edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP); + edev->aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR); + if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { + edev->mode |= EEH_DEV_BRIDGE; + if (edev->pcie_cap) { + pnv_pci_cfg_read(pdn, edev->pcie_cap + PCI_EXP_FLAGS, + 2, &pcie_flags); + pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4; + if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT) + edev->mode |= EEH_DEV_ROOT_PORT; + else if (pcie_flags == PCI_EXP_TYPE_DOWNSTREAM) + edev->mode |= EEH_DEV_DS_PORT; + } + } + + edev->config_addr = (pdn->busno << 8) | (pdn->devfn); + edev->pe_config_addr = phb->ioda.pe_rmap[edev->config_addr]; + + /* Create PE */ + ret = eeh_add_to_parent_pe(edev); + if (ret) { + pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%d)\n", + __func__, hose->global_number, pdn->busno, + PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret); + return NULL; + } + + /* + * If the PE contains any one of following adapters, the + * PCI config space can't be accessed when dumping EEH log. + * Otherwise, we will run into fenced PHB caused by shortage + * of outbound credits in the adapter. The PCI config access + * should be blocked until PE reset. MMIO access is dropped + * by hardware certainly. In order to drop PCI config requests, + * one more flag (EEH_PE_CFG_RESTRICTED) is introduced, which + * will be checked in the backend for PE state retrival. If + * the PE becomes frozen for the first time and the flag has + * been set for the PE, we will set EEH_PE_CFG_BLOCKED for + * that PE to block its config space. + * + * Broadcom Austin 4-ports NICs (14e4:1657) + * Broadcom Shiner 2-ports 10G NICs (14e4:168e) + */ + if ((pdn->vendor_id == PCI_VENDOR_ID_BROADCOM && + pdn->device_id == 0x1657) || + (pdn->vendor_id == PCI_VENDOR_ID_BROADCOM && + pdn->device_id == 0x168e)) + edev->pe->state |= EEH_PE_CFG_RESTRICTED; + + /* + * Cache the PE primary bus, which can't be fetched when + * full hotplug is in progress. In that case, all child + * PCI devices of the PE are expected to be removed prior + * to PE reset. + */ + if (!edev->pe->bus) + edev->pe->bus = pci_find_bus(hose->global_number, + pdn->busno); + + /* + * Enable EEH explicitly so that we will do EEH check + * while accessing I/O stuff + */ + eeh_add_flag(EEH_ENABLED); + + /* Save memory bars */ + eeh_save_bars(edev); + + return NULL; +} + +/** + * pnv_eeh_set_option - Initialize EEH or MMIO/DMA reenable + * @pe: EEH PE + * @option: operation to be issued + * + * The function is used to control the EEH functionality globally. + * Currently, following options are support according to PAPR: + * Enable EEH, Disable EEH, Enable MMIO and Enable DMA + */ +static int pnv_eeh_set_option(struct eeh_pe *pe, int option) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + bool freeze_pe = false; + int opt, ret = 0; + s64 rc; + + /* Sanity check on option */ + switch (option) { + case EEH_OPT_DISABLE: + return -EPERM; + case EEH_OPT_ENABLE: + return 0; + case EEH_OPT_THAW_MMIO: + opt = OPAL_EEH_ACTION_CLEAR_FREEZE_MMIO; + break; + case EEH_OPT_THAW_DMA: + opt = OPAL_EEH_ACTION_CLEAR_FREEZE_DMA; + break; + case EEH_OPT_FREEZE_PE: + freeze_pe = true; + opt = OPAL_EEH_ACTION_SET_FREEZE_ALL; + break; + default: + pr_warn("%s: Invalid option %d\n", __func__, option); + return -EINVAL; + } + + /* If PHB supports compound PE, to handle it */ + if (freeze_pe) { + if (phb->freeze_pe) { + phb->freeze_pe(phb, pe->addr); + } else { + rc = opal_pci_eeh_freeze_set(phb->opal_id, + pe->addr, opt); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld freezing " + "PHB#%x-PE#%x\n", + __func__, rc, + phb->hose->global_number, pe->addr); + ret = -EIO; + } + } + } else { + if (phb->unfreeze_pe) { + ret = phb->unfreeze_pe(phb, pe->addr, opt); + } else { + rc = opal_pci_eeh_freeze_clear(phb->opal_id, + pe->addr, opt); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld enable %d " + "for PHB#%x-PE#%x\n", + __func__, rc, option, + phb->hose->global_number, pe->addr); + ret = -EIO; + } + } + } + + return ret; +} + +/** + * pnv_eeh_get_pe_addr - Retrieve PE address + * @pe: EEH PE + * + * Retrieve the PE address according to the given tranditional + * PCI BDF (Bus/Device/Function) address. + */ +static int pnv_eeh_get_pe_addr(struct eeh_pe *pe) +{ + return pe->addr; +} + +static void pnv_eeh_get_phb_diag(struct eeh_pe *pe) +{ + struct pnv_phb *phb = pe->phb->private_data; + s64 rc; + + rc = opal_pci_get_phb_diag_data2(phb->opal_id, pe->data, + PNV_PCI_DIAG_BUF_SIZE); + if (rc != OPAL_SUCCESS) + pr_warn("%s: Failure %lld getting PHB#%x diag-data\n", + __func__, rc, pe->phb->global_number); +} + +static int pnv_eeh_get_phb_state(struct eeh_pe *pe) +{ + struct pnv_phb *phb = pe->phb->private_data; + u8 fstate; + __be16 pcierr; + s64 rc; + int result = 0; + + rc = opal_pci_eeh_freeze_status(phb->opal_id, + pe->addr, + &fstate, + &pcierr, + NULL); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld getting PHB#%x state\n", + __func__, rc, phb->hose->global_number); + return EEH_STATE_NOT_SUPPORT; + } + + /* + * Check PHB state. If the PHB is frozen for the + * first time, to dump the PHB diag-data. + */ + if (be16_to_cpu(pcierr) != OPAL_EEH_PHB_ERROR) { + result = (EEH_STATE_MMIO_ACTIVE | + EEH_STATE_DMA_ACTIVE | + EEH_STATE_MMIO_ENABLED | + EEH_STATE_DMA_ENABLED); + } else if (!(pe->state & EEH_PE_ISOLATED)) { + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + pnv_eeh_get_phb_diag(pe); + + if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) + pnv_pci_dump_phb_diag_data(pe->phb, pe->data); + } + + return result; +} + +static int pnv_eeh_get_pe_state(struct eeh_pe *pe) +{ + struct pnv_phb *phb = pe->phb->private_data; + u8 fstate; + __be16 pcierr; + s64 rc; + int result; + + /* + * We don't clobber hardware frozen state until PE + * reset is completed. In order to keep EEH core + * moving forward, we have to return operational + * state during PE reset. + */ + if (pe->state & EEH_PE_RESET) { + result = (EEH_STATE_MMIO_ACTIVE | + EEH_STATE_DMA_ACTIVE | + EEH_STATE_MMIO_ENABLED | + EEH_STATE_DMA_ENABLED); + return result; + } + + /* + * Fetch PE state from hardware. If the PHB + * supports compound PE, let it handle that. + */ + if (phb->get_pe_state) { + fstate = phb->get_pe_state(phb, pe->addr); + } else { + rc = opal_pci_eeh_freeze_status(phb->opal_id, + pe->addr, + &fstate, + &pcierr, + NULL); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld getting PHB#%x-PE%x state\n", + __func__, rc, phb->hose->global_number, + pe->addr); + return EEH_STATE_NOT_SUPPORT; + } + } + + /* Figure out state */ + switch (fstate) { + case OPAL_EEH_STOPPED_NOT_FROZEN: + result = (EEH_STATE_MMIO_ACTIVE | + EEH_STATE_DMA_ACTIVE | + EEH_STATE_MMIO_ENABLED | + EEH_STATE_DMA_ENABLED); + break; + case OPAL_EEH_STOPPED_MMIO_FREEZE: + result = (EEH_STATE_DMA_ACTIVE | + EEH_STATE_DMA_ENABLED); + break; + case OPAL_EEH_STOPPED_DMA_FREEZE: + result = (EEH_STATE_MMIO_ACTIVE | + EEH_STATE_MMIO_ENABLED); + break; + case OPAL_EEH_STOPPED_MMIO_DMA_FREEZE: + result = 0; + break; + case OPAL_EEH_STOPPED_RESET: + result = EEH_STATE_RESET_ACTIVE; + break; + case OPAL_EEH_STOPPED_TEMP_UNAVAIL: + result = EEH_STATE_UNAVAILABLE; + break; + case OPAL_EEH_STOPPED_PERM_UNAVAIL: + result = EEH_STATE_NOT_SUPPORT; + break; + default: + result = EEH_STATE_NOT_SUPPORT; + pr_warn("%s: Invalid PHB#%x-PE#%x state %x\n", + __func__, phb->hose->global_number, + pe->addr, fstate); + } + + /* + * If PHB supports compound PE, to freeze all + * slave PEs for consistency. + * + * If the PE is switching to frozen state for the + * first time, to dump the PHB diag-data. + */ + if (!(result & EEH_STATE_NOT_SUPPORT) && + !(result & EEH_STATE_UNAVAILABLE) && + !(result & EEH_STATE_MMIO_ACTIVE) && + !(result & EEH_STATE_DMA_ACTIVE) && + !(pe->state & EEH_PE_ISOLATED)) { + if (phb->freeze_pe) + phb->freeze_pe(phb, pe->addr); + + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + pnv_eeh_get_phb_diag(pe); + + if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) + pnv_pci_dump_phb_diag_data(pe->phb, pe->data); + } + + return result; +} + +/** + * pnv_eeh_get_state - Retrieve PE state + * @pe: EEH PE + * @delay: delay while PE state is temporarily unavailable + * + * Retrieve the state of the specified PE. For IODA-compitable + * platform, it should be retrieved from IODA table. Therefore, + * we prefer passing down to hardware implementation to handle + * it. + */ +static int pnv_eeh_get_state(struct eeh_pe *pe, int *delay) +{ + int ret; + + if (pe->type & EEH_PE_PHB) + ret = pnv_eeh_get_phb_state(pe); + else + ret = pnv_eeh_get_pe_state(pe); + + if (!delay) + return ret; + + /* + * If the PE state is temporarily unavailable, + * to inform the EEH core delay for default + * period (1 second) + */ + *delay = 0; + if (ret & EEH_STATE_UNAVAILABLE) + *delay = 1000; + + return ret; +} + +static s64 pnv_eeh_phb_poll(struct pnv_phb *phb) +{ + s64 rc = OPAL_HARDWARE; + + while (1) { + rc = opal_pci_poll(phb->opal_id); + if (rc <= 0) + break; + + if (system_state < SYSTEM_RUNNING) + udelay(1000 * rc); + else + msleep(rc); + } + + return rc; +} + +int pnv_eeh_phb_reset(struct pci_controller *hose, int option) +{ + struct pnv_phb *phb = hose->private_data; + s64 rc = OPAL_HARDWARE; + + pr_debug("%s: Reset PHB#%x, option=%d\n", + __func__, hose->global_number, option); + + /* Issue PHB complete reset request */ + if (option == EEH_RESET_FUNDAMENTAL || + option == EEH_RESET_HOT) + rc = opal_pci_reset(phb->opal_id, + OPAL_RESET_PHB_COMPLETE, + OPAL_ASSERT_RESET); + else if (option == EEH_RESET_DEACTIVATE) + rc = opal_pci_reset(phb->opal_id, + OPAL_RESET_PHB_COMPLETE, + OPAL_DEASSERT_RESET); + if (rc < 0) + goto out; + + /* + * Poll state of the PHB until the request is done + * successfully. The PHB reset is usually PHB complete + * reset followed by hot reset on root bus. So we also + * need the PCI bus settlement delay. + */ + rc = pnv_eeh_phb_poll(phb); + if (option == EEH_RESET_DEACTIVATE) { + if (system_state < SYSTEM_RUNNING) + udelay(1000 * EEH_PE_RST_SETTLE_TIME); + else + msleep(EEH_PE_RST_SETTLE_TIME); + } +out: + if (rc != OPAL_SUCCESS) + return -EIO; + + return 0; +} + +static int pnv_eeh_root_reset(struct pci_controller *hose, int option) +{ + struct pnv_phb *phb = hose->private_data; + s64 rc = OPAL_HARDWARE; + + pr_debug("%s: Reset PHB#%x, option=%d\n", + __func__, hose->global_number, option); + + /* + * During the reset deassert time, we needn't care + * the reset scope because the firmware does nothing + * for fundamental or hot reset during deassert phase. + */ + if (option == EEH_RESET_FUNDAMENTAL) + rc = opal_pci_reset(phb->opal_id, + OPAL_RESET_PCI_FUNDAMENTAL, + OPAL_ASSERT_RESET); + else if (option == EEH_RESET_HOT) + rc = opal_pci_reset(phb->opal_id, + OPAL_RESET_PCI_HOT, + OPAL_ASSERT_RESET); + else if (option == EEH_RESET_DEACTIVATE) + rc = opal_pci_reset(phb->opal_id, + OPAL_RESET_PCI_HOT, + OPAL_DEASSERT_RESET); + if (rc < 0) + goto out; + + /* Poll state of the PHB until the request is done */ + rc = pnv_eeh_phb_poll(phb); + if (option == EEH_RESET_DEACTIVATE) + msleep(EEH_PE_RST_SETTLE_TIME); +out: + if (rc != OPAL_SUCCESS) + return -EIO; + + return 0; +} + +static int pnv_eeh_bridge_reset(struct pci_dev *dev, int option) +{ + struct pci_dn *pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + int aer = edev ? edev->aer_cap : 0; + u32 ctrl; + + pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n", + __func__, pci_domain_nr(dev->bus), + dev->bus->number, option); + + switch (option) { + case EEH_RESET_FUNDAMENTAL: + case EEH_RESET_HOT: + /* Don't report linkDown event */ + if (aer) { + eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK, + 4, &ctrl); + ctrl |= PCI_ERR_UNC_SURPDN; + eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK, + 4, ctrl); + } + + eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl); + ctrl |= PCI_BRIDGE_CTL_BUS_RESET; + eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl); + + msleep(EEH_PE_RST_HOLD_TIME); + break; + case EEH_RESET_DEACTIVATE: + eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl); + ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; + eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl); + + msleep(EEH_PE_RST_SETTLE_TIME); + + /* Continue reporting linkDown event */ + if (aer) { + eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK, + 4, &ctrl); + ctrl &= ~PCI_ERR_UNC_SURPDN; + eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK, + 4, ctrl); + } + + break; + } + + return 0; +} + +void pnv_pci_reset_secondary_bus(struct pci_dev *dev) +{ + struct pci_controller *hose; + + if (pci_is_root_bus(dev->bus)) { + hose = pci_bus_to_host(dev->bus); + pnv_eeh_root_reset(hose, EEH_RESET_HOT); + pnv_eeh_root_reset(hose, EEH_RESET_DEACTIVATE); + } else { + pnv_eeh_bridge_reset(dev, EEH_RESET_HOT); + pnv_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE); + } +} + +/** + * pnv_eeh_reset - Reset the specified PE + * @pe: EEH PE + * @option: reset option + * + * Do reset on the indicated PE. For PCI bus sensitive PE, + * we need to reset the parent p2p bridge. The PHB has to + * be reinitialized if the p2p bridge is root bridge. For + * PCI device sensitive PE, we will try to reset the device + * through FLR. For now, we don't have OPAL APIs to do HARD + * reset yet, so all reset would be SOFT (HOT) reset. + */ +static int pnv_eeh_reset(struct eeh_pe *pe, int option) +{ + struct pci_controller *hose = pe->phb; + struct pci_bus *bus; + int ret; + + /* + * For PHB reset, we always have complete reset. For those PEs whose + * primary bus derived from root complex (root bus) or root port + * (usually bus#1), we apply hot or fundamental reset on the root port. + * For other PEs, we always have hot reset on the PE primary bus. + * + * Here, we have different design to pHyp, which always clear the + * frozen state during PE reset. However, the good idea here from + * benh is to keep frozen state before we get PE reset done completely + * (until BAR restore). With the frozen state, HW drops illegal IO + * or MMIO access, which can incur recrusive frozen PE during PE + * reset. The side effect is that EEH core has to clear the frozen + * state explicitly after BAR restore. + */ + if (pe->type & EEH_PE_PHB) { + ret = pnv_eeh_phb_reset(hose, option); + } else { + struct pnv_phb *phb; + s64 rc; + + /* + * The frozen PE might be caused by PAPR error injection + * registers, which are expected to be cleared after hitting + * frozen PE as stated in the hardware spec. Unfortunately, + * that's not true on P7IOC. So we have to clear it manually + * to avoid recursive EEH errors during recovery. + */ + phb = hose->private_data; + if (phb->model == PNV_PHB_MODEL_P7IOC && + (option == EEH_RESET_HOT || + option == EEH_RESET_FUNDAMENTAL)) { + rc = opal_pci_reset(phb->opal_id, + OPAL_RESET_PHB_ERROR, + OPAL_ASSERT_RESET); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld clearing " + "error injection registers\n", + __func__, rc); + return -EIO; + } + } + + bus = eeh_pe_bus_get(pe); + if (pci_is_root_bus(bus) || + pci_is_root_bus(bus->parent)) + ret = pnv_eeh_root_reset(hose, option); + else + ret = pnv_eeh_bridge_reset(bus->self, option); + } + + return ret; +} + +/** + * pnv_eeh_wait_state - Wait for PE state + * @pe: EEH PE + * @max_wait: maximal period in microsecond + * + * Wait for the state of associated PE. It might take some time + * to retrieve the PE's state. + */ +static int pnv_eeh_wait_state(struct eeh_pe *pe, int max_wait) +{ + int ret; + int mwait; + + while (1) { + ret = pnv_eeh_get_state(pe, &mwait); + + /* + * If the PE's state is temporarily unavailable, + * we have to wait for the specified time. Otherwise, + * the PE's state will be returned immediately. + */ + if (ret != EEH_STATE_UNAVAILABLE) + return ret; + + max_wait -= mwait; + if (max_wait <= 0) { + pr_warn("%s: Timeout getting PE#%x's state (%d)\n", + __func__, pe->addr, max_wait); + return EEH_STATE_NOT_SUPPORT; + } + + msleep(mwait); + } + + return EEH_STATE_NOT_SUPPORT; +} + +/** + * pnv_eeh_get_log - Retrieve error log + * @pe: EEH PE + * @severity: temporary or permanent error log + * @drv_log: driver log to be combined with retrieved error log + * @len: length of driver log + * + * Retrieve the temporary or permanent error from the PE. + */ +static int pnv_eeh_get_log(struct eeh_pe *pe, int severity, + char *drv_log, unsigned long len) +{ + if (!eeh_has_flag(EEH_EARLY_DUMP_LOG)) + pnv_pci_dump_phb_diag_data(pe->phb, pe->data); + + return 0; +} + +/** + * pnv_eeh_configure_bridge - Configure PCI bridges in the indicated PE + * @pe: EEH PE + * + * The function will be called to reconfigure the bridges included + * in the specified PE so that the mulfunctional PE would be recovered + * again. + */ +static int pnv_eeh_configure_bridge(struct eeh_pe *pe) +{ + return 0; +} + +/** + * pnv_pe_err_inject - Inject specified error to the indicated PE + * @pe: the indicated PE + * @type: error type + * @func: specific error type + * @addr: address + * @mask: address mask + * + * The routine is called to inject specified error, which is + * determined by @type and @func, to the indicated PE for + * testing purpose. + */ +static int pnv_eeh_err_inject(struct eeh_pe *pe, int type, int func, + unsigned long addr, unsigned long mask) +{ + struct pci_controller *hose = pe->phb; + struct pnv_phb *phb = hose->private_data; + s64 rc; + + /* Sanity check on error type */ + if (type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR && + type != OPAL_ERR_INJECT_TYPE_IOA_BUS_ERR64) { + pr_warn("%s: Invalid error type %d\n", + __func__, type); + return -ERANGE; + } + + if (func < OPAL_ERR_INJECT_FUNC_IOA_LD_MEM_ADDR || + func > OPAL_ERR_INJECT_FUNC_IOA_DMA_WR_TARGET) { + pr_warn("%s: Invalid error function %d\n", + __func__, func); + return -ERANGE; + } + + /* Firmware supports error injection ? */ + if (!opal_check_token(OPAL_PCI_ERR_INJECT)) { + pr_warn("%s: Firmware doesn't support error injection\n", + __func__); + return -ENXIO; + } + + /* Do error injection */ + rc = opal_pci_err_inject(phb->opal_id, pe->addr, + type, func, addr, mask); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld injecting error " + "%d-%d to PHB#%x-PE#%x\n", + __func__, rc, type, func, + hose->global_number, pe->addr); + return -EIO; + } + + return 0; +} + +static inline bool pnv_eeh_cfg_blocked(struct pci_dn *pdn) +{ + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + + if (!edev || !edev->pe) + return false; + + if (edev->pe->state & EEH_PE_CFG_BLOCKED) + return true; + + return false; +} + +static int pnv_eeh_read_config(struct pci_dn *pdn, + int where, int size, u32 *val) +{ + if (!pdn) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (pnv_eeh_cfg_blocked(pdn)) { + *val = 0xFFFFFFFF; + return PCIBIOS_SET_FAILED; + } + + return pnv_pci_cfg_read(pdn, where, size, val); +} + +static int pnv_eeh_write_config(struct pci_dn *pdn, + int where, int size, u32 val) +{ + if (!pdn) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (pnv_eeh_cfg_blocked(pdn)) + return PCIBIOS_SET_FAILED; + + return pnv_pci_cfg_write(pdn, where, size, val); +} + +static void pnv_eeh_dump_hub_diag_common(struct OpalIoP7IOCErrorData *data) +{ + /* GEM */ + if (data->gemXfir || data->gemRfir || + data->gemRirqfir || data->gemMask || data->gemRwof) + pr_info(" GEM: %016llx %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->gemXfir), + be64_to_cpu(data->gemRfir), + be64_to_cpu(data->gemRirqfir), + be64_to_cpu(data->gemMask), + be64_to_cpu(data->gemRwof)); + + /* LEM */ + if (data->lemFir || data->lemErrMask || + data->lemAction0 || data->lemAction1 || data->lemWof) + pr_info(" LEM: %016llx %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->lemFir), + be64_to_cpu(data->lemErrMask), + be64_to_cpu(data->lemAction0), + be64_to_cpu(data->lemAction1), + be64_to_cpu(data->lemWof)); +} + +static void pnv_eeh_get_and_dump_hub_diag(struct pci_controller *hose) +{ + struct pnv_phb *phb = hose->private_data; + struct OpalIoP7IOCErrorData *data = &phb->diag.hub_diag; + long rc; + + rc = opal_pci_get_hub_diag_data(phb->hub_id, data, sizeof(*data)); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failed to get HUB#%llx diag-data (%ld)\n", + __func__, phb->hub_id, rc); + return; + } + + switch (data->type) { + case OPAL_P7IOC_DIAG_TYPE_RGC: + pr_info("P7IOC diag-data for RGC\n\n"); + pnv_eeh_dump_hub_diag_common(data); + if (data->rgc.rgcStatus || data->rgc.rgcLdcp) + pr_info(" RGC: %016llx %016llx\n", + be64_to_cpu(data->rgc.rgcStatus), + be64_to_cpu(data->rgc.rgcLdcp)); + break; + case OPAL_P7IOC_DIAG_TYPE_BI: + pr_info("P7IOC diag-data for BI %s\n\n", + data->bi.biDownbound ? "Downbound" : "Upbound"); + pnv_eeh_dump_hub_diag_common(data); + if (data->bi.biLdcp0 || data->bi.biLdcp1 || + data->bi.biLdcp2 || data->bi.biFenceStatus) + pr_info(" BI: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->bi.biLdcp0), + be64_to_cpu(data->bi.biLdcp1), + be64_to_cpu(data->bi.biLdcp2), + be64_to_cpu(data->bi.biFenceStatus)); + break; + case OPAL_P7IOC_DIAG_TYPE_CI: + pr_info("P7IOC diag-data for CI Port %d\n\n", + data->ci.ciPort); + pnv_eeh_dump_hub_diag_common(data); + if (data->ci.ciPortStatus || data->ci.ciPortLdcp) + pr_info(" CI: %016llx %016llx\n", + be64_to_cpu(data->ci.ciPortStatus), + be64_to_cpu(data->ci.ciPortLdcp)); + break; + case OPAL_P7IOC_DIAG_TYPE_MISC: + pr_info("P7IOC diag-data for MISC\n\n"); + pnv_eeh_dump_hub_diag_common(data); + break; + case OPAL_P7IOC_DIAG_TYPE_I2C: + pr_info("P7IOC diag-data for I2C\n\n"); + pnv_eeh_dump_hub_diag_common(data); + break; + default: + pr_warn("%s: Invalid type of HUB#%llx diag-data (%d)\n", + __func__, phb->hub_id, data->type); + } +} + +static int pnv_eeh_get_pe(struct pci_controller *hose, + u16 pe_no, struct eeh_pe **pe) +{ + struct pnv_phb *phb = hose->private_data; + struct pnv_ioda_pe *pnv_pe; + struct eeh_pe *dev_pe; + struct eeh_dev edev; + + /* + * If PHB supports compound PE, to fetch + * the master PE because slave PE is invisible + * to EEH core. + */ + pnv_pe = &phb->ioda.pe_array[pe_no]; + if (pnv_pe->flags & PNV_IODA_PE_SLAVE) { + pnv_pe = pnv_pe->master; + WARN_ON(!pnv_pe || + !(pnv_pe->flags & PNV_IODA_PE_MASTER)); + pe_no = pnv_pe->pe_number; + } + + /* Find the PE according to PE# */ + memset(&edev, 0, sizeof(struct eeh_dev)); + edev.phb = hose; + edev.pe_config_addr = pe_no; + dev_pe = eeh_pe_get(&edev); + if (!dev_pe) + return -EEXIST; + + /* Freeze the (compound) PE */ + *pe = dev_pe; + if (!(dev_pe->state & EEH_PE_ISOLATED)) + phb->freeze_pe(phb, pe_no); + + /* + * At this point, we're sure the (compound) PE should + * have been frozen. However, we still need poke until + * hitting the frozen PE on top level. + */ + dev_pe = dev_pe->parent; + while (dev_pe && !(dev_pe->type & EEH_PE_PHB)) { + int ret; + int active_flags = (EEH_STATE_MMIO_ACTIVE | + EEH_STATE_DMA_ACTIVE); + + ret = eeh_ops->get_state(dev_pe, NULL); + if (ret <= 0 || (ret & active_flags) == active_flags) { + dev_pe = dev_pe->parent; + continue; + } + + /* Frozen parent PE */ + *pe = dev_pe; + if (!(dev_pe->state & EEH_PE_ISOLATED)) + phb->freeze_pe(phb, dev_pe->addr); + + /* Next one */ + dev_pe = dev_pe->parent; + } + + return 0; +} + +/** + * pnv_eeh_next_error - Retrieve next EEH error to handle + * @pe: Affected PE + * + * The function is expected to be called by EEH core while it gets + * special EEH event (without binding PE). The function calls to + * OPAL APIs for next error to handle. The informational error is + * handled internally by platform. However, the dead IOC, dead PHB, + * fenced PHB and frozen PE should be handled by EEH core eventually. + */ +static int pnv_eeh_next_error(struct eeh_pe **pe) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + struct eeh_pe *phb_pe, *parent_pe; + __be64 frozen_pe_no; + __be16 err_type, severity; + int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); + long rc; + int state, ret = EEH_NEXT_ERR_NONE; + + /* + * While running here, it's safe to purge the event queue. + * And we should keep the cached OPAL notifier event sychronized + * between the kernel and firmware. + */ + eeh_remove_event(NULL, false); + opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); + + list_for_each_entry(hose, &hose_list, list_node) { + /* + * If the subordinate PCI buses of the PHB has been + * removed or is exactly under error recovery, we + * needn't take care of it any more. + */ + phb = hose->private_data; + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe || (phb_pe->state & EEH_PE_ISOLATED)) + continue; + + rc = opal_pci_next_error(phb->opal_id, + &frozen_pe_no, &err_type, &severity); + if (rc != OPAL_SUCCESS) { + pr_devel("%s: Invalid return value on " + "PHB#%x (0x%lx) from opal_pci_next_error", + __func__, hose->global_number, rc); + continue; + } + + /* If the PHB doesn't have error, stop processing */ + if (be16_to_cpu(err_type) == OPAL_EEH_NO_ERROR || + be16_to_cpu(severity) == OPAL_EEH_SEV_NO_ERROR) { + pr_devel("%s: No error found on PHB#%x\n", + __func__, hose->global_number); + continue; + } + + /* + * Processing the error. We're expecting the error with + * highest priority reported upon multiple errors on the + * specific PHB. + */ + pr_devel("%s: Error (%d, %d, %llu) on PHB#%x\n", + __func__, be16_to_cpu(err_type), + be16_to_cpu(severity), be64_to_cpu(frozen_pe_no), + hose->global_number); + switch (be16_to_cpu(err_type)) { + case OPAL_EEH_IOC_ERROR: + if (be16_to_cpu(severity) == OPAL_EEH_SEV_IOC_DEAD) { + pr_err("EEH: dead IOC detected\n"); + ret = EEH_NEXT_ERR_DEAD_IOC; + } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) { + pr_info("EEH: IOC informative error " + "detected\n"); + pnv_eeh_get_and_dump_hub_diag(hose); + ret = EEH_NEXT_ERR_NONE; + } + + break; + case OPAL_EEH_PHB_ERROR: + if (be16_to_cpu(severity) == OPAL_EEH_SEV_PHB_DEAD) { + *pe = phb_pe; + pr_err("EEH: dead PHB#%x detected, " + "location: %s\n", + hose->global_number, + eeh_pe_loc_get(phb_pe)); + ret = EEH_NEXT_ERR_DEAD_PHB; + } else if (be16_to_cpu(severity) == + OPAL_EEH_SEV_PHB_FENCED) { + *pe = phb_pe; + pr_err("EEH: Fenced PHB#%x detected, " + "location: %s\n", + hose->global_number, + eeh_pe_loc_get(phb_pe)); + ret = EEH_NEXT_ERR_FENCED_PHB; + } else if (be16_to_cpu(severity) == OPAL_EEH_SEV_INF) { + pr_info("EEH: PHB#%x informative error " + "detected, location: %s\n", + hose->global_number, + eeh_pe_loc_get(phb_pe)); + pnv_eeh_get_phb_diag(phb_pe); + pnv_pci_dump_phb_diag_data(hose, phb_pe->data); + ret = EEH_NEXT_ERR_NONE; + } + + break; + case OPAL_EEH_PE_ERROR: + /* + * If we can't find the corresponding PE, we + * just try to unfreeze. + */ + if (pnv_eeh_get_pe(hose, + be64_to_cpu(frozen_pe_no), pe)) { + /* Try best to clear it */ + pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n", + hose->global_number, frozen_pe_no); + pr_info("EEH: PHB location: %s\n", + eeh_pe_loc_get(phb_pe)); + opal_pci_eeh_freeze_clear(phb->opal_id, + frozen_pe_no, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + ret = EEH_NEXT_ERR_NONE; + } else if ((*pe)->state & EEH_PE_ISOLATED || + eeh_pe_passed(*pe)) { + ret = EEH_NEXT_ERR_NONE; + } else { + pr_err("EEH: Frozen PE#%x " + "on PHB#%x detected\n", + (*pe)->addr, + (*pe)->phb->global_number); + pr_err("EEH: PE location: %s, " + "PHB location: %s\n", + eeh_pe_loc_get(*pe), + eeh_pe_loc_get(phb_pe)); + ret = EEH_NEXT_ERR_FROZEN_PE; + } + + break; + default: + pr_warn("%s: Unexpected error type %d\n", + __func__, be16_to_cpu(err_type)); + } + + /* + * EEH core will try recover from fenced PHB or + * frozen PE. In the time for frozen PE, EEH core + * enable IO path for that before collecting logs, + * but it ruins the site. So we have to dump the + * log in advance here. + */ + if ((ret == EEH_NEXT_ERR_FROZEN_PE || + ret == EEH_NEXT_ERR_FENCED_PHB) && + !((*pe)->state & EEH_PE_ISOLATED)) { + eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); + pnv_eeh_get_phb_diag(*pe); + + if (eeh_has_flag(EEH_EARLY_DUMP_LOG)) + pnv_pci_dump_phb_diag_data((*pe)->phb, + (*pe)->data); + } + + /* + * We probably have the frozen parent PE out there and + * we need have to handle frozen parent PE firstly. + */ + if (ret == EEH_NEXT_ERR_FROZEN_PE) { + parent_pe = (*pe)->parent; + while (parent_pe) { + /* Hit the ceiling ? */ + if (parent_pe->type & EEH_PE_PHB) + break; + + /* Frozen parent PE ? */ + state = eeh_ops->get_state(parent_pe, NULL); + if (state > 0 && + (state & active_flags) != active_flags) + *pe = parent_pe; + + /* Next parent level */ + parent_pe = parent_pe->parent; + } + + /* We possibly migrate to another PE */ + eeh_pe_state_mark(*pe, EEH_PE_ISOLATED); + } + + /* + * If we have no errors on the specific PHB or only + * informative error there, we continue poking it. + * Otherwise, we need actions to be taken by upper + * layer. + */ + if (ret > EEH_NEXT_ERR_INF) + break; + } + + return ret; +} + +static int pnv_eeh_restore_config(struct pci_dn *pdn) +{ + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + struct pnv_phb *phb; + s64 ret; + + if (!edev) + return -EEXIST; + + phb = edev->phb->private_data; + ret = opal_pci_reinit(phb->opal_id, + OPAL_REINIT_PCI_DEV, edev->config_addr); + if (ret) { + pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", + __func__, edev->config_addr, ret); + return -EIO; + } + + return 0; +} + +static struct eeh_ops pnv_eeh_ops = { + .name = "powernv", + .init = pnv_eeh_init, + .post_init = pnv_eeh_post_init, + .probe = pnv_eeh_probe, + .set_option = pnv_eeh_set_option, + .get_pe_addr = pnv_eeh_get_pe_addr, + .get_state = pnv_eeh_get_state, + .reset = pnv_eeh_reset, + .wait_state = pnv_eeh_wait_state, + .get_log = pnv_eeh_get_log, + .configure_bridge = pnv_eeh_configure_bridge, + .err_inject = pnv_eeh_err_inject, + .read_config = pnv_eeh_read_config, + .write_config = pnv_eeh_write_config, + .next_error = pnv_eeh_next_error, + .restore_config = pnv_eeh_restore_config +}; + +/** + * eeh_powernv_init - Register platform dependent EEH operations + * + * EEH initialization on powernv platform. This function should be + * called before any EEH related functions. + */ +static int __init eeh_powernv_init(void) +{ + int ret = -EINVAL; + + eeh_set_pe_aux_size(PNV_PCI_DIAG_BUF_SIZE); + ret = eeh_ops_register(&pnv_eeh_ops); + if (!ret) + pr_info("EEH: PowerNV platform initialized\n"); + else + pr_info("EEH: Failed to initialize PowerNV platform (%d)\n", ret); + + return ret; +} +machine_early_initcall(powernv, eeh_powernv_init); diff --git a/kernel/arch/powerpc/platforms/powernv/opal-async.c b/kernel/arch/powerpc/platforms/powernv/opal-async.c new file mode 100644 index 000000000..693b6cdac --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-async.c @@ -0,0 +1,208 @@ +/* + * PowerNV OPAL asynchronous completion interfaces + * + * Copyright 2013 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/spinlock.h> +#include <linux/wait.h> +#include <linux/gfp.h> +#include <linux/of.h> +#include <asm/machdep.h> +#include <asm/opal.h> + +#define N_ASYNC_COMPLETIONS 64 + +static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL}; +static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS); +static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait); +static DEFINE_SPINLOCK(opal_async_comp_lock); +static struct semaphore opal_async_sem; +static struct opal_msg *opal_async_responses; +static unsigned int opal_max_async_tokens; + +int __opal_async_get_token(void) +{ + unsigned long flags; + int token; + + spin_lock_irqsave(&opal_async_comp_lock, flags); + token = find_first_bit(opal_async_complete_map, opal_max_async_tokens); + if (token >= opal_max_async_tokens) { + token = -EBUSY; + goto out; + } + + if (__test_and_set_bit(token, opal_async_token_map)) { + token = -EBUSY; + goto out; + } + + __clear_bit(token, opal_async_complete_map); + +out: + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + return token; +} + +int opal_async_get_token_interruptible(void) +{ + int token; + + /* Wait until a token is available */ + if (down_interruptible(&opal_async_sem)) + return -ERESTARTSYS; + + token = __opal_async_get_token(); + if (token < 0) + up(&opal_async_sem); + + return token; +} +EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible); + +int __opal_async_release_token(int token) +{ + unsigned long flags; + + if (token < 0 || token >= opal_max_async_tokens) { + pr_err("%s: Passed token is out of range, token %d\n", + __func__, token); + return -EINVAL; + } + + spin_lock_irqsave(&opal_async_comp_lock, flags); + __set_bit(token, opal_async_complete_map); + __clear_bit(token, opal_async_token_map); + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + + return 0; +} + +int opal_async_release_token(int token) +{ + int ret; + + ret = __opal_async_release_token(token); + if (ret) + return ret; + + up(&opal_async_sem); + + return 0; +} +EXPORT_SYMBOL_GPL(opal_async_release_token); + +int opal_async_wait_response(uint64_t token, struct opal_msg *msg) +{ + if (token >= opal_max_async_tokens) { + pr_err("%s: Invalid token passed\n", __func__); + return -EINVAL; + } + + if (!msg) { + pr_err("%s: Invalid message pointer passed\n", __func__); + return -EINVAL; + } + + wait_event(opal_async_wait, test_bit(token, opal_async_complete_map)); + memcpy(msg, &opal_async_responses[token], sizeof(*msg)); + + return 0; +} +EXPORT_SYMBOL_GPL(opal_async_wait_response); + +static int opal_async_comp_event(struct notifier_block *nb, + unsigned long msg_type, void *msg) +{ + struct opal_msg *comp_msg = msg; + unsigned long flags; + uint64_t token; + + if (msg_type != OPAL_MSG_ASYNC_COMP) + return 0; + + token = be64_to_cpu(comp_msg->params[0]); + memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg)); + spin_lock_irqsave(&opal_async_comp_lock, flags); + __set_bit(token, opal_async_complete_map); + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + + wake_up(&opal_async_wait); + + return 0; +} + +static struct notifier_block opal_async_comp_nb = { + .notifier_call = opal_async_comp_event, + .next = NULL, + .priority = 0, +}; + +static int __init opal_async_comp_init(void) +{ + struct device_node *opal_node; + const __be32 *async; + int err; + + opal_node = of_find_node_by_path("/ibm,opal"); + if (!opal_node) { + pr_err("%s: Opal node not found\n", __func__); + err = -ENOENT; + goto out; + } + + async = of_get_property(opal_node, "opal-msg-async-num", NULL); + if (!async) { + pr_err("%s: %s has no opal-msg-async-num\n", + __func__, opal_node->full_name); + err = -ENOENT; + goto out_opal_node; + } + + opal_max_async_tokens = be32_to_cpup(async); + if (opal_max_async_tokens > N_ASYNC_COMPLETIONS) + opal_max_async_tokens = N_ASYNC_COMPLETIONS; + + err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP, + &opal_async_comp_nb); + if (err) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, err); + goto out_opal_node; + } + + opal_async_responses = kzalloc( + sizeof(*opal_async_responses) * opal_max_async_tokens, + GFP_KERNEL); + if (!opal_async_responses) { + pr_err("%s: Out of memory, failed to do asynchronous " + "completion init\n", __func__); + err = -ENOMEM; + goto out_opal_node; + } + + /* Initialize to 1 less than the maximum tokens available, as we may + * require to pop one during emergency through synchronous call to + * __opal_async_get_token() + */ + sema_init(&opal_async_sem, opal_max_async_tokens - 1); + +out_opal_node: + of_node_put(opal_node); +out: + return err; +} +machine_subsys_initcall(powernv, opal_async_comp_init); diff --git a/kernel/arch/powerpc/platforms/powernv/opal-dump.c b/kernel/arch/powerpc/platforms/powernv/opal-dump.c new file mode 100644 index 000000000..5aa9c1ce4 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-dump.c @@ -0,0 +1,457 @@ +/* + * PowerNV OPAL Dump Interface + * + * Copyright 2013,2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kobject.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/delay.h> + +#include <asm/opal.h> + +#define DUMP_TYPE_FSP 0x01 + +struct dump_obj { + struct kobject kobj; + struct bin_attribute dump_attr; + uint32_t id; /* becomes object name */ + uint32_t type; + uint32_t size; + char *buffer; +}; +#define to_dump_obj(x) container_of(x, struct dump_obj, kobj) + +struct dump_attribute { + struct attribute attr; + ssize_t (*show)(struct dump_obj *dump, struct dump_attribute *attr, + char *buf); + ssize_t (*store)(struct dump_obj *dump, struct dump_attribute *attr, + const char *buf, size_t count); +}; +#define to_dump_attr(x) container_of(x, struct dump_attribute, attr) + +static ssize_t dump_id_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + return sprintf(buf, "0x%x\n", dump_obj->id); +} + +static const char* dump_type_to_string(uint32_t type) +{ + switch (type) { + case 0x01: return "SP Dump"; + case 0x02: return "System/Platform Dump"; + case 0x03: return "SMA Dump"; + default: return "unknown"; + } +} + +static ssize_t dump_type_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + + return sprintf(buf, "0x%x %s\n", dump_obj->type, + dump_type_to_string(dump_obj->type)); +} + +static ssize_t dump_ack_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + return sprintf(buf, "ack - acknowledge dump\n"); +} + +/* + * Send acknowledgement to OPAL + */ +static int64_t dump_send_ack(uint32_t dump_id) +{ + int rc; + + rc = opal_dump_ack(dump_id); + if (rc) + pr_warn("%s: Failed to send ack to Dump ID 0x%x (%d)\n", + __func__, dump_id, rc); + return rc; +} + +static ssize_t dump_ack_store(struct dump_obj *dump_obj, + struct dump_attribute *attr, + const char *buf, + size_t count) +{ + dump_send_ack(dump_obj->id); + sysfs_remove_file_self(&dump_obj->kobj, &attr->attr); + kobject_put(&dump_obj->kobj); + return count; +} + +/* Attributes of a dump + * The binary attribute of the dump itself is dynamic + * due to the dynamic size of the dump + */ +static struct dump_attribute id_attribute = + __ATTR(id, S_IRUGO, dump_id_show, NULL); +static struct dump_attribute type_attribute = + __ATTR(type, S_IRUGO, dump_type_show, NULL); +static struct dump_attribute ack_attribute = + __ATTR(acknowledge, 0660, dump_ack_show, dump_ack_store); + +static ssize_t init_dump_show(struct dump_obj *dump_obj, + struct dump_attribute *attr, + char *buf) +{ + return sprintf(buf, "1 - initiate Service Processor(FSP) dump\n"); +} + +static int64_t dump_fips_init(uint8_t type) +{ + int rc; + + rc = opal_dump_init(type); + if (rc) + pr_warn("%s: Failed to initiate FSP dump (%d)\n", + __func__, rc); + return rc; +} + +static ssize_t init_dump_store(struct dump_obj *dump_obj, + struct dump_attribute *attr, + const char *buf, + size_t count) +{ + int rc; + + rc = dump_fips_init(DUMP_TYPE_FSP); + if (rc == OPAL_SUCCESS) + pr_info("%s: Initiated FSP dump\n", __func__); + + return count; +} + +static struct dump_attribute initiate_attribute = + __ATTR(initiate_dump, 0600, init_dump_show, init_dump_store); + +static struct attribute *initiate_attrs[] = { + &initiate_attribute.attr, + NULL, +}; + +static struct attribute_group initiate_attr_group = { + .attrs = initiate_attrs, +}; + +static struct kset *dump_kset; + +static ssize_t dump_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct dump_attribute *attribute; + struct dump_obj *dump; + + attribute = to_dump_attr(attr); + dump = to_dump_obj(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(dump, attribute, buf); +} + +static ssize_t dump_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct dump_attribute *attribute; + struct dump_obj *dump; + + attribute = to_dump_attr(attr); + dump = to_dump_obj(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(dump, attribute, buf, len); +} + +static const struct sysfs_ops dump_sysfs_ops = { + .show = dump_attr_show, + .store = dump_attr_store, +}; + +static void dump_release(struct kobject *kobj) +{ + struct dump_obj *dump; + + dump = to_dump_obj(kobj); + vfree(dump->buffer); + kfree(dump); +} + +static struct attribute *dump_default_attrs[] = { + &id_attribute.attr, + &type_attribute.attr, + &ack_attribute.attr, + NULL, +}; + +static struct kobj_type dump_ktype = { + .sysfs_ops = &dump_sysfs_ops, + .release = &dump_release, + .default_attrs = dump_default_attrs, +}; + +static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type) +{ + __be32 id, size, type; + int rc; + + type = cpu_to_be32(0xffffffff); + + rc = opal_dump_info2(&id, &size, &type); + if (rc == OPAL_PARAMETER) + rc = opal_dump_info(&id, &size); + + *dump_id = be32_to_cpu(id); + *dump_size = be32_to_cpu(size); + *dump_type = be32_to_cpu(type); + + if (rc) + pr_warn("%s: Failed to get dump info (%d)\n", + __func__, rc); + return rc; +} + +static int64_t dump_read_data(struct dump_obj *dump) +{ + struct opal_sg_list *list; + uint64_t addr; + int64_t rc; + + /* Allocate memory */ + dump->buffer = vzalloc(PAGE_ALIGN(dump->size)); + if (!dump->buffer) { + pr_err("%s : Failed to allocate memory\n", __func__); + rc = -ENOMEM; + goto out; + } + + /* Generate SG list */ + list = opal_vmalloc_to_sg_list(dump->buffer, dump->size); + if (!list) { + rc = -ENOMEM; + goto out; + } + + /* First entry address */ + addr = __pa(list); + + /* Fetch data */ + rc = OPAL_BUSY_EVENT; + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_dump_read(dump->id, addr); + if (rc == OPAL_BUSY_EVENT) { + opal_poll_events(NULL); + msleep(20); + } + } + + if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) + pr_warn("%s: Extract dump failed for ID 0x%x\n", + __func__, dump->id); + + /* Free SG list */ + opal_free_sg_list(list); + +out: + return rc; +} + +static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t pos, size_t count) +{ + ssize_t rc; + + struct dump_obj *dump = to_dump_obj(kobj); + + if (!dump->buffer) { + rc = dump_read_data(dump); + + if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) { + vfree(dump->buffer); + dump->buffer = NULL; + + return -EIO; + } + if (rc == OPAL_PARTIAL) { + /* On a partial read, we just return EIO + * and rely on userspace to ask us to try + * again. + */ + pr_info("%s: Platform dump partially read. ID = 0x%x\n", + __func__, dump->id); + return -EIO; + } + } + + memcpy(buffer, dump->buffer + pos, count); + + /* You may think we could free the dump buffer now and retrieve + * it again later if needed, but due to current firmware limitation, + * that's not the case. So, once read into userspace once, + * we keep the dump around until it's acknowledged by userspace. + */ + + return count; +} + +static struct dump_obj *create_dump_obj(uint32_t id, size_t size, + uint32_t type) +{ + struct dump_obj *dump; + int rc; + + dump = kzalloc(sizeof(*dump), GFP_KERNEL); + if (!dump) + return NULL; + + dump->kobj.kset = dump_kset; + + kobject_init(&dump->kobj, &dump_ktype); + + sysfs_bin_attr_init(&dump->dump_attr); + + dump->dump_attr.attr.name = "dump"; + dump->dump_attr.attr.mode = 0400; + dump->dump_attr.size = size; + dump->dump_attr.read = dump_attr_read; + + dump->id = id; + dump->size = size; + dump->type = type; + + rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id); + if (rc) { + kobject_put(&dump->kobj); + return NULL; + } + + rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr); + if (rc) { + kobject_put(&dump->kobj); + return NULL; + } + + pr_info("%s: New platform dump. ID = 0x%x Size %u\n", + __func__, dump->id, dump->size); + + kobject_uevent(&dump->kobj, KOBJ_ADD); + + return dump; +} + +static int process_dump(void) +{ + int rc; + uint32_t dump_id, dump_size, dump_type; + struct dump_obj *dump; + char name[22]; + + rc = dump_read_info(&dump_id, &dump_size, &dump_type); + if (rc != OPAL_SUCCESS) + return rc; + + sprintf(name, "0x%x-0x%x", dump_type, dump_id); + + /* we may get notified twice, let's handle + * that gracefully and not create two conflicting + * entries. + */ + if (kset_find_obj(dump_kset, name)) + return 0; + + dump = create_dump_obj(dump_id, dump_size, dump_type); + if (!dump) + return -1; + + return 0; +} + +static void dump_work_fn(struct work_struct *work) +{ + process_dump(); +} + +static DECLARE_WORK(dump_work, dump_work_fn); + +static void schedule_process_dump(void) +{ + schedule_work(&dump_work); +} + +/* + * New dump available notification + * + * Once we get notification, we add sysfs entries for it. + * We only fetch the dump on demand, and create sysfs asynchronously. + */ +static int dump_event(struct notifier_block *nb, + unsigned long events, void *change) +{ + if (events & OPAL_EVENT_DUMP_AVAIL) + schedule_process_dump(); + + return 0; +} + +static struct notifier_block dump_nb = { + .notifier_call = dump_event, + .next = NULL, + .priority = 0 +}; + +void __init opal_platform_dump_init(void) +{ + int rc; + + /* ELOG not supported by firmware */ + if (!opal_check_token(OPAL_DUMP_READ)) + return; + + dump_kset = kset_create_and_add("dump", NULL, opal_kobj); + if (!dump_kset) { + pr_warn("%s: Failed to create dump kset\n", __func__); + return; + } + + rc = sysfs_create_group(&dump_kset->kobj, &initiate_attr_group); + if (rc) { + pr_warn("%s: Failed to create initiate dump attr group\n", + __func__); + kobject_put(&dump_kset->kobj); + return; + } + + rc = opal_notifier_register(&dump_nb); + if (rc) { + pr_warn("%s: Can't register OPAL event notifier (%d)\n", + __func__, rc); + return; + } + + if (opal_check_token(OPAL_DUMP_RESEND)) + opal_dump_resend_notification(); +} diff --git a/kernel/arch/powerpc/platforms/powernv/opal-elog.c b/kernel/arch/powerpc/platforms/powernv/opal-elog.c new file mode 100644 index 000000000..38ce757e5 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-elog.c @@ -0,0 +1,320 @@ +/* + * Error log support on PowerNV. + * + * Copyright 2013,2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/fcntl.h> +#include <linux/kobject.h> +#include <asm/uaccess.h> +#include <asm/opal.h> + +struct elog_obj { + struct kobject kobj; + struct bin_attribute raw_attr; + uint64_t id; + uint64_t type; + size_t size; + char *buffer; +}; +#define to_elog_obj(x) container_of(x, struct elog_obj, kobj) + +struct elog_attribute { + struct attribute attr; + ssize_t (*show)(struct elog_obj *elog, struct elog_attribute *attr, + char *buf); + ssize_t (*store)(struct elog_obj *elog, struct elog_attribute *attr, + const char *buf, size_t count); +}; +#define to_elog_attr(x) container_of(x, struct elog_attribute, attr) + +static ssize_t elog_id_show(struct elog_obj *elog_obj, + struct elog_attribute *attr, + char *buf) +{ + return sprintf(buf, "0x%llx\n", elog_obj->id); +} + +static const char *elog_type_to_string(uint64_t type) +{ + switch (type) { + case 0: return "PEL"; + default: return "unknown"; + } +} + +static ssize_t elog_type_show(struct elog_obj *elog_obj, + struct elog_attribute *attr, + char *buf) +{ + return sprintf(buf, "0x%llx %s\n", + elog_obj->type, + elog_type_to_string(elog_obj->type)); +} + +static ssize_t elog_ack_show(struct elog_obj *elog_obj, + struct elog_attribute *attr, + char *buf) +{ + return sprintf(buf, "ack - acknowledge log message\n"); +} + +static ssize_t elog_ack_store(struct elog_obj *elog_obj, + struct elog_attribute *attr, + const char *buf, + size_t count) +{ + opal_send_ack_elog(elog_obj->id); + sysfs_remove_file_self(&elog_obj->kobj, &attr->attr); + kobject_put(&elog_obj->kobj); + return count; +} + +static struct elog_attribute id_attribute = + __ATTR(id, S_IRUGO, elog_id_show, NULL); +static struct elog_attribute type_attribute = + __ATTR(type, S_IRUGO, elog_type_show, NULL); +static struct elog_attribute ack_attribute = + __ATTR(acknowledge, 0660, elog_ack_show, elog_ack_store); + +static struct kset *elog_kset; + +static ssize_t elog_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct elog_attribute *attribute; + struct elog_obj *elog; + + attribute = to_elog_attr(attr); + elog = to_elog_obj(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(elog, attribute, buf); +} + +static ssize_t elog_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct elog_attribute *attribute; + struct elog_obj *elog; + + attribute = to_elog_attr(attr); + elog = to_elog_obj(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(elog, attribute, buf, len); +} + +static const struct sysfs_ops elog_sysfs_ops = { + .show = elog_attr_show, + .store = elog_attr_store, +}; + +static void elog_release(struct kobject *kobj) +{ + struct elog_obj *elog; + + elog = to_elog_obj(kobj); + kfree(elog->buffer); + kfree(elog); +} + +static struct attribute *elog_default_attrs[] = { + &id_attribute.attr, + &type_attribute.attr, + &ack_attribute.attr, + NULL, +}; + +static struct kobj_type elog_ktype = { + .sysfs_ops = &elog_sysfs_ops, + .release = &elog_release, + .default_attrs = elog_default_attrs, +}; + +/* Maximum size of a single log on FSP is 16KB */ +#define OPAL_MAX_ERRLOG_SIZE 16384 + +static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t pos, size_t count) +{ + int opal_rc; + + struct elog_obj *elog = to_elog_obj(kobj); + + /* We may have had an error reading before, so let's retry */ + if (!elog->buffer) { + elog->buffer = kzalloc(elog->size, GFP_KERNEL); + if (!elog->buffer) + return -EIO; + + opal_rc = opal_read_elog(__pa(elog->buffer), + elog->size, elog->id); + if (opal_rc != OPAL_SUCCESS) { + pr_err("ELOG: log read failed for log-id=%llx\n", + elog->id); + kfree(elog->buffer); + elog->buffer = NULL; + return -EIO; + } + } + + memcpy(buffer, elog->buffer + pos, count); + + return count; +} + +static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type) +{ + struct elog_obj *elog; + int rc; + + elog = kzalloc(sizeof(*elog), GFP_KERNEL); + if (!elog) + return NULL; + + elog->kobj.kset = elog_kset; + + kobject_init(&elog->kobj, &elog_ktype); + + sysfs_bin_attr_init(&elog->raw_attr); + + elog->raw_attr.attr.name = "raw"; + elog->raw_attr.attr.mode = 0400; + elog->raw_attr.size = size; + elog->raw_attr.read = raw_attr_read; + + elog->id = id; + elog->size = size; + elog->type = type; + + elog->buffer = kzalloc(elog->size, GFP_KERNEL); + + if (elog->buffer) { + rc = opal_read_elog(__pa(elog->buffer), + elog->size, elog->id); + if (rc != OPAL_SUCCESS) { + pr_err("ELOG: log read failed for log-id=%llx\n", + elog->id); + kfree(elog->buffer); + elog->buffer = NULL; + } + } + + rc = kobject_add(&elog->kobj, NULL, "0x%llx", id); + if (rc) { + kobject_put(&elog->kobj); + return NULL; + } + + rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr); + if (rc) { + kobject_put(&elog->kobj); + return NULL; + } + + kobject_uevent(&elog->kobj, KOBJ_ADD); + + return elog; +} + +static void elog_work_fn(struct work_struct *work) +{ + __be64 size; + __be64 id; + __be64 type; + uint64_t elog_size; + uint64_t log_id; + uint64_t elog_type; + int rc; + char name[2+16+1]; + + rc = opal_get_elog_size(&id, &size, &type); + if (rc != OPAL_SUCCESS) { + pr_err("ELOG: OPAL log info read failed\n"); + return; + } + + elog_size = be64_to_cpu(size); + log_id = be64_to_cpu(id); + elog_type = be64_to_cpu(type); + + WARN_ON(elog_size > OPAL_MAX_ERRLOG_SIZE); + + if (elog_size >= OPAL_MAX_ERRLOG_SIZE) + elog_size = OPAL_MAX_ERRLOG_SIZE; + + sprintf(name, "0x%llx", log_id); + + /* we may get notified twice, let's handle + * that gracefully and not create two conflicting + * entries. + */ + if (kset_find_obj(elog_kset, name)) + return; + + create_elog_obj(log_id, elog_size, elog_type); +} + +static DECLARE_WORK(elog_work, elog_work_fn); + +static int elog_event(struct notifier_block *nb, + unsigned long events, void *change) +{ + /* check for error log event */ + if (events & OPAL_EVENT_ERROR_LOG_AVAIL) + schedule_work(&elog_work); + return 0; +} + +static struct notifier_block elog_nb = { + .notifier_call = elog_event, + .next = NULL, + .priority = 0 +}; + +int __init opal_elog_init(void) +{ + int rc = 0; + + /* ELOG not supported by firmware */ + if (!opal_check_token(OPAL_ELOG_READ)) + return -1; + + elog_kset = kset_create_and_add("elog", NULL, opal_kobj); + if (!elog_kset) { + pr_warn("%s: failed to create elog kset\n", __func__); + return -1; + } + + rc = opal_notifier_register(&elog_nb); + if (rc) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, rc); + return rc; + } + + /* We are now ready to pull error logs from opal. */ + if (opal_check_token(OPAL_ELOG_RESEND)) + opal_resend_pending_logs(); + + return 0; +} diff --git a/kernel/arch/powerpc/platforms/powernv/opal-flash.c b/kernel/arch/powerpc/platforms/powernv/opal-flash.c new file mode 100644 index 000000000..4ec621928 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-flash.c @@ -0,0 +1,592 @@ +/* + * PowerNV OPAL Firmware Update Interface + * + * Copyright 2013 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define DEBUG + +#include <linux/kernel.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/delay.h> + +#include <asm/opal.h> + +/* FLASH status codes */ +#define FLASH_NO_OP -1099 /* No operation initiated by user */ +#define FLASH_NO_AUTH -9002 /* Not a service authority partition */ + +/* Validate image status values */ +#define VALIDATE_IMG_READY -1001 /* Image ready for validation */ +#define VALIDATE_IMG_INCOMPLETE -1002 /* User copied < VALIDATE_BUF_SIZE */ + +/* Manage image status values */ +#define MANAGE_ACTIVE_ERR -9001 /* Cannot overwrite active img */ + +/* Flash image status values */ +#define FLASH_IMG_READY 0 /* Img ready for flash on reboot */ +#define FLASH_INVALID_IMG -1003 /* Flash image shorter than expected */ +#define FLASH_IMG_NULL_DATA -1004 /* Bad data in sg list entry */ +#define FLASH_IMG_BAD_LEN -1005 /* Bad length in sg list entry */ + +/* Manage operation tokens */ +#define FLASH_REJECT_TMP_SIDE 0 /* Reject temporary fw image */ +#define FLASH_COMMIT_TMP_SIDE 1 /* Commit temporary fw image */ + +/* Update tokens */ +#define FLASH_UPDATE_CANCEL 0 /* Cancel update request */ +#define FLASH_UPDATE_INIT 1 /* Initiate update */ + +/* Validate image update result tokens */ +#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */ +#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */ +#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */ +#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */ +/* + * Current T side will be committed to P side before being replace with new + * image, and the new image is downlevel from current image + */ +#define VALIDATE_TMP_COMMIT_DL 4 +/* + * Current T side will be committed to P side before being replaced with new + * image + */ +#define VALIDATE_TMP_COMMIT 5 +/* + * T side will be updated with a downlevel image + */ +#define VALIDATE_TMP_UPDATE_DL 6 +/* + * The candidate image's release date is later than the system's firmware + * service entitlement date - service warranty period has expired + */ +#define VALIDATE_OUT_OF_WRNTY 7 + +/* Validate buffer size */ +#define VALIDATE_BUF_SIZE 4096 + +/* XXX: Assume candidate image size is <= 1GB */ +#define MAX_IMAGE_SIZE 0x40000000 + +/* Image status */ +enum { + IMAGE_INVALID, + IMAGE_LOADING, + IMAGE_READY, +}; + +/* Candidate image data */ +struct image_data_t { + int status; + void *data; + uint32_t size; +}; + +/* Candidate image header */ +struct image_header_t { + uint16_t magic; + uint16_t version; + uint32_t size; +}; + +struct validate_flash_t { + int status; /* Return status */ + void *buf; /* Candidate image buffer */ + uint32_t buf_size; /* Image size */ + uint32_t result; /* Update results token */ +}; + +struct manage_flash_t { + int status; /* Return status */ +}; + +struct update_flash_t { + int status; /* Return status */ +}; + +static struct image_header_t image_header; +static struct image_data_t image_data; +static struct validate_flash_t validate_flash_data; +static struct manage_flash_t manage_flash_data; + +/* Initialize update_flash_data status to No Operation */ +static struct update_flash_t update_flash_data = { + .status = FLASH_NO_OP, +}; + +static DEFINE_MUTEX(image_data_mutex); + +/* + * Validate candidate image + */ +static inline void opal_flash_validate(void) +{ + long ret; + void *buf = validate_flash_data.buf; + __be32 size = cpu_to_be32(validate_flash_data.buf_size); + __be32 result; + + ret = opal_validate_flash(__pa(buf), &size, &result); + + validate_flash_data.status = ret; + validate_flash_data.buf_size = be32_to_cpu(size); + validate_flash_data.result = be32_to_cpu(result); +} + +/* + * Validate output format: + * validate result token + * current image version details + * new image version details + */ +static ssize_t validate_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct validate_flash_t *args_buf = &validate_flash_data; + int len; + + /* Candidate image is not validated */ + if (args_buf->status < VALIDATE_TMP_UPDATE) { + len = sprintf(buf, "%d\n", args_buf->status); + goto out; + } + + /* Result token */ + len = sprintf(buf, "%d\n", args_buf->result); + + /* Current and candidate image version details */ + if ((args_buf->result != VALIDATE_TMP_UPDATE) && + (args_buf->result < VALIDATE_CUR_UNKNOWN)) + goto out; + + if (args_buf->buf_size > (VALIDATE_BUF_SIZE - len)) { + memcpy(buf + len, args_buf->buf, VALIDATE_BUF_SIZE - len); + len = VALIDATE_BUF_SIZE; + } else { + memcpy(buf + len, args_buf->buf, args_buf->buf_size); + len += args_buf->buf_size; + } +out: + /* Set status to default */ + args_buf->status = FLASH_NO_OP; + return len; +} + +/* + * Validate candidate firmware image + * + * Note: + * We are only interested in first 4K bytes of the + * candidate image. + */ +static ssize_t validate_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct validate_flash_t *args_buf = &validate_flash_data; + + if (buf[0] != '1') + return -EINVAL; + + mutex_lock(&image_data_mutex); + + if (image_data.status != IMAGE_READY || + image_data.size < VALIDATE_BUF_SIZE) { + args_buf->result = VALIDATE_INVALID_IMG; + args_buf->status = VALIDATE_IMG_INCOMPLETE; + goto out; + } + + /* Copy first 4k bytes of candidate image */ + memcpy(args_buf->buf, image_data.data, VALIDATE_BUF_SIZE); + + args_buf->status = VALIDATE_IMG_READY; + args_buf->buf_size = VALIDATE_BUF_SIZE; + + /* Validate candidate image */ + opal_flash_validate(); + +out: + mutex_unlock(&image_data_mutex); + return count; +} + +/* + * Manage flash routine + */ +static inline void opal_flash_manage(uint8_t op) +{ + struct manage_flash_t *const args_buf = &manage_flash_data; + + args_buf->status = opal_manage_flash(op); +} + +/* + * Show manage flash status + */ +static ssize_t manage_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct manage_flash_t *const args_buf = &manage_flash_data; + int rc; + + rc = sprintf(buf, "%d\n", args_buf->status); + /* Set status to default*/ + args_buf->status = FLASH_NO_OP; + return rc; +} + +/* + * Manage operations: + * 0 - Reject + * 1 - Commit + */ +static ssize_t manage_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + uint8_t op; + switch (buf[0]) { + case '0': + op = FLASH_REJECT_TMP_SIDE; + break; + case '1': + op = FLASH_COMMIT_TMP_SIDE; + break; + default: + return -EINVAL; + } + + /* commit/reject temporary image */ + opal_flash_manage(op); + return count; +} + +/* + * OPAL update flash + */ +static int opal_flash_update(int op) +{ + struct opal_sg_list *list; + unsigned long addr; + int64_t rc = OPAL_PARAMETER; + + if (op == FLASH_UPDATE_CANCEL) { + pr_alert("FLASH: Image update cancelled\n"); + addr = '\0'; + goto flash; + } + + list = opal_vmalloc_to_sg_list(image_data.data, image_data.size); + if (!list) + goto invalid_img; + + /* First entry address */ + addr = __pa(list); + +flash: + rc = opal_update_flash(addr); + +invalid_img: + return rc; +} + +/* Return CPUs to OPAL before starting FW update */ +static void flash_return_cpu(void *info) +{ + int cpu = smp_processor_id(); + + if (!cpu_online(cpu)) + return; + + /* Disable IRQ */ + hard_irq_disable(); + + /* Return the CPU to OPAL */ + opal_return_cpu(); +} + +/* This gets called just before system reboots */ +void opal_flash_term_callback(void) +{ + struct cpumask mask; + + if (update_flash_data.status != FLASH_IMG_READY) + return; + + pr_alert("FLASH: Flashing new firmware\n"); + pr_alert("FLASH: Image is %u bytes\n", image_data.size); + pr_alert("FLASH: Performing flash and reboot/shutdown\n"); + pr_alert("FLASH: This will take several minutes. Do not power off!\n"); + + /* Small delay to help getting the above message out */ + msleep(500); + + /* Return secondary CPUs to firmware */ + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + if (!cpumask_empty(&mask)) + smp_call_function_many(&mask, + flash_return_cpu, NULL, false); + /* Hard disable interrupts */ + hard_irq_disable(); +} + +/* + * Show candidate image status + */ +static ssize_t update_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct update_flash_t *const args_buf = &update_flash_data; + return sprintf(buf, "%d\n", args_buf->status); +} + +/* + * Set update image flag + * 1 - Flash new image + * 0 - Cancel flash request + */ +static ssize_t update_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct update_flash_t *const args_buf = &update_flash_data; + int rc = count; + + mutex_lock(&image_data_mutex); + + switch (buf[0]) { + case '0': + if (args_buf->status == FLASH_IMG_READY) + opal_flash_update(FLASH_UPDATE_CANCEL); + args_buf->status = FLASH_NO_OP; + break; + case '1': + /* Image is loaded? */ + if (image_data.status == IMAGE_READY) + args_buf->status = + opal_flash_update(FLASH_UPDATE_INIT); + else + args_buf->status = FLASH_INVALID_IMG; + break; + default: + rc = -EINVAL; + } + + mutex_unlock(&image_data_mutex); + return rc; +} + +/* + * Free image buffer + */ +static void free_image_buf(void) +{ + void *addr; + int size; + + addr = image_data.data; + size = PAGE_ALIGN(image_data.size); + while (size > 0) { + ClearPageReserved(vmalloc_to_page(addr)); + addr += PAGE_SIZE; + size -= PAGE_SIZE; + } + vfree(image_data.data); + image_data.data = NULL; + image_data.status = IMAGE_INVALID; +} + +/* + * Allocate image buffer. + */ +static int alloc_image_buf(char *buffer, size_t count) +{ + void *addr; + int size; + + if (count < sizeof(struct image_header_t)) { + pr_warn("FLASH: Invalid candidate image\n"); + return -EINVAL; + } + + memcpy(&image_header, (void *)buffer, sizeof(struct image_header_t)); + image_data.size = be32_to_cpu(image_header.size); + pr_debug("FLASH: Candidate image size = %u\n", image_data.size); + + if (image_data.size > MAX_IMAGE_SIZE) { + pr_warn("FLASH: Too large image\n"); + return -EINVAL; + } + if (image_data.size < VALIDATE_BUF_SIZE) { + pr_warn("FLASH: Image is shorter than expected\n"); + return -EINVAL; + } + + image_data.data = vzalloc(PAGE_ALIGN(image_data.size)); + if (!image_data.data) { + pr_err("%s : Failed to allocate memory\n", __func__); + return -ENOMEM; + } + + /* Pin memory */ + addr = image_data.data; + size = PAGE_ALIGN(image_data.size); + while (size > 0) { + SetPageReserved(vmalloc_to_page(addr)); + addr += PAGE_SIZE; + size -= PAGE_SIZE; + } + + image_data.status = IMAGE_LOADING; + return 0; +} + +/* + * Copy candidate image + * + * Parse candidate image header to get total image size + * and pre-allocate required memory. + */ +static ssize_t image_data_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buffer, loff_t pos, size_t count) +{ + int rc; + + mutex_lock(&image_data_mutex); + + /* New image ? */ + if (pos == 0) { + /* Free memory, if already allocated */ + if (image_data.data) + free_image_buf(); + + /* Cancel outstanding image update request */ + if (update_flash_data.status == FLASH_IMG_READY) + opal_flash_update(FLASH_UPDATE_CANCEL); + + /* Allocate memory */ + rc = alloc_image_buf(buffer, count); + if (rc) + goto out; + } + + if (image_data.status != IMAGE_LOADING) { + rc = -ENOMEM; + goto out; + } + + if ((pos + count) > image_data.size) { + rc = -EINVAL; + goto out; + } + + memcpy(image_data.data + pos, (void *)buffer, count); + rc = count; + + /* Set image status */ + if ((pos + count) == image_data.size) { + pr_debug("FLASH: Candidate image loaded....\n"); + image_data.status = IMAGE_READY; + } + +out: + mutex_unlock(&image_data_mutex); + return rc; +} + +/* + * sysfs interface : + * OPAL uses below sysfs files for code update. + * We create these files under /sys/firmware/opal. + * + * image : Interface to load candidate firmware image + * validate_flash : Validate firmware image + * manage_flash : Commit/Reject firmware image + * update_flash : Flash new firmware image + * + */ +static struct bin_attribute image_data_attr = { + .attr = {.name = "image", .mode = 0200}, + .size = MAX_IMAGE_SIZE, /* Limit image size */ + .write = image_data_write, +}; + +static struct kobj_attribute validate_attribute = + __ATTR(validate_flash, 0600, validate_show, validate_store); + +static struct kobj_attribute manage_attribute = + __ATTR(manage_flash, 0600, manage_show, manage_store); + +static struct kobj_attribute update_attribute = + __ATTR(update_flash, 0600, update_show, update_store); + +static struct attribute *image_op_attrs[] = { + &validate_attribute.attr, + &manage_attribute.attr, + &update_attribute.attr, + NULL /* need to NULL terminate the list of attributes */ +}; + +static struct attribute_group image_op_attr_group = { + .attrs = image_op_attrs, +}; + +void __init opal_flash_update_init(void) +{ + int ret; + + /* Allocate validate image buffer */ + validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL); + if (!validate_flash_data.buf) { + pr_err("%s : Failed to allocate memory\n", __func__); + return; + } + + /* Make sure /sys/firmware/opal directory is created */ + if (!opal_kobj) { + pr_warn("FLASH: opal kobject is not available\n"); + goto nokobj; + } + + /* Create the sysfs files */ + ret = sysfs_create_group(opal_kobj, &image_op_attr_group); + if (ret) { + pr_warn("FLASH: Failed to create sysfs files\n"); + goto nokobj; + } + + ret = sysfs_create_bin_file(opal_kobj, &image_data_attr); + if (ret) { + pr_warn("FLASH: Failed to create sysfs files\n"); + goto nosysfs_file; + } + + /* Set default status */ + validate_flash_data.status = FLASH_NO_OP; + manage_flash_data.status = FLASH_NO_OP; + update_flash_data.status = FLASH_NO_OP; + image_data.status = IMAGE_INVALID; + return; + +nosysfs_file: + sysfs_remove_group(opal_kobj, &image_op_attr_group); + +nokobj: + kfree(validate_flash_data.buf); + return; +} diff --git a/kernel/arch/powerpc/platforms/powernv/opal-hmi.c b/kernel/arch/powerpc/platforms/powernv/opal-hmi.c new file mode 100644 index 000000000..b322bfb51 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-hmi.c @@ -0,0 +1,189 @@ +/* + * OPAL hypervisor Maintenance interrupt handling support in PowreNV. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; If not, see <http://www.gnu.org/licenses/>. + * + * Copyright 2014 IBM Corporation + * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include <asm/opal.h> +#include <asm/cputable.h> +#include <asm/machdep.h> + +static int opal_hmi_handler_nb_init; +struct OpalHmiEvtNode { + struct list_head list; + struct OpalHMIEvent hmi_evt; +}; +static LIST_HEAD(opal_hmi_evt_list); +static DEFINE_SPINLOCK(opal_hmi_evt_lock); + +static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt) +{ + const char *level, *sevstr, *error_info; + static const char *hmi_error_types[] = { + "Malfunction Alert", + "Processor Recovery done", + "Processor recovery occurred again", + "Processor recovery occurred for masked error", + "Timer facility experienced an error", + "TFMR SPR is corrupted", + "UPS (Uniterrupted Power System) Overflow indication", + "An XSCOM operation failure", + "An XSCOM operation completed", + "SCOM has set a reserved FIR bit to cause recovery", + "Debug trigger has set a reserved FIR bit to cause recovery", + "A hypervisor resource error occurred" + }; + + /* Print things out */ + if (hmi_evt->version < OpalHMIEvt_V1) { + pr_err("HMI Interrupt, Unknown event version %d !\n", + hmi_evt->version); + return; + } + switch (hmi_evt->severity) { + case OpalHMI_SEV_NO_ERROR: + level = KERN_INFO; + sevstr = "Harmless"; + break; + case OpalHMI_SEV_WARNING: + level = KERN_WARNING; + sevstr = ""; + break; + case OpalHMI_SEV_ERROR_SYNC: + level = KERN_ERR; + sevstr = "Severe"; + break; + case OpalHMI_SEV_FATAL: + default: + level = KERN_ERR; + sevstr = "Fatal"; + break; + } + + printk("%s%s Hypervisor Maintenance interrupt [%s]\n", + level, sevstr, + hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ? + "Recovered" : "Not recovered"); + error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ? + hmi_error_types[hmi_evt->type] + : "Unknown"; + printk("%s Error detail: %s\n", level, error_info); + printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer)); + if ((hmi_evt->type == OpalHMI_ERROR_TFAC) || + (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY)) + printk("%s TFMR: %016llx\n", level, + be64_to_cpu(hmi_evt->tfmr)); +} + +static void hmi_event_handler(struct work_struct *work) +{ + unsigned long flags; + struct OpalHMIEvent *hmi_evt; + struct OpalHmiEvtNode *msg_node; + uint8_t disposition; + + spin_lock_irqsave(&opal_hmi_evt_lock, flags); + while (!list_empty(&opal_hmi_evt_list)) { + msg_node = list_entry(opal_hmi_evt_list.next, + struct OpalHmiEvtNode, list); + list_del(&msg_node->list); + spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); + + hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt; + print_hmi_event_info(hmi_evt); + disposition = hmi_evt->disposition; + kfree(msg_node); + + /* + * Check if HMI event has been recovered or not. If not + * then we can't continue, invoke panic. + */ + if (disposition != OpalHMI_DISPOSITION_RECOVERED) + panic("Unrecoverable HMI exception"); + + spin_lock_irqsave(&opal_hmi_evt_lock, flags); + } + spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); +} + +static DECLARE_WORK(hmi_event_work, hmi_event_handler); +/* + * opal_handle_hmi_event - notifier handler that queues up HMI events + * to be preocessed later. + */ +static int opal_handle_hmi_event(struct notifier_block *nb, + unsigned long msg_type, void *msg) +{ + unsigned long flags; + struct OpalHMIEvent *hmi_evt; + struct opal_msg *hmi_msg = msg; + struct OpalHmiEvtNode *msg_node; + + /* Sanity Checks */ + if (msg_type != OPAL_MSG_HMI_EVT) + return 0; + + /* HMI event info starts from param[0] */ + hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0]; + + /* Delay the logging of HMI events to workqueue. */ + msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC); + if (!msg_node) { + pr_err("HMI: out of memory, Opal message event not handled\n"); + return -ENOMEM; + } + memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent)); + + spin_lock_irqsave(&opal_hmi_evt_lock, flags); + list_add(&msg_node->list, &opal_hmi_evt_list); + spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); + + schedule_work(&hmi_event_work); + return 0; +} + +static struct notifier_block opal_hmi_handler_nb = { + .notifier_call = opal_handle_hmi_event, + .next = NULL, + .priority = 0, +}; + +static int __init opal_hmi_handler_init(void) +{ + int ret; + + if (!opal_hmi_handler_nb_init) { + ret = opal_message_notifier_register( + OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb); + if (ret) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, ret); + return ret; + } + opal_hmi_handler_nb_init = 1; + } + return 0; +} +machine_subsys_initcall(powernv, opal_hmi_handler_init); diff --git a/kernel/arch/powerpc/platforms/powernv/opal-lpc.c b/kernel/arch/powerpc/platforms/powernv/opal-lpc.c new file mode 100644 index 000000000..e4169d68c --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-lpc.c @@ -0,0 +1,414 @@ +/* + * PowerNV LPC bus handling. + * + * Copyright 2013 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/bug.h> +#include <linux/debugfs.h> +#include <linux/io.h> +#include <linux/slab.h> + +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/xics.h> +#include <asm/opal.h> +#include <asm/prom.h> +#include <asm/uaccess.h> +#include <asm/debug.h> + +static int opal_lpc_chip_id = -1; + +static u8 opal_lpc_inb(unsigned long port) +{ + int64_t rc; + __be32 data; + + if (opal_lpc_chip_id < 0 || port > 0xffff) + return 0xff; + rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 1); + return rc ? 0xff : be32_to_cpu(data); +} + +static __le16 __opal_lpc_inw(unsigned long port) +{ + int64_t rc; + __be32 data; + + if (opal_lpc_chip_id < 0 || port > 0xfffe) + return 0xffff; + if (port & 1) + return (__le16)opal_lpc_inb(port) << 8 | opal_lpc_inb(port + 1); + rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 2); + return rc ? 0xffff : be32_to_cpu(data); +} +static u16 opal_lpc_inw(unsigned long port) +{ + return le16_to_cpu(__opal_lpc_inw(port)); +} + +static __le32 __opal_lpc_inl(unsigned long port) +{ + int64_t rc; + __be32 data; + + if (opal_lpc_chip_id < 0 || port > 0xfffc) + return 0xffffffff; + if (port & 3) + return (__le32)opal_lpc_inb(port ) << 24 | + (__le32)opal_lpc_inb(port + 1) << 16 | + (__le32)opal_lpc_inb(port + 2) << 8 | + opal_lpc_inb(port + 3); + rc = opal_lpc_read(opal_lpc_chip_id, OPAL_LPC_IO, port, &data, 4); + return rc ? 0xffffffff : be32_to_cpu(data); +} + +static u32 opal_lpc_inl(unsigned long port) +{ + return le32_to_cpu(__opal_lpc_inl(port)); +} + +static void opal_lpc_outb(u8 val, unsigned long port) +{ + if (opal_lpc_chip_id < 0 || port > 0xffff) + return; + opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 1); +} + +static void __opal_lpc_outw(__le16 val, unsigned long port) +{ + if (opal_lpc_chip_id < 0 || port > 0xfffe) + return; + if (port & 1) { + opal_lpc_outb(val >> 8, port); + opal_lpc_outb(val , port + 1); + return; + } + opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 2); +} + +static void opal_lpc_outw(u16 val, unsigned long port) +{ + __opal_lpc_outw(cpu_to_le16(val), port); +} + +static void __opal_lpc_outl(__le32 val, unsigned long port) +{ + if (opal_lpc_chip_id < 0 || port > 0xfffc) + return; + if (port & 3) { + opal_lpc_outb(val >> 24, port); + opal_lpc_outb(val >> 16, port + 1); + opal_lpc_outb(val >> 8, port + 2); + opal_lpc_outb(val , port + 3); + return; + } + opal_lpc_write(opal_lpc_chip_id, OPAL_LPC_IO, port, val, 4); +} + +static void opal_lpc_outl(u32 val, unsigned long port) +{ + __opal_lpc_outl(cpu_to_le32(val), port); +} + +static void opal_lpc_insb(unsigned long p, void *b, unsigned long c) +{ + u8 *ptr = b; + + while(c--) + *(ptr++) = opal_lpc_inb(p); +} + +static void opal_lpc_insw(unsigned long p, void *b, unsigned long c) +{ + __le16 *ptr = b; + + while(c--) + *(ptr++) = __opal_lpc_inw(p); +} + +static void opal_lpc_insl(unsigned long p, void *b, unsigned long c) +{ + __le32 *ptr = b; + + while(c--) + *(ptr++) = __opal_lpc_inl(p); +} + +static void opal_lpc_outsb(unsigned long p, const void *b, unsigned long c) +{ + const u8 *ptr = b; + + while(c--) + opal_lpc_outb(*(ptr++), p); +} + +static void opal_lpc_outsw(unsigned long p, const void *b, unsigned long c) +{ + const __le16 *ptr = b; + + while(c--) + __opal_lpc_outw(*(ptr++), p); +} + +static void opal_lpc_outsl(unsigned long p, const void *b, unsigned long c) +{ + const __le32 *ptr = b; + + while(c--) + __opal_lpc_outl(*(ptr++), p); +} + +static const struct ppc_pci_io opal_lpc_io = { + .inb = opal_lpc_inb, + .inw = opal_lpc_inw, + .inl = opal_lpc_inl, + .outb = opal_lpc_outb, + .outw = opal_lpc_outw, + .outl = opal_lpc_outl, + .insb = opal_lpc_insb, + .insw = opal_lpc_insw, + .insl = opal_lpc_insl, + .outsb = opal_lpc_outsb, + .outsw = opal_lpc_outsw, + .outsl = opal_lpc_outsl, +}; + +#ifdef CONFIG_DEBUG_FS +struct lpc_debugfs_entry { + enum OpalLPCAddressType lpc_type; +}; + +static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct lpc_debugfs_entry *lpc = filp->private_data; + u32 data, pos, len, todo; + int rc; + + if (!access_ok(VERIFY_WRITE, ubuf, count)) + return -EFAULT; + + todo = count; + while (todo) { + pos = *ppos; + + /* + * Select access size based on count and alignment and + * access type. IO and MEM only support byte acceses, + * FW supports all 3. + */ + len = 1; + if (lpc->lpc_type == OPAL_LPC_FW) { + if (todo > 3 && (pos & 3) == 0) + len = 4; + else if (todo > 1 && (pos & 1) == 0) + len = 2; + } + rc = opal_lpc_read(opal_lpc_chip_id, lpc->lpc_type, pos, + &data, len); + if (rc) + return -ENXIO; + + /* + * Now there is some trickery with the data returned by OPAL + * as it's the desired data right justified in a 32-bit BE + * word. + * + * This is a very bad interface and I'm to blame for it :-( + * + * So we can't just apply a 32-bit swap to what comes from OPAL, + * because user space expects the *bytes* to be in their proper + * respective positions (ie, LPC position). + * + * So what we really want to do here is to shift data right + * appropriately on a LE kernel. + * + * IE. If the LPC transaction has bytes B0, B1, B2 and B3 in that + * order, we have in memory written to by OPAL at the "data" + * pointer: + * + * Bytes: OPAL "data" LE "data" + * 32-bit: B0 B1 B2 B3 B0B1B2B3 B3B2B1B0 + * 16-bit: B0 B1 0000B0B1 B1B00000 + * 8-bit: B0 000000B0 B0000000 + * + * So a BE kernel will have the leftmost of the above in the MSB + * and rightmost in the LSB and can just then "cast" the u32 "data" + * down to the appropriate quantity and write it. + * + * However, an LE kernel can't. It doesn't need to swap because a + * load from data followed by a store to user are going to preserve + * the byte ordering which is the wire byte order which is what the + * user wants, but in order to "crop" to the right size, we need to + * shift right first. + */ + switch(len) { + case 4: + rc = __put_user((u32)data, (u32 __user *)ubuf); + break; + case 2: +#ifdef __LITTLE_ENDIAN__ + data >>= 16; +#endif + rc = __put_user((u16)data, (u16 __user *)ubuf); + break; + default: +#ifdef __LITTLE_ENDIAN__ + data >>= 24; +#endif + rc = __put_user((u8)data, (u8 __user *)ubuf); + break; + } + if (rc) + return -EFAULT; + *ppos += len; + ubuf += len; + todo -= len; + } + + return count; +} + +static ssize_t lpc_debug_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct lpc_debugfs_entry *lpc = filp->private_data; + u32 data, pos, len, todo; + int rc; + + if (!access_ok(VERIFY_READ, ubuf, count)) + return -EFAULT; + + todo = count; + while (todo) { + pos = *ppos; + + /* + * Select access size based on count and alignment and + * access type. IO and MEM only support byte acceses, + * FW supports all 3. + */ + len = 1; + if (lpc->lpc_type == OPAL_LPC_FW) { + if (todo > 3 && (pos & 3) == 0) + len = 4; + else if (todo > 1 && (pos & 1) == 0) + len = 2; + } + + /* + * Similarly to the read case, we have some trickery here but + * it's different to handle. We need to pass the value to OPAL in + * a register whose layout depends on the access size. We want + * to reproduce the memory layout of the user, however we aren't + * doing a load from user and a store to another memory location + * which would achieve that. Here we pass the value to OPAL via + * a register which is expected to contain the "BE" interpretation + * of the byte sequence. IE: for a 32-bit access, byte 0 should be + * in the MSB. So here we *do* need to byteswap on LE. + * + * User bytes: LE "data" OPAL "data" + * 32-bit: B0 B1 B2 B3 B3B2B1B0 B0B1B2B3 + * 16-bit: B0 B1 0000B1B0 0000B0B1 + * 8-bit: B0 000000B0 000000B0 + */ + switch(len) { + case 4: + rc = __get_user(data, (u32 __user *)ubuf); + data = cpu_to_be32(data); + break; + case 2: + rc = __get_user(data, (u16 __user *)ubuf); + data = cpu_to_be16(data); + break; + default: + rc = __get_user(data, (u8 __user *)ubuf); + break; + } + if (rc) + return -EFAULT; + + rc = opal_lpc_write(opal_lpc_chip_id, lpc->lpc_type, pos, + data, len); + if (rc) + return -ENXIO; + *ppos += len; + ubuf += len; + todo -= len; + } + + return count; +} + +static const struct file_operations lpc_fops = { + .read = lpc_debug_read, + .write = lpc_debug_write, + .open = simple_open, + .llseek = default_llseek, +}; + +static int opal_lpc_debugfs_create_type(struct dentry *folder, + const char *fname, + enum OpalLPCAddressType type) +{ + struct lpc_debugfs_entry *entry; + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + entry->lpc_type = type; + debugfs_create_file(fname, 0600, folder, entry, &lpc_fops); + return 0; +} + +static int opal_lpc_init_debugfs(void) +{ + struct dentry *root; + int rc = 0; + + if (opal_lpc_chip_id < 0) + return -ENODEV; + + root = debugfs_create_dir("lpc", powerpc_debugfs_root); + + rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO); + rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM); + rc |= opal_lpc_debugfs_create_type(root, "fw", OPAL_LPC_FW); + return rc; +} +machine_device_initcall(powernv, opal_lpc_init_debugfs); +#endif /* CONFIG_DEBUG_FS */ + +void opal_lpc_init(void) +{ + struct device_node *np; + + /* + * Look for a Power8 LPC bus tagged as "primary", + * we currently support only one though the OPAL APIs + * support any number. + */ + for_each_compatible_node(np, NULL, "ibm,power8-lpc") { + if (!of_device_is_available(np)) + continue; + if (!of_get_property(np, "primary", NULL)) + continue; + opal_lpc_chip_id = of_get_ibm_chip_id(np); + break; + } + if (opal_lpc_chip_id < 0) + return; + + /* Setup special IO ops */ + ppc_pci_io = opal_lpc_io; + isa_io_special = true; + + pr_info("OPAL: Power8 LPC bus found, chip ID %d\n", opal_lpc_chip_id); +} diff --git a/kernel/arch/powerpc/platforms/powernv/opal-memory-errors.c b/kernel/arch/powerpc/platforms/powernv/opal-memory-errors.c new file mode 100644 index 000000000..43db2136d --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -0,0 +1,147 @@ +/* + * OPAL asynchronus Memory error handling support in PowreNV. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright 2013 IBM Corporation + * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include <asm/machdep.h> +#include <asm/opal.h> +#include <asm/cputable.h> + +static int opal_mem_err_nb_init; +static LIST_HEAD(opal_memory_err_list); +static DEFINE_SPINLOCK(opal_mem_err_lock); + +struct OpalMsgNode { + struct list_head list; + struct opal_msg msg; +}; + +static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt) +{ + uint64_t paddr_start, paddr_end; + + pr_debug("%s: Retrived memory error event, type: 0x%x\n", + __func__, merr_evt->type); + switch (merr_evt->type) { + case OPAL_MEM_ERR_TYPE_RESILIENCE: + paddr_start = be64_to_cpu(merr_evt->u.resilience.physical_address_start); + paddr_end = be64_to_cpu(merr_evt->u.resilience.physical_address_end); + break; + case OPAL_MEM_ERR_TYPE_DYN_DALLOC: + paddr_start = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_start); + paddr_end = be64_to_cpu(merr_evt->u.dyn_dealloc.physical_address_end); + break; + default: + return; + } + + for (; paddr_start < paddr_end; paddr_start += PAGE_SIZE) { + memory_failure(paddr_start >> PAGE_SHIFT, 0, 0); + } +} + +static void handle_memory_error(void) +{ + unsigned long flags; + struct OpalMemoryErrorData *merr_evt; + struct OpalMsgNode *msg_node; + + spin_lock_irqsave(&opal_mem_err_lock, flags); + while (!list_empty(&opal_memory_err_list)) { + msg_node = list_entry(opal_memory_err_list.next, + struct OpalMsgNode, list); + list_del(&msg_node->list); + spin_unlock_irqrestore(&opal_mem_err_lock, flags); + + merr_evt = (struct OpalMemoryErrorData *) + &msg_node->msg.params[0]; + handle_memory_error_event(merr_evt); + kfree(msg_node); + spin_lock_irqsave(&opal_mem_err_lock, flags); + } + spin_unlock_irqrestore(&opal_mem_err_lock, flags); +} + +static void mem_error_handler(struct work_struct *work) +{ + handle_memory_error(); +} + +static DECLARE_WORK(mem_error_work, mem_error_handler); + +/* + * opal_memory_err_event - notifier handler that queues up the opal message + * to be preocessed later. + */ +static int opal_memory_err_event(struct notifier_block *nb, + unsigned long msg_type, void *msg) +{ + unsigned long flags; + struct OpalMsgNode *msg_node; + + if (msg_type != OPAL_MSG_MEM_ERR) + return 0; + + msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC); + if (!msg_node) { + pr_err("MEMORY_ERROR: out of memory, Opal message event not" + "handled\n"); + return -ENOMEM; + } + memcpy(&msg_node->msg, msg, sizeof(struct opal_msg)); + + spin_lock_irqsave(&opal_mem_err_lock, flags); + list_add(&msg_node->list, &opal_memory_err_list); + spin_unlock_irqrestore(&opal_mem_err_lock, flags); + + schedule_work(&mem_error_work); + return 0; +} + +static struct notifier_block opal_mem_err_nb = { + .notifier_call = opal_memory_err_event, + .next = NULL, + .priority = 0, +}; + +static int __init opal_mem_err_init(void) +{ + int ret; + + if (!opal_mem_err_nb_init) { + ret = opal_message_notifier_register( + OPAL_MSG_MEM_ERR, &opal_mem_err_nb); + if (ret) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, ret); + return ret; + } + opal_mem_err_nb_init = 1; + } + return 0; +} +machine_subsys_initcall(powernv, opal_mem_err_init); diff --git a/kernel/arch/powerpc/platforms/powernv/opal-msglog.c b/kernel/arch/powerpc/platforms/powernv/opal-msglog.c new file mode 100644 index 000000000..44ed78af1 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-msglog.c @@ -0,0 +1,124 @@ +/* + * PowerNV OPAL in-memory console interface + * + * Copyright 2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/io.h> +#include <asm/opal.h> +#include <linux/debugfs.h> +#include <linux/of.h> +#include <linux/types.h> +#include <asm/barrier.h> + +/* OPAL in-memory console. Defined in OPAL source at core/console.c */ +struct memcons { + __be64 magic; +#define MEMCONS_MAGIC 0x6630696567726173L + __be64 obuf_phys; + __be64 ibuf_phys; + __be32 obuf_size; + __be32 ibuf_size; + __be32 out_pos; +#define MEMCONS_OUT_POS_WRAP 0x80000000u +#define MEMCONS_OUT_POS_MASK 0x00ffffffu + __be32 in_prod; + __be32 in_cons; +}; + +static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + struct memcons *mc = bin_attr->private; + const char *conbuf; + ssize_t ret; + size_t first_read = 0; + uint32_t out_pos, avail; + + if (!mc) + return -ENODEV; + + out_pos = be32_to_cpu(ACCESS_ONCE(mc->out_pos)); + + /* Now we've read out_pos, put a barrier in before reading the new + * data it points to in conbuf. */ + smp_rmb(); + + conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys)); + + /* When the buffer has wrapped, read from the out_pos marker to the end + * of the buffer, and then read the remaining data as in the un-wrapped + * case. */ + if (out_pos & MEMCONS_OUT_POS_WRAP) { + + out_pos &= MEMCONS_OUT_POS_MASK; + avail = be32_to_cpu(mc->obuf_size) - out_pos; + + ret = memory_read_from_buffer(to, count, &pos, + conbuf + out_pos, avail); + + if (ret < 0) + goto out; + + first_read = ret; + to += first_read; + count -= first_read; + pos -= avail; + + if (count <= 0) + goto out; + } + + /* Sanity check. The firmware should not do this to us. */ + if (out_pos > be32_to_cpu(mc->obuf_size)) { + pr_err("OPAL: memory console corruption. Aborting read.\n"); + return -EINVAL; + } + + ret = memory_read_from_buffer(to, count, &pos, conbuf, out_pos); + + if (ret < 0) + goto out; + + ret += first_read; +out: + return ret; +} + +static struct bin_attribute opal_msglog_attr = { + .attr = {.name = "msglog", .mode = 0444}, + .read = opal_msglog_read +}; + +void __init opal_msglog_init(void) +{ + u64 mcaddr; + struct memcons *mc; + + if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) { + pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n"); + return; + } + + mc = phys_to_virt(mcaddr); + if (!mc) { + pr_warn("OPAL: memory console address is invalid\n"); + return; + } + + if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) { + pr_warn("OPAL: memory console version is invalid\n"); + return; + } + + opal_msglog_attr.private = mc; + + if (sysfs_create_bin_file(opal_kobj, &opal_msglog_attr) != 0) + pr_warn("OPAL: sysfs file creation failed\n"); +} diff --git a/kernel/arch/powerpc/platforms/powernv/opal-nvram.c b/kernel/arch/powerpc/platforms/powernv/opal-nvram.c new file mode 100644 index 000000000..9db4398de --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-nvram.c @@ -0,0 +1,98 @@ +/* + * PowerNV nvram code. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define DEBUG + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/of.h> + +#include <asm/opal.h> +#include <asm/nvram.h> +#include <asm/machdep.h> + +static unsigned int nvram_size; + +static ssize_t opal_nvram_size(void) +{ + return nvram_size; +} + +static ssize_t opal_nvram_read(char *buf, size_t count, loff_t *index) +{ + s64 rc; + int off; + + if (*index >= nvram_size) + return 0; + off = *index; + if ((off + count) > nvram_size) + count = nvram_size - off; + rc = opal_read_nvram(__pa(buf), count, off); + if (rc != OPAL_SUCCESS) + return -EIO; + *index += count; + return count; +} + +static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) +{ + s64 rc = OPAL_BUSY; + int off; + + if (*index >= nvram_size) + return 0; + off = *index; + if ((off + count) > nvram_size) + count = nvram_size - off; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_write_nvram(__pa(buf), count, off); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + } + *index += count; + return count; +} + +static int __init opal_nvram_init_log_partitions(void) +{ + /* Scan nvram for partitions */ + nvram_scan_partitions(); + nvram_init_oops_partition(0); + return 0; +} +machine_arch_initcall(powernv, opal_nvram_init_log_partitions); + +void __init opal_nvram_init(void) +{ + struct device_node *np; + const __be32 *nbytes_p; + + np = of_find_compatible_node(NULL, NULL, "ibm,opal-nvram"); + if (np == NULL) + return; + + nbytes_p = of_get_property(np, "#bytes", NULL); + if (!nbytes_p) { + of_node_put(np); + return; + } + nvram_size = be32_to_cpup(nbytes_p); + + pr_info("OPAL nvram setup, %u bytes\n", nvram_size); + of_node_put(np); + + ppc_md.nvram_read = opal_nvram_read; + ppc_md.nvram_write = opal_nvram_write; + ppc_md.nvram_size = opal_nvram_size; +} + diff --git a/kernel/arch/powerpc/platforms/powernv/opal-power.c b/kernel/arch/powerpc/platforms/powernv/opal-power.c new file mode 100644 index 000000000..ac46c2c24 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-power.c @@ -0,0 +1,66 @@ +/* + * PowerNV OPAL power control for graceful shutdown handling + * + * Copyright 2015 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/reboot.h> +#include <linux/notifier.h> + +#include <asm/opal.h> +#include <asm/machdep.h> + +#define SOFT_OFF 0x00 +#define SOFT_REBOOT 0x01 + +static int opal_power_control_event(struct notifier_block *nb, + unsigned long msg_type, void *msg) +{ + struct opal_msg *power_msg = msg; + uint64_t type; + + type = be64_to_cpu(power_msg->params[0]); + + switch (type) { + case SOFT_REBOOT: + pr_info("OPAL: reboot requested\n"); + orderly_reboot(); + break; + case SOFT_OFF: + pr_info("OPAL: poweroff requested\n"); + orderly_poweroff(true); + break; + default: + pr_err("OPAL: power control type unexpected %016llx\n", type); + } + + return 0; +} + +static struct notifier_block opal_power_control_nb = { + .notifier_call = opal_power_control_event, + .next = NULL, + .priority = 0, +}; + +static int __init opal_power_control_init(void) +{ + int ret; + + ret = opal_message_notifier_register(OPAL_MSG_SHUTDOWN, + &opal_power_control_nb); + if (ret) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, ret); + return ret; + } + + return 0; +} +machine_subsys_initcall(powernv, opal_power_control_init); diff --git a/kernel/arch/powerpc/platforms/powernv/opal-rtc.c b/kernel/arch/powerpc/platforms/powernv/opal-rtc.c new file mode 100644 index 000000000..37dbee157 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-rtc.c @@ -0,0 +1,87 @@ +/* + * PowerNV Real Time Clock. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <linux/kernel.h> +#include <linux/time.h> +#include <linux/bcd.h> +#include <linux/rtc.h> +#include <linux/delay.h> +#include <linux/platform_device.h> +#include <linux/of_platform.h> + +#include <asm/opal.h> +#include <asm/firmware.h> +#include <asm/machdep.h> + +static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm) +{ + tm->tm_year = ((bcd2bin(y_m_d >> 24) * 100) + + bcd2bin((y_m_d >> 16) & 0xff)) - 1900; + tm->tm_mon = bcd2bin((y_m_d >> 8) & 0xff) - 1; + tm->tm_mday = bcd2bin(y_m_d & 0xff); + tm->tm_hour = bcd2bin((h_m_s_ms >> 56) & 0xff); + tm->tm_min = bcd2bin((h_m_s_ms >> 48) & 0xff); + tm->tm_sec = bcd2bin((h_m_s_ms >> 40) & 0xff); + + GregorianDay(tm); +} + +unsigned long __init opal_get_boot_time(void) +{ + struct rtc_time tm; + u32 y_m_d; + u64 h_m_s_ms; + __be32 __y_m_d; + __be64 __h_m_s_ms; + long rc = OPAL_BUSY; + + if (!opal_check_token(OPAL_RTC_READ)) + return 0; + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + else + mdelay(10); + } + if (rc != OPAL_SUCCESS) + return 0; + + y_m_d = be32_to_cpu(__y_m_d); + h_m_s_ms = be64_to_cpu(__h_m_s_ms); + opal_to_tm(y_m_d, h_m_s_ms, &tm); + return mktime(tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); +} + +static __init int opal_time_init(void) +{ + struct platform_device *pdev; + struct device_node *rtc; + + rtc = of_find_node_by_path("/ibm,opal/rtc"); + if (rtc) { + pdev = of_platform_device_create(rtc, "opal-rtc", NULL); + of_node_put(rtc); + } else { + if (opal_check_token(OPAL_RTC_READ) || + opal_check_token(OPAL_READ_TPO)) + pdev = platform_device_register_simple("opal-rtc", -1, + NULL, 0); + else + return -ENODEV; + } + + return PTR_ERR_OR_ZERO(pdev); +} +machine_subsys_initcall(powernv, opal_time_init); diff --git a/kernel/arch/powerpc/platforms/powernv/opal-sensor.c b/kernel/arch/powerpc/platforms/powernv/opal-sensor.c new file mode 100644 index 000000000..655250499 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-sensor.c @@ -0,0 +1,96 @@ +/* + * PowerNV sensor code + * + * Copyright (C) 2013 IBM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/delay.h> +#include <linux/mutex.h> +#include <linux/of_platform.h> +#include <asm/opal.h> +#include <asm/machdep.h> + +static DEFINE_MUTEX(opal_sensor_mutex); + +/* + * This will return sensor information to driver based on the requested sensor + * handle. A handle is an opaque id for the powernv, read by the driver from the + * device tree.. + */ +int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) +{ + int ret, token; + struct opal_msg msg; + __be32 data; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + pr_err("%s: Couldn't get the token, returning\n", __func__); + ret = token; + goto out; + } + + mutex_lock(&opal_sensor_mutex); + ret = opal_sensor_read(sensor_hndl, token, &data); + switch (ret) { + case OPAL_ASYNC_COMPLETION: + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_err("%s: Failed to wait for the async response, %d\n", + __func__, ret); + goto out_token; + } + + ret = opal_error_code(be64_to_cpu(msg.params[1])); + *sensor_data = be32_to_cpu(data); + break; + + case OPAL_SUCCESS: + ret = 0; + *sensor_data = be32_to_cpu(data); + break; + + default: + ret = opal_error_code(ret); + break; + } + +out_token: + mutex_unlock(&opal_sensor_mutex); + opal_async_release_token(token); +out: + return ret; +} +EXPORT_SYMBOL_GPL(opal_get_sensor_data); + +static __init int opal_sensor_init(void) +{ + struct platform_device *pdev; + struct device_node *sensor; + + sensor = of_find_node_by_path("/ibm,opal/sensors"); + if (!sensor) { + pr_err("Opal node 'sensors' not found\n"); + return -ENODEV; + } + + pdev = of_platform_device_create(sensor, "opal-sensor", NULL); + of_node_put(sensor); + + return PTR_ERR_OR_ZERO(pdev); +} +machine_subsys_initcall(powernv, opal_sensor_init); diff --git a/kernel/arch/powerpc/platforms/powernv/opal-sysparam.c b/kernel/arch/powerpc/platforms/powernv/opal-sysparam.c new file mode 100644 index 000000000..9d1acf22a --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-sysparam.c @@ -0,0 +1,304 @@ +/* + * PowerNV system parameter code + * + * Copyright (C) 2013 IBM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kobject.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <linux/gfp.h> +#include <linux/stat.h> +#include <asm/opal.h> + +#define MAX_PARAM_DATA_LEN 64 + +static DEFINE_MUTEX(opal_sysparam_mutex); +static struct kobject *sysparam_kobj; +static void *param_data_buf; + +struct param_attr { + struct list_head list; + u32 param_id; + u32 param_size; + struct kobj_attribute kobj_attr; +}; + +static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer) +{ + struct opal_msg msg; + ssize_t ret; + int token; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + if (token != -ERESTARTSYS) + pr_err("%s: Couldn't get the token, returning\n", + __func__); + ret = token; + goto out; + } + + ret = opal_get_param(token, param_id, (u64)buffer, length); + if (ret != OPAL_ASYNC_COMPLETION) + goto out_token; + + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_err("%s: Failed to wait for the async response, %zd\n", + __func__, ret); + goto out_token; + } + + ret = be64_to_cpu(msg.params[1]); + +out_token: + opal_async_release_token(token); +out: + return ret; +} + +static int opal_set_sys_param(u32 param_id, u32 length, void *buffer) +{ + struct opal_msg msg; + int ret, token; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + if (token != -ERESTARTSYS) + pr_err("%s: Couldn't get the token, returning\n", + __func__); + ret = token; + goto out; + } + + ret = opal_set_param(token, param_id, (u64)buffer, length); + + if (ret != OPAL_ASYNC_COMPLETION) + goto out_token; + + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_err("%s: Failed to wait for the async response, %d\n", + __func__, ret); + goto out_token; + } + + ret = be64_to_cpu(msg.params[1]); + +out_token: + opal_async_release_token(token); +out: + return ret; +} + +static ssize_t sys_param_show(struct kobject *kobj, + struct kobj_attribute *kobj_attr, char *buf) +{ + struct param_attr *attr = container_of(kobj_attr, struct param_attr, + kobj_attr); + ssize_t ret; + + mutex_lock(&opal_sysparam_mutex); + ret = opal_get_sys_param(attr->param_id, attr->param_size, + param_data_buf); + if (ret) + goto out; + + memcpy(buf, param_data_buf, attr->param_size); + + ret = attr->param_size; +out: + mutex_unlock(&opal_sysparam_mutex); + return ret; +} + +static ssize_t sys_param_store(struct kobject *kobj, + struct kobj_attribute *kobj_attr, const char *buf, size_t count) +{ + struct param_attr *attr = container_of(kobj_attr, struct param_attr, + kobj_attr); + ssize_t ret; + + /* MAX_PARAM_DATA_LEN is sizeof(param_data_buf) */ + if (count > MAX_PARAM_DATA_LEN) + count = MAX_PARAM_DATA_LEN; + + mutex_lock(&opal_sysparam_mutex); + memcpy(param_data_buf, buf, count); + ret = opal_set_sys_param(attr->param_id, attr->param_size, + param_data_buf); + mutex_unlock(&opal_sysparam_mutex); + if (!ret) + ret = count; + return ret; +} + +void __init opal_sys_param_init(void) +{ + struct device_node *sysparam; + struct param_attr *attr; + u32 *id, *size; + int count, i; + u8 *perm; + + if (!opal_kobj) { + pr_warn("SYSPARAM: opal kobject is not available\n"); + goto out; + } + + sysparam_kobj = kobject_create_and_add("sysparams", opal_kobj); + if (!sysparam_kobj) { + pr_err("SYSPARAM: Failed to create sysparam kobject\n"); + goto out; + } + + /* Allocate big enough buffer for any get/set transactions */ + param_data_buf = kzalloc(MAX_PARAM_DATA_LEN, GFP_KERNEL); + if (!param_data_buf) { + pr_err("SYSPARAM: Failed to allocate memory for param data " + "buf\n"); + goto out_kobj_put; + } + + sysparam = of_find_node_by_path("/ibm,opal/sysparams"); + if (!sysparam) { + pr_err("SYSPARAM: Opal sysparam node not found\n"); + goto out_param_buf; + } + + if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) { + pr_err("SYSPARAM: Opal sysparam node not compatible\n"); + goto out_node_put; + } + + /* Number of parameters exposed through DT */ + count = of_property_count_strings(sysparam, "param-name"); + if (count < 0) { + pr_err("SYSPARAM: No string found of property param-name in " + "the node %s\n", sysparam->name); + goto out_node_put; + } + + id = kzalloc(sizeof(*id) * count, GFP_KERNEL); + if (!id) { + pr_err("SYSPARAM: Failed to allocate memory to read parameter " + "id\n"); + goto out_node_put; + } + + size = kzalloc(sizeof(*size) * count, GFP_KERNEL); + if (!size) { + pr_err("SYSPARAM: Failed to allocate memory to read parameter " + "size\n"); + goto out_free_id; + } + + perm = kzalloc(sizeof(*perm) * count, GFP_KERNEL); + if (!perm) { + pr_err("SYSPARAM: Failed to allocate memory to read supported " + "action on the parameter"); + goto out_free_size; + } + + if (of_property_read_u32_array(sysparam, "param-id", id, count)) { + pr_err("SYSPARAM: Missing property param-id in the DT\n"); + goto out_free_perm; + } + + if (of_property_read_u32_array(sysparam, "param-len", size, count)) { + pr_err("SYSPARAM: Missing property param-len in the DT\n"); + goto out_free_perm; + } + + + if (of_property_read_u8_array(sysparam, "param-perm", perm, count)) { + pr_err("SYSPARAM: Missing property param-perm in the DT\n"); + goto out_free_perm; + } + + attr = kzalloc(sizeof(*attr) * count, GFP_KERNEL); + if (!attr) { + pr_err("SYSPARAM: Failed to allocate memory for parameter " + "attributes\n"); + goto out_free_perm; + } + + /* For each of the parameters, populate the parameter attributes */ + for (i = 0; i < count; i++) { + if (size[i] > MAX_PARAM_DATA_LEN) { + pr_warn("SYSPARAM: Not creating parameter %d as size " + "exceeds buffer length\n", i); + continue; + } + + sysfs_attr_init(&attr[i].kobj_attr.attr); + attr[i].param_id = id[i]; + attr[i].param_size = size[i]; + if (of_property_read_string_index(sysparam, "param-name", i, + &attr[i].kobj_attr.attr.name)) + continue; + + /* If the parameter is read-only or read-write */ + switch (perm[i] & 3) { + case OPAL_SYSPARAM_READ: + attr[i].kobj_attr.attr.mode = S_IRUGO; + break; + case OPAL_SYSPARAM_WRITE: + attr[i].kobj_attr.attr.mode = S_IWUSR; + break; + case OPAL_SYSPARAM_RW: + attr[i].kobj_attr.attr.mode = S_IRUGO | S_IWUSR; + break; + default: + break; + } + + attr[i].kobj_attr.show = sys_param_show; + attr[i].kobj_attr.store = sys_param_store; + + if (sysfs_create_file(sysparam_kobj, &attr[i].kobj_attr.attr)) { + pr_err("SYSPARAM: Failed to create sysfs file %s\n", + attr[i].kobj_attr.attr.name); + goto out_free_attr; + } + } + + kfree(perm); + kfree(size); + kfree(id); + of_node_put(sysparam); + return; + +out_free_attr: + kfree(attr); +out_free_perm: + kfree(perm); +out_free_size: + kfree(size); +out_free_id: + kfree(id); +out_node_put: + of_node_put(sysparam); +out_param_buf: + kfree(param_data_buf); +out_kobj_put: + kobject_put(sysparam_kobj); +out: + return; +} diff --git a/kernel/arch/powerpc/platforms/powernv/opal-tracepoints.c b/kernel/arch/powerpc/platforms/powernv/opal-tracepoints.c new file mode 100644 index 000000000..e11273b23 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-tracepoints.c @@ -0,0 +1,84 @@ +#include <linux/percpu.h> +#include <linux/jump_label.h> +#include <asm/trace.h> + +#ifdef HAVE_JUMP_LABEL +struct static_key opal_tracepoint_key = STATIC_KEY_INIT; + +void opal_tracepoint_regfunc(void) +{ + static_key_slow_inc(&opal_tracepoint_key); +} + +void opal_tracepoint_unregfunc(void) +{ + static_key_slow_dec(&opal_tracepoint_key); +} +#else +/* + * We optimise OPAL calls by placing opal_tracepoint_refcount + * directly in the TOC so we can check if the opal tracepoints are + * enabled via a single load. + */ + +/* NB: reg/unreg are called while guarded with the tracepoints_mutex */ +extern long opal_tracepoint_refcount; + +void opal_tracepoint_regfunc(void) +{ + opal_tracepoint_refcount++; +} + +void opal_tracepoint_unregfunc(void) +{ + opal_tracepoint_refcount--; +} +#endif + +/* + * Since the tracing code might execute OPAL calls we need to guard against + * recursion. + */ +static DEFINE_PER_CPU(unsigned int, opal_trace_depth); + +void __trace_opal_entry(unsigned long opcode, unsigned long *args) +{ + unsigned long flags; + unsigned int *depth; + + local_irq_save(flags); + + depth = this_cpu_ptr(&opal_trace_depth); + + if (*depth) + goto out; + + (*depth)++; + preempt_disable(); + trace_opal_entry(opcode, args); + (*depth)--; + +out: + local_irq_restore(flags); +} + +void __trace_opal_exit(long opcode, unsigned long retval) +{ + unsigned long flags; + unsigned int *depth; + + local_irq_save(flags); + + depth = this_cpu_ptr(&opal_trace_depth); + + if (*depth) + goto out; + + (*depth)++; + trace_opal_exit(opcode, retval); + preempt_enable(); + (*depth)--; + +out: + local_irq_restore(flags); +} diff --git a/kernel/arch/powerpc/platforms/powernv/opal-wrappers.S b/kernel/arch/powerpc/platforms/powernv/opal-wrappers.S new file mode 100644 index 000000000..a7ade94cd --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -0,0 +1,297 @@ +/* + * PowerNV OPAL API wrappers + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/jump_label.h> +#include <asm/ppc_asm.h> +#include <asm/hvcall.h> +#include <asm/asm-offsets.h> +#include <asm/opal.h> + + .section ".text" + +#ifdef CONFIG_TRACEPOINTS +#ifdef HAVE_JUMP_LABEL +#define OPAL_BRANCH(LABEL) \ + ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key) +#else + + .section ".toc","aw" + + .globl opal_tracepoint_refcount +opal_tracepoint_refcount: + .llong 0 + + .section ".text" + +/* + * We branch around this in early init by using an unconditional cpu + * feature. + */ +#define OPAL_BRANCH(LABEL) \ +BEGIN_FTR_SECTION; \ + b 1f; \ +END_FTR_SECTION(0, 1); \ + ld r12,opal_tracepoint_refcount@toc(r2); \ + cmpdi r12,0; \ + bne- LABEL; \ +1: + +#endif + +#else +#define OPAL_BRANCH(LABEL) +#endif + +/* TODO: + * + * - Trace irqs in/off (needs saving/restoring all args, argh...) + * - Get r11 feed up by Dave so I can have better register usage + */ + +#define OPAL_CALL(name, token) \ + _GLOBAL_TOC(name); \ + mflr r0; \ + std r0,16(r1); \ + li r0,token; \ + OPAL_BRANCH(opal_tracepoint_entry) \ + mfcr r12; \ + stw r12,8(r1); \ + std r1,PACAR1(r13); \ + li r11,0; \ + mfmsr r12; \ + ori r11,r11,MSR_EE; \ + std r12,PACASAVEDMSR(r13); \ + andc r12,r12,r11; \ + mtmsrd r12,1; \ + LOAD_REG_ADDR(r11,opal_return); \ + mtlr r11; \ + li r11,MSR_DR|MSR_IR|MSR_LE;\ + andc r12,r12,r11; \ + mtspr SPRN_HSRR1,r12; \ + LOAD_REG_ADDR(r11,opal); \ + ld r12,8(r11); \ + ld r2,0(r11); \ + mtspr SPRN_HSRR0,r12; \ + hrfid + +opal_return: + /* + * Fixup endian on OPAL return... we should be able to simplify + * this by instead converting the below trampoline to a set of + * bytes (always BE) since MSR:LE will end up fixed up as a side + * effect of the rfid. + */ + FIXUP_ENDIAN + ld r2,PACATOC(r13); + lwz r4,8(r1); + ld r5,16(r1); + ld r6,PACASAVEDMSR(r13); + mtspr SPRN_SRR0,r5; + mtspr SPRN_SRR1,r6; + mtcr r4; + rfid + +#ifdef CONFIG_TRACEPOINTS +opal_tracepoint_entry: + stdu r1,-STACKFRAMESIZE(r1) + std r0,STK_REG(R23)(r1) + std r3,STK_REG(R24)(r1) + std r4,STK_REG(R25)(r1) + std r5,STK_REG(R26)(r1) + std r6,STK_REG(R27)(r1) + std r7,STK_REG(R28)(r1) + std r8,STK_REG(R29)(r1) + std r9,STK_REG(R30)(r1) + std r10,STK_REG(R31)(r1) + mr r3,r0 + addi r4,r1,STK_REG(R24) + bl __trace_opal_entry + ld r0,STK_REG(R23)(r1) + ld r3,STK_REG(R24)(r1) + ld r4,STK_REG(R25)(r1) + ld r5,STK_REG(R26)(r1) + ld r6,STK_REG(R27)(r1) + ld r7,STK_REG(R28)(r1) + ld r8,STK_REG(R29)(r1) + ld r9,STK_REG(R30)(r1) + ld r10,STK_REG(R31)(r1) + LOAD_REG_ADDR(r11,opal_tracepoint_return) + mfcr r12 + std r11,16(r1) + stw r12,8(r1) + std r1,PACAR1(r13) + li r11,0 + mfmsr r12 + ori r11,r11,MSR_EE + std r12,PACASAVEDMSR(r13) + andc r12,r12,r11 + mtmsrd r12,1 + LOAD_REG_ADDR(r11,opal_return) + mtlr r11 + li r11,MSR_DR|MSR_IR|MSR_LE + andc r12,r12,r11 + mtspr SPRN_HSRR1,r12 + LOAD_REG_ADDR(r11,opal) + ld r12,8(r11) + ld r2,0(r11) + mtspr SPRN_HSRR0,r12 + hrfid + +opal_tracepoint_return: + std r3,STK_REG(R31)(r1) + mr r4,r3 + ld r0,STK_REG(R23)(r1) + bl __trace_opal_exit + ld r3,STK_REG(R31)(r1) + addi r1,r1,STACKFRAMESIZE + ld r0,16(r1) + mtlr r0 + blr +#endif + +/* + * Make opal call in realmode. This is a generic function to be called + * from realmode. It handles endianness. + * + * r13 - paca pointer + * r1 - stack pointer + * r0 - opal token + */ +_GLOBAL(opal_call_realmode) + mflr r12 + std r12,PPC_LR_STKOFF(r1) + ld r2,PACATOC(r13) + /* Set opal return address */ + LOAD_REG_ADDR(r12,return_from_opal_call) + mtlr r12 + + mfmsr r12 +#ifdef __LITTLE_ENDIAN__ + /* Handle endian-ness */ + li r11,MSR_LE + andc r12,r12,r11 +#endif + mtspr SPRN_HSRR1,r12 + LOAD_REG_ADDR(r11,opal) + ld r12,8(r11) + ld r2,0(r11) + mtspr SPRN_HSRR0,r12 + hrfid + +return_from_opal_call: +#ifdef __LITTLE_ENDIAN__ + FIXUP_ENDIAN +#endif + ld r12,PPC_LR_STKOFF(r1) + mtlr r12 + blr + +OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); +OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); +OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); +OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); +OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); +OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); +OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); +OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); +OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); +OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); +OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); +OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); +OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); +OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); +OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE); +OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); +OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); +OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); +OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); +OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); +OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); +OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); +OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); +OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); +OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); +OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET); +OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT); +OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); +OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); +OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); +OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); +OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); +OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); +OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); +OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE); +OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE); +OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); +OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); +OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); +OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); +OPAL_CALL(opal_start_cpu, OPAL_START_CPU); +OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); +OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); +OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); +OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); +OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); +OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA); +OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA); +OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB); +OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT); +OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); +OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); +OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); +OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); +OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR); +OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL); +OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI); +OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2); +OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ); +OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE); +OPAL_CALL(opal_lpc_read, OPAL_LPC_READ); +OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE); +OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU); +OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS); +OPAL_CALL(opal_read_elog, OPAL_ELOG_READ); +OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK); +OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE); +OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND); +OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE); +OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); +OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); +OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); +OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); +OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN); +OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); +OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); +OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2); +OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); +OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); +OPAL_CALL(opal_get_msg, OPAL_GET_MSG); +OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); +OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); +OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); +OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); +OPAL_CALL(opal_get_param, OPAL_GET_PARAM); +OPAL_CALL(opal_set_param, OPAL_SET_PARAM); +OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); +OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); +OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); +OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); +OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE); +OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO); +OPAL_CALL(opal_tpo_read, OPAL_READ_TPO); +OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND); +OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV); +OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); +OPAL_CALL(opal_flash_read, OPAL_FLASH_READ); +OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE); +OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); diff --git a/kernel/arch/powerpc/platforms/powernv/opal-xscom.c b/kernel/arch/powerpc/platforms/powernv/opal-xscom.c new file mode 100644 index 000000000..7634d1c62 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal-xscom.c @@ -0,0 +1,133 @@ +/* + * PowerNV LPC bus handling. + * + * Copyright 2013 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/bug.h> +#include <linux/gfp.h> +#include <linux/slab.h> + +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/opal.h> +#include <asm/scom.h> + +/* + * We could probably fit that inside the scom_map_t + * which is a void* after all but it's really too ugly + * so let's kmalloc it for now + */ +struct opal_scom_map { + uint32_t chip; + uint64_t addr; +}; + +static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count) +{ + struct opal_scom_map *m; + const __be32 *gcid; + + if (!of_get_property(dev, "scom-controller", NULL)) { + pr_err("%s: device %s is not a SCOM controller\n", + __func__, dev->full_name); + return SCOM_MAP_INVALID; + } + gcid = of_get_property(dev, "ibm,chip-id", NULL); + if (!gcid) { + pr_err("%s: device %s has no ibm,chip-id\n", + __func__, dev->full_name); + return SCOM_MAP_INVALID; + } + m = kmalloc(sizeof(struct opal_scom_map), GFP_KERNEL); + if (!m) + return NULL; + m->chip = be32_to_cpup(gcid); + m->addr = reg; + + return (scom_map_t)m; +} + +static void opal_scom_unmap(scom_map_t map) +{ + kfree(map); +} + +static int opal_xscom_err_xlate(int64_t rc) +{ + switch(rc) { + case 0: + return 0; + /* Add more translations if necessary */ + default: + return -EIO; + } +} + +static u64 opal_scom_unmangle(u64 addr) +{ + /* + * XSCOM indirect addresses have the top bit set. Additionally + * the rest of the top 3 nibbles is always 0. + * + * Because the debugfs interface uses signed offsets and shifts + * the address left by 3, we basically cannot use the top 4 bits + * of the 64-bit address, and thus cannot use the indirect bit. + * + * To deal with that, we support the indirect bit being in bit + * 4 (IBM notation) instead of bit 0 in this API, we do the + * conversion here. To leave room for further xscom address + * expansion, we only clear out the top byte + * + * For in-kernel use, we also support the real indirect bit, so + * we test for any of the top 5 bits + * + */ + if (addr & (0x1full << 59)) + addr = (addr & ~(0xffull << 56)) | (1ull << 63); + return addr; +} + +static int opal_scom_read(scom_map_t map, u64 reg, u64 *value) +{ + struct opal_scom_map *m = map; + int64_t rc; + __be64 v; + + reg = opal_scom_unmangle(m->addr + reg); + rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v)); + *value = be64_to_cpu(v); + return opal_xscom_err_xlate(rc); +} + +static int opal_scom_write(scom_map_t map, u64 reg, u64 value) +{ + struct opal_scom_map *m = map; + int64_t rc; + + reg = opal_scom_unmangle(m->addr + reg); + rc = opal_xscom_write(m->chip, reg, value); + return opal_xscom_err_xlate(rc); +} + +static const struct scom_controller opal_scom_controller = { + .map = opal_scom_map, + .unmap = opal_scom_unmap, + .read = opal_scom_read, + .write = opal_scom_write +}; + +static int opal_xscom_init(void) +{ + if (firmware_has_feature(FW_FEATURE_OPALv3)) + scom_init(&opal_scom_controller); + return 0; +} +machine_arch_initcall(powernv, opal_xscom_init); diff --git a/kernel/arch/powerpc/platforms/powernv/opal.c b/kernel/arch/powerpc/platforms/powernv/opal.c new file mode 100644 index 000000000..2241565b0 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/opal.c @@ -0,0 +1,972 @@ +/* + * PowerNV OPAL high level interfaces + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "opal: " fmt + +#include <linux/printk.h> +#include <linux/types.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/of_platform.h> +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/kobject.h> +#include <linux/delay.h> +#include <linux/memblock.h> +#include <linux/kthread.h> +#include <linux/freezer.h> + +#include <asm/machdep.h> +#include <asm/opal.h> +#include <asm/firmware.h> +#include <asm/mce.h> + +#include "powernv.h" + +/* /sys/firmware/opal */ +struct kobject *opal_kobj; + +struct opal { + u64 base; + u64 entry; + u64 size; +} opal; + +struct mcheck_recoverable_range { + u64 start_addr; + u64 end_addr; + u64 recover_addr; +}; + +static struct mcheck_recoverable_range *mc_recoverable_range; +static int mc_recoverable_range_len; + +struct device_node *opal_node; +static DEFINE_SPINLOCK(opal_write_lock); +static unsigned int *opal_irqs; +static unsigned int opal_irq_count; +static ATOMIC_NOTIFIER_HEAD(opal_notifier_head); +static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; +static DEFINE_SPINLOCK(opal_notifier_lock); +static uint64_t last_notified_mask = 0x0ul; +static atomic_t opal_notifier_hold = ATOMIC_INIT(0); +static uint32_t opal_heartbeat; + +static void opal_reinit_cores(void) +{ + /* Do the actual re-init, This will clobber all FPRs, VRs, etc... + * + * It will preserve non volatile GPRs and HSPRG0/1. It will + * also restore HIDs and other SPRs to their original value + * but it might clobber a bunch. + */ +#ifdef __BIG_ENDIAN__ + opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE); +#else + opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE); +#endif +} + +int __init early_init_dt_scan_opal(unsigned long node, + const char *uname, int depth, void *data) +{ + const void *basep, *entryp, *sizep; + int basesz, entrysz, runtimesz; + + if (depth != 1 || strcmp(uname, "ibm,opal") != 0) + return 0; + + basep = of_get_flat_dt_prop(node, "opal-base-address", &basesz); + entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz); + sizep = of_get_flat_dt_prop(node, "opal-runtime-size", &runtimesz); + + if (!basep || !entryp || !sizep) + return 1; + + opal.base = of_read_number(basep, basesz/4); + opal.entry = of_read_number(entryp, entrysz/4); + opal.size = of_read_number(sizep, runtimesz/4); + + pr_debug("OPAL Base = 0x%llx (basep=%p basesz=%d)\n", + opal.base, basep, basesz); + pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%d)\n", + opal.entry, entryp, entrysz); + pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%d)\n", + opal.size, sizep, runtimesz); + + powerpc_firmware_features |= FW_FEATURE_OPAL; + if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) { + powerpc_firmware_features |= FW_FEATURE_OPALv2; + powerpc_firmware_features |= FW_FEATURE_OPALv3; + pr_info("OPAL V3 detected !\n"); + } else if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) { + powerpc_firmware_features |= FW_FEATURE_OPALv2; + pr_info("OPAL V2 detected !\n"); + } else { + pr_info("OPAL V1 detected !\n"); + } + + /* Reinit all cores with the right endian */ + opal_reinit_cores(); + + /* Restore some bits */ + if (cur_cpu_spec->cpu_restore) + cur_cpu_spec->cpu_restore(); + + return 1; +} + +int __init early_init_dt_scan_recoverable_ranges(unsigned long node, + const char *uname, int depth, void *data) +{ + int i, psize, size; + const __be32 *prop; + + if (depth != 1 || strcmp(uname, "ibm,opal") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "mcheck-recoverable-ranges", &psize); + + if (!prop) + return 1; + + pr_debug("Found machine check recoverable ranges.\n"); + + /* + * Calculate number of available entries. + * + * Each recoverable address range entry is (start address, len, + * recovery address), 2 cells each for start and recovery address, + * 1 cell for len, totalling 5 cells per entry. + */ + mc_recoverable_range_len = psize / (sizeof(*prop) * 5); + + /* Sanity check */ + if (!mc_recoverable_range_len) + return 1; + + /* Size required to hold all the entries. */ + size = mc_recoverable_range_len * + sizeof(struct mcheck_recoverable_range); + + /* + * Allocate a buffer to hold the MC recoverable ranges. We would be + * accessing them in real mode, hence it needs to be within + * RMO region. + */ + mc_recoverable_range =__va(memblock_alloc_base(size, __alignof__(u64), + ppc64_rma_size)); + memset(mc_recoverable_range, 0, size); + + for (i = 0; i < mc_recoverable_range_len; i++) { + mc_recoverable_range[i].start_addr = + of_read_number(prop + (i * 5) + 0, 2); + mc_recoverable_range[i].end_addr = + mc_recoverable_range[i].start_addr + + of_read_number(prop + (i * 5) + 2, 1); + mc_recoverable_range[i].recover_addr = + of_read_number(prop + (i * 5) + 3, 2); + + pr_debug("Machine check recoverable range: %llx..%llx: %llx\n", + mc_recoverable_range[i].start_addr, + mc_recoverable_range[i].end_addr, + mc_recoverable_range[i].recover_addr); + } + return 1; +} + +static int __init opal_register_exception_handlers(void) +{ +#ifdef __BIG_ENDIAN__ + u64 glue; + + if (!(powerpc_firmware_features & FW_FEATURE_OPAL)) + return -ENODEV; + + /* Hookup some exception handlers except machine check. We use the + * fwnmi area at 0x7000 to provide the glue space to OPAL + */ + glue = 0x7000; + + /* + * Check if we are running on newer firmware that exports + * OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to patch + * the HMI interrupt and we catch it directly in Linux. + * + * For older firmware (i.e currently released POWER8 System Firmware + * as of today <= SV810_087), we fallback to old behavior and let OPAL + * patch the HMI vector and handle it inside OPAL firmware. + * + * For newer firmware (in development/yet to be released) we will + * start catching/handling HMI directly in Linux. + */ + if (!opal_check_token(OPAL_HANDLE_HMI)) { + pr_info("Old firmware detected, OPAL handles HMIs.\n"); + opal_register_exception_handler( + OPAL_HYPERVISOR_MAINTENANCE_HANDLER, + 0, glue); + glue += 128; + } + + opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue); +#endif + + return 0; +} +machine_early_initcall(powernv, opal_register_exception_handlers); + +int opal_notifier_register(struct notifier_block *nb) +{ + if (!nb) { + pr_warning("%s: Invalid argument (%p)\n", + __func__, nb); + return -EINVAL; + } + + atomic_notifier_chain_register(&opal_notifier_head, nb); + return 0; +} +EXPORT_SYMBOL_GPL(opal_notifier_register); + +int opal_notifier_unregister(struct notifier_block *nb) +{ + if (!nb) { + pr_warning("%s: Invalid argument (%p)\n", + __func__, nb); + return -EINVAL; + } + + atomic_notifier_chain_unregister(&opal_notifier_head, nb); + return 0; +} +EXPORT_SYMBOL_GPL(opal_notifier_unregister); + +static void opal_do_notifier(uint64_t events) +{ + unsigned long flags; + uint64_t changed_mask; + + if (atomic_read(&opal_notifier_hold)) + return; + + spin_lock_irqsave(&opal_notifier_lock, flags); + changed_mask = last_notified_mask ^ events; + last_notified_mask = events; + spin_unlock_irqrestore(&opal_notifier_lock, flags); + + /* + * We feed with the event bits and changed bits for + * enough information to the callback. + */ + atomic_notifier_call_chain(&opal_notifier_head, + events, (void *)changed_mask); +} + +void opal_notifier_update_evt(uint64_t evt_mask, + uint64_t evt_val) +{ + unsigned long flags; + + spin_lock_irqsave(&opal_notifier_lock, flags); + last_notified_mask &= ~evt_mask; + last_notified_mask |= evt_val; + spin_unlock_irqrestore(&opal_notifier_lock, flags); +} + +void opal_notifier_enable(void) +{ + int64_t rc; + __be64 evt = 0; + + atomic_set(&opal_notifier_hold, 0); + + /* Process pending events */ + rc = opal_poll_events(&evt); + if (rc == OPAL_SUCCESS && evt) + opal_do_notifier(be64_to_cpu(evt)); +} + +void opal_notifier_disable(void) +{ + atomic_set(&opal_notifier_hold, 1); +} + +/* + * Opal message notifier based on message type. Allow subscribers to get + * notified for specific messgae type. + */ +int opal_message_notifier_register(enum opal_msg_type msg_type, + struct notifier_block *nb) +{ + if (!nb || msg_type >= OPAL_MSG_TYPE_MAX) { + pr_warning("%s: Invalid arguments, msg_type:%d\n", + __func__, msg_type); + return -EINVAL; + } + + return atomic_notifier_chain_register( + &opal_msg_notifier_head[msg_type], nb); +} + +int opal_message_notifier_unregister(enum opal_msg_type msg_type, + struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister( + &opal_msg_notifier_head[msg_type], nb); +} + +static void opal_message_do_notify(uint32_t msg_type, void *msg) +{ + /* notify subscribers */ + atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type], + msg_type, msg); +} + +static void opal_handle_message(void) +{ + s64 ret; + /* + * TODO: pre-allocate a message buffer depending on opal-msg-size + * value in /proc/device-tree. + */ + static struct opal_msg msg; + u32 type; + + ret = opal_get_msg(__pa(&msg), sizeof(msg)); + /* No opal message pending. */ + if (ret == OPAL_RESOURCE) + return; + + /* check for errors. */ + if (ret) { + pr_warning("%s: Failed to retrieve opal message, err=%lld\n", + __func__, ret); + return; + } + + type = be32_to_cpu(msg.msg_type); + + /* Sanity check */ + if (type >= OPAL_MSG_TYPE_MAX) { + pr_warning("%s: Unknown message type: %u\n", __func__, type); + return; + } + opal_message_do_notify(type, (void *)&msg); +} + +static int opal_message_notify(struct notifier_block *nb, + unsigned long events, void *change) +{ + if (events & OPAL_EVENT_MSG_PENDING) + opal_handle_message(); + return 0; +} + +static struct notifier_block opal_message_nb = { + .notifier_call = opal_message_notify, + .next = NULL, + .priority = 0, +}; + +static int __init opal_message_init(void) +{ + int ret, i; + + for (i = 0; i < OPAL_MSG_TYPE_MAX; i++) + ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]); + + ret = opal_notifier_register(&opal_message_nb); + if (ret) { + pr_err("%s: Can't register OPAL event notifier (%d)\n", + __func__, ret); + return ret; + } + return 0; +} +machine_early_initcall(powernv, opal_message_init); + +int opal_get_chars(uint32_t vtermno, char *buf, int count) +{ + s64 rc; + __be64 evt, len; + + if (!opal.entry) + return -ENODEV; + opal_poll_events(&evt); + if ((be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_INPUT) == 0) + return 0; + len = cpu_to_be64(count); + rc = opal_console_read(vtermno, &len, buf); + if (rc == OPAL_SUCCESS) + return be64_to_cpu(len); + return 0; +} + +int opal_put_chars(uint32_t vtermno, const char *data, int total_len) +{ + int written = 0; + __be64 olen; + s64 len, rc; + unsigned long flags; + __be64 evt; + + if (!opal.entry) + return -ENODEV; + + /* We want put_chars to be atomic to avoid mangling of hvsi + * packets. To do that, we first test for room and return + * -EAGAIN if there isn't enough. + * + * Unfortunately, opal_console_write_buffer_space() doesn't + * appear to work on opal v1, so we just assume there is + * enough room and be done with it + */ + spin_lock_irqsave(&opal_write_lock, flags); + if (firmware_has_feature(FW_FEATURE_OPALv2)) { + rc = opal_console_write_buffer_space(vtermno, &olen); + len = be64_to_cpu(olen); + if (rc || len < total_len) { + spin_unlock_irqrestore(&opal_write_lock, flags); + /* Closed -> drop characters */ + if (rc) + return total_len; + opal_poll_events(NULL); + return -EAGAIN; + } + } + + /* We still try to handle partial completions, though they + * should no longer happen. + */ + rc = OPAL_BUSY; + while(total_len > 0 && (rc == OPAL_BUSY || + rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) { + olen = cpu_to_be64(total_len); + rc = opal_console_write(vtermno, &olen, data); + len = be64_to_cpu(olen); + + /* Closed or other error drop */ + if (rc != OPAL_SUCCESS && rc != OPAL_BUSY && + rc != OPAL_BUSY_EVENT) { + written = total_len; + break; + } + if (rc == OPAL_SUCCESS) { + total_len -= len; + data += len; + written += len; + } + /* This is a bit nasty but we need that for the console to + * flush when there aren't any interrupts. We will clean + * things a bit later to limit that to synchronous path + * such as the kernel console and xmon/udbg + */ + do + opal_poll_events(&evt); + while(rc == OPAL_SUCCESS && + (be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT)); + } + spin_unlock_irqrestore(&opal_write_lock, flags); + return written; +} + +static int opal_recover_mce(struct pt_regs *regs, + struct machine_check_event *evt) +{ + int recovered = 0; + uint64_t ea = get_mce_fault_addr(evt); + + if (!(regs->msr & MSR_RI)) { + /* If MSR_RI isn't set, we cannot recover */ + recovered = 0; + } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { + /* Platform corrected itself */ + recovered = 1; + } else if (ea && !is_kernel_addr(ea)) { + /* + * Faulting address is not in kernel text. We should be fine. + * We need to find which process uses this address. + * For now, kill the task if we have received exception when + * in userspace. + * + * TODO: Queue up this address for hwpoisioning later. + */ + if (user_mode(regs) && !is_global_init(current)) { + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } else + recovered = 0; + } else if (user_mode(regs) && !is_global_init(current) && + evt->severity == MCE_SEV_ERROR_SYNC) { + /* + * If we have received a synchronous error when in userspace + * kill the task. + */ + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } + return recovered; +} + +int opal_machine_check(struct pt_regs *regs) +{ + struct machine_check_event evt; + + if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) + return 0; + + /* Print things out */ + if (evt.version != MCE_V1) { + pr_err("Machine Check Exception, Unknown event version %d !\n", + evt.version); + return 0; + } + machine_check_print_event_info(&evt); + + if (opal_recover_mce(regs, &evt)) + return 1; + return 0; +} + +/* Early hmi handler called in real mode. */ +int opal_hmi_exception_early(struct pt_regs *regs) +{ + s64 rc; + + /* + * call opal hmi handler. Pass paca address as token. + * The return value OPAL_SUCCESS is an indication that there is + * an HMI event generated waiting to pull by Linux. + */ + rc = opal_handle_hmi(); + if (rc == OPAL_SUCCESS) { + local_paca->hmi_event_available = 1; + return 1; + } + return 0; +} + +/* HMI exception handler called in virtual mode during check_irq_replay. */ +int opal_handle_hmi_exception(struct pt_regs *regs) +{ + s64 rc; + __be64 evt = 0; + + /* + * Check if HMI event is available. + * if Yes, then call opal_poll_events to pull opal messages and + * process them. + */ + if (!local_paca->hmi_event_available) + return 0; + + local_paca->hmi_event_available = 0; + rc = opal_poll_events(&evt); + if (rc == OPAL_SUCCESS && evt) + opal_do_notifier(be64_to_cpu(evt)); + + return 1; +} + +static uint64_t find_recovery_address(uint64_t nip) +{ + int i; + + for (i = 0; i < mc_recoverable_range_len; i++) + if ((nip >= mc_recoverable_range[i].start_addr) && + (nip < mc_recoverable_range[i].end_addr)) + return mc_recoverable_range[i].recover_addr; + return 0; +} + +bool opal_mce_check_early_recovery(struct pt_regs *regs) +{ + uint64_t recover_addr = 0; + + if (!opal.base || !opal.size) + goto out; + + if ((regs->nip >= opal.base) && + (regs->nip <= (opal.base + opal.size))) + recover_addr = find_recovery_address(regs->nip); + + /* + * Setup regs->nip to rfi into fixup address. + */ + if (recover_addr) + regs->nip = recover_addr; + +out: + return !!recover_addr; +} + +static irqreturn_t opal_interrupt(int irq, void *data) +{ + __be64 events; + + opal_handle_interrupt(virq_to_hw(irq), &events); + + opal_do_notifier(be64_to_cpu(events)); + + return IRQ_HANDLED; +} + +static int opal_sysfs_init(void) +{ + opal_kobj = kobject_create_and_add("opal", firmware_kobj); + if (!opal_kobj) { + pr_warn("kobject_create_and_add opal failed\n"); + return -ENOMEM; + } + + return 0; +} + +static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + return memory_read_from_buffer(buf, count, &off, bin_attr->private, + bin_attr->size); +} + +static BIN_ATTR_RO(symbol_map, 0); + +static void opal_export_symmap(void) +{ + const __be64 *syms; + unsigned int size; + struct device_node *fw; + int rc; + + fw = of_find_node_by_path("/ibm,opal/firmware"); + if (!fw) + return; + syms = of_get_property(fw, "symbol-map", &size); + if (!syms || size != 2 * sizeof(__be64)) + return; + + /* Setup attributes */ + bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0])); + bin_attr_symbol_map.size = be64_to_cpu(syms[1]); + + rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map); + if (rc) + pr_warn("Error %d creating OPAL symbols file\n", rc); +} + +static void __init opal_dump_region_init(void) +{ + void *addr; + uint64_t size; + int rc; + + if (!opal_check_token(OPAL_REGISTER_DUMP_REGION)) + return; + + /* Register kernel log buffer */ + addr = log_buf_addr_get(); + if (addr == NULL) + return; + + size = log_buf_len_get(); + if (size == 0) + return; + + rc = opal_register_dump_region(OPAL_DUMP_REGION_LOG_BUF, + __pa(addr), size); + /* Don't warn if this is just an older OPAL that doesn't + * know about that call + */ + if (rc && rc != OPAL_UNSUPPORTED) + pr_warn("DUMP: Failed to register kernel log buffer. " + "rc = %d\n", rc); +} + +static void opal_flash_init(struct device_node *opal_node) +{ + struct device_node *np; + + for_each_child_of_node(opal_node, np) + if (of_device_is_compatible(np, "ibm,opal-flash")) + of_platform_device_create(np, NULL, NULL); +} + +static void opal_ipmi_init(struct device_node *opal_node) +{ + struct device_node *np; + + for_each_child_of_node(opal_node, np) + if (of_device_is_compatible(np, "ibm,opal-ipmi")) + of_platform_device_create(np, NULL, NULL); +} + +static void opal_i2c_create_devs(void) +{ + struct device_node *np; + + for_each_compatible_node(np, NULL, "ibm,opal-i2c") + of_platform_device_create(np, NULL, NULL); +} + +static void __init opal_irq_init(struct device_node *dn) +{ + const __be32 *irqs; + int i, irqlen; + + /* Get interrupt property */ + irqs = of_get_property(opal_node, "opal-interrupts", &irqlen); + opal_irq_count = irqs ? (irqlen / 4) : 0; + pr_debug("Found %d interrupts reserved for OPAL\n", opal_irq_count); + if (!opal_irq_count) + return; + + /* Install interrupt handlers */ + opal_irqs = kzalloc(opal_irq_count * sizeof(unsigned int), GFP_KERNEL); + for (i = 0; irqs && i < opal_irq_count; i++, irqs++) { + unsigned int irq, virq; + int rc; + + /* Get hardware and virtual IRQ */ + irq = be32_to_cpup(irqs); + virq = irq_create_mapping(NULL, irq); + if (virq == NO_IRQ) { + pr_warn("Failed to map irq 0x%x\n", irq); + continue; + } + + /* Install interrupt handler */ + rc = request_irq(virq, opal_interrupt, 0, "opal", NULL); + if (rc) { + irq_dispose_mapping(virq); + pr_warn("Error %d requesting irq %d (0x%x)\n", + rc, virq, irq); + continue; + } + + /* Cache IRQ */ + opal_irqs[i] = virq; + } +} + +static int kopald(void *unused) +{ + set_freezable(); + do { + try_to_freeze(); + opal_poll_events(NULL); + msleep_interruptible(opal_heartbeat); + } while (!kthread_should_stop()); + + return 0; +} + +static void opal_init_heartbeat(void) +{ + /* Old firwmware, we assume the HVC heartbeat is sufficient */ + if (of_property_read_u32(opal_node, "ibm,heartbeat-ms", + &opal_heartbeat) != 0) + opal_heartbeat = 0; + + if (opal_heartbeat) + kthread_run(kopald, NULL, "kopald"); +} + +static int __init opal_init(void) +{ + struct device_node *np, *consoles; + int rc; + + opal_node = of_find_node_by_path("/ibm,opal"); + if (!opal_node) { + pr_warn("Device node not found\n"); + return -ENODEV; + } + + /* Register OPAL consoles if any ports */ + if (firmware_has_feature(FW_FEATURE_OPALv2)) + consoles = of_find_node_by_path("/ibm,opal/consoles"); + else + consoles = of_node_get(opal_node); + if (consoles) { + for_each_child_of_node(consoles, np) { + if (strcmp(np->name, "serial")) + continue; + of_platform_device_create(np, NULL, NULL); + } + of_node_put(consoles); + } + + /* Create i2c platform devices */ + opal_i2c_create_devs(); + + /* Setup a heatbeat thread if requested by OPAL */ + opal_init_heartbeat(); + + /* Find all OPAL interrupts and request them */ + opal_irq_init(opal_node); + + /* Create "opal" kobject under /sys/firmware */ + rc = opal_sysfs_init(); + if (rc == 0) { + /* Export symbol map to userspace */ + opal_export_symmap(); + /* Setup dump region interface */ + opal_dump_region_init(); + /* Setup error log interface */ + rc = opal_elog_init(); + /* Setup code update interface */ + opal_flash_update_init(); + /* Setup platform dump extract interface */ + opal_platform_dump_init(); + /* Setup system parameters interface */ + opal_sys_param_init(); + /* Setup message log interface. */ + opal_msglog_init(); + } + + /* Initialize OPAL IPMI backend */ + opal_ipmi_init(opal_node); + + opal_flash_init(opal_node); + + return 0; +} +machine_subsys_initcall(powernv, opal_init); + +void opal_shutdown(void) +{ + unsigned int i; + long rc = OPAL_BUSY; + + /* First free interrupts, which will also mask them */ + for (i = 0; i < opal_irq_count; i++) { + if (opal_irqs[i]) + free_irq(opal_irqs[i], NULL); + opal_irqs[i] = 0; + } + + /* + * Then sync with OPAL which ensure anything that can + * potentially write to our memory has completed such + * as an ongoing dump retrieval + */ + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_sync_host_reboot(); + if (rc == OPAL_BUSY) + opal_poll_events(NULL); + else + mdelay(10); + } + + /* Unregister memory dump region */ + if (opal_check_token(OPAL_UNREGISTER_DUMP_REGION)) + opal_unregister_dump_region(OPAL_DUMP_REGION_LOG_BUF); +} + +/* Export this so that test modules can use it */ +EXPORT_SYMBOL_GPL(opal_invalid_call); +EXPORT_SYMBOL_GPL(opal_ipmi_send); +EXPORT_SYMBOL_GPL(opal_ipmi_recv); +EXPORT_SYMBOL_GPL(opal_flash_read); +EXPORT_SYMBOL_GPL(opal_flash_write); +EXPORT_SYMBOL_GPL(opal_flash_erase); + +/* Convert a region of vmalloc memory to an opal sg list */ +struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, + unsigned long vmalloc_size) +{ + struct opal_sg_list *sg, *first = NULL; + unsigned long i = 0; + + sg = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!sg) + goto nomem; + + first = sg; + + while (vmalloc_size > 0) { + uint64_t data = vmalloc_to_pfn(vmalloc_addr) << PAGE_SHIFT; + uint64_t length = min(vmalloc_size, PAGE_SIZE); + + sg->entry[i].data = cpu_to_be64(data); + sg->entry[i].length = cpu_to_be64(length); + i++; + + if (i >= SG_ENTRIES_PER_NODE) { + struct opal_sg_list *next; + + next = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!next) + goto nomem; + + sg->length = cpu_to_be64( + i * sizeof(struct opal_sg_entry) + 16); + i = 0; + sg->next = cpu_to_be64(__pa(next)); + sg = next; + } + + vmalloc_addr += length; + vmalloc_size -= length; + } + + sg->length = cpu_to_be64(i * sizeof(struct opal_sg_entry) + 16); + + return first; + +nomem: + pr_err("%s : Failed to allocate memory\n", __func__); + opal_free_sg_list(first); + return NULL; +} + +void opal_free_sg_list(struct opal_sg_list *sg) +{ + while (sg) { + uint64_t next = be64_to_cpu(sg->next); + + kfree(sg); + + if (next) + sg = __va(next); + else + sg = NULL; + } +} + +int opal_error_code(int rc) +{ + switch (rc) { + case OPAL_SUCCESS: return 0; + + case OPAL_PARAMETER: return -EINVAL; + case OPAL_ASYNC_COMPLETION: return -EINPROGRESS; + case OPAL_BUSY_EVENT: return -EBUSY; + case OPAL_NO_MEM: return -ENOMEM; + + case OPAL_UNSUPPORTED: return -EIO; + case OPAL_HARDWARE: return -EIO; + case OPAL_INTERNAL_ERROR: return -EIO; + default: + pr_err("%s: unexpected OPAL error %d\n", __func__, rc); + return -EIO; + } +} + +EXPORT_SYMBOL_GPL(opal_poll_events); +EXPORT_SYMBOL_GPL(opal_rtc_read); +EXPORT_SYMBOL_GPL(opal_rtc_write); +EXPORT_SYMBOL_GPL(opal_tpo_read); +EXPORT_SYMBOL_GPL(opal_tpo_write); +EXPORT_SYMBOL_GPL(opal_i2c_request); diff --git a/kernel/arch/powerpc/platforms/powernv/pci-ioda.c b/kernel/arch/powerpc/platforms/powernv/pci-ioda.c new file mode 100644 index 000000000..f8bc950ef --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/pci-ioda.c @@ -0,0 +1,2871 @@ +/* + * Support PCI/PCIe on PowerNV platforms + * + * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/crash_dump.h> +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/irq.h> +#include <linux/io.h> +#include <linux/msi.h> +#include <linux/memblock.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/msi_bitmap.h> +#include <asm/ppc-pci.h> +#include <asm/opal.h> +#include <asm/iommu.h> +#include <asm/tce.h> +#include <asm/xics.h> +#include <asm/debug.h> +#include <asm/firmware.h> +#include <asm/pnv-pci.h> + +#include <misc/cxl.h> + +#include "powernv.h" +#include "pci.h" + +/* 256M DMA window, 4K TCE pages, 8 bytes TCE */ +#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) + +static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + char pfix[32]; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + if (pe->flags & PNV_IODA_PE_DEV) + strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); + else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) + sprintf(pfix, "%04x:%02x ", + pci_domain_nr(pe->pbus), pe->pbus->number); +#ifdef CONFIG_PCI_IOV + else if (pe->flags & PNV_IODA_PE_VF) + sprintf(pfix, "%04x:%02x:%2x.%d", + pci_domain_nr(pe->parent_dev->bus), + (pe->rid & 0xff00) >> 8, + PCI_SLOT(pe->rid), PCI_FUNC(pe->rid)); +#endif /* CONFIG_PCI_IOV*/ + + printk("%spci %s: [PE# %.3d] %pV", + level, pfix, pe->pe_number, &vaf); + + va_end(args); +} + +#define pe_err(pe, fmt, ...) \ + pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__) +#define pe_warn(pe, fmt, ...) \ + pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__) +#define pe_info(pe, fmt, ...) \ + pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) + +static bool pnv_iommu_bypass_disabled __read_mostly; + +static int __init iommu_setup(char *str) +{ + if (!str) + return -EINVAL; + + while (*str) { + if (!strncmp(str, "nobypass", 8)) { + pnv_iommu_bypass_disabled = true; + pr_info("PowerNV: IOMMU bypass window disabled.\n"); + break; + } + str += strcspn(str, ","); + if (*str == ',') + str++; + } + + return 0; +} +early_param("iommu", iommu_setup); + +/* + * stdcix is only supposed to be used in hypervisor real mode as per + * the architecture spec + */ +static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr) +{ + __asm__ __volatile__("stdcix %0,0,%1" + : : "r" (val), "r" (paddr) : "memory"); +} + +static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) +{ + return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) == + (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); +} + +static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) +{ + if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) { + pr_warn("%s: Invalid PE %d on PHB#%x\n", + __func__, pe_no, phb->hose->global_number); + return; + } + + if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) { + pr_warn("%s: PE %d was assigned on PHB#%x\n", + __func__, pe_no, phb->hose->global_number); + return; + } + + phb->ioda.pe_array[pe_no].phb = phb; + phb->ioda.pe_array[pe_no].pe_number = pe_no; +} + +static int pnv_ioda_alloc_pe(struct pnv_phb *phb) +{ + unsigned long pe; + + do { + pe = find_next_zero_bit(phb->ioda.pe_alloc, + phb->ioda.total_pe, 0); + if (pe >= phb->ioda.total_pe) + return IODA_INVALID_PE; + } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); + + phb->ioda.pe_array[pe].phb = phb; + phb->ioda.pe_array[pe].pe_number = pe; + return pe; +} + +static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) +{ + WARN_ON(phb->ioda.pe_array[pe].pdev); + + memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); + clear_bit(pe, phb->ioda.pe_alloc); +} + +/* The default M64 BAR is shared by all PEs */ +static int pnv_ioda2_init_m64(struct pnv_phb *phb) +{ + const char *desc; + struct resource *r; + s64 rc; + + /* Configure the default M64 BAR */ + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + phb->ioda.m64_bar_idx, + phb->ioda.m64_base, + 0, /* unused */ + phb->ioda.m64_size); + if (rc != OPAL_SUCCESS) { + desc = "configuring"; + goto fail; + } + + /* Enable the default M64 BAR */ + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + phb->ioda.m64_bar_idx, + OPAL_ENABLE_M64_SPLIT); + if (rc != OPAL_SUCCESS) { + desc = "enabling"; + goto fail; + } + + /* Mark the M64 BAR assigned */ + set_bit(phb->ioda.m64_bar_idx, &phb->ioda.m64_bar_alloc); + + /* + * Strip off the segment used by the reserved PE, which is + * expected to be 0 or last one of PE capabicity. + */ + r = &phb->hose->mem_resources[1]; + if (phb->ioda.reserved_pe == 0) + r->start += phb->ioda.m64_segsize; + else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1)) + r->end -= phb->ioda.m64_segsize; + else + pr_warn(" Cannot strip M64 segment for reserved PE#%d\n", + phb->ioda.reserved_pe); + + return 0; + +fail: + pr_warn(" Failure %lld %s M64 BAR#%d\n", + rc, desc, phb->ioda.m64_bar_idx); + opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + phb->ioda.m64_bar_idx, + OPAL_DISABLE_M64); + return -EIO; +} + +static void pnv_ioda2_reserve_m64_pe(struct pnv_phb *phb) +{ + resource_size_t sgsz = phb->ioda.m64_segsize; + struct pci_dev *pdev; + struct resource *r; + int base, step, i; + + /* + * Root bus always has full M64 range and root port has + * M64 range used in reality. So we're checking root port + * instead of root bus. + */ + list_for_each_entry(pdev, &phb->hose->bus->devices, bus_list) { + for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) { + r = &pdev->resource[PCI_BRIDGE_RESOURCES + i]; + if (!r->parent || + !pnv_pci_is_mem_pref_64(r->flags)) + continue; + + base = (r->start - phb->ioda.m64_base) / sgsz; + for (step = 0; step < resource_size(r) / sgsz; step++) + pnv_ioda_reserve_pe(phb, base + step); + } + } +} + +static int pnv_ioda2_pick_m64_pe(struct pnv_phb *phb, + struct pci_bus *bus, int all) +{ + resource_size_t segsz = phb->ioda.m64_segsize; + struct pci_dev *pdev; + struct resource *r; + struct pnv_ioda_pe *master_pe, *pe; + unsigned long size, *pe_alloc; + bool found; + int start, i, j; + + /* Root bus shouldn't use M64 */ + if (pci_is_root_bus(bus)) + return IODA_INVALID_PE; + + /* We support only one M64 window on each bus */ + found = false; + pci_bus_for_each_resource(bus, r, i) { + if (r && r->parent && + pnv_pci_is_mem_pref_64(r->flags)) { + found = true; + break; + } + } + + /* No M64 window found ? */ + if (!found) + return IODA_INVALID_PE; + + /* Allocate bitmap */ + size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); + pe_alloc = kzalloc(size, GFP_KERNEL); + if (!pe_alloc) { + pr_warn("%s: Out of memory !\n", + __func__); + return IODA_INVALID_PE; + } + + /* + * Figure out reserved PE numbers by the PE + * the its child PEs. + */ + start = (r->start - phb->ioda.m64_base) / segsz; + for (i = 0; i < resource_size(r) / segsz; i++) + set_bit(start + i, pe_alloc); + + if (all) + goto done; + + /* + * If the PE doesn't cover all subordinate buses, + * we need subtract from reserved PEs for children. + */ + list_for_each_entry(pdev, &bus->devices, bus_list) { + if (!pdev->subordinate) + continue; + + pci_bus_for_each_resource(pdev->subordinate, r, i) { + if (!r || !r->parent || + !pnv_pci_is_mem_pref_64(r->flags)) + continue; + + start = (r->start - phb->ioda.m64_base) / segsz; + for (j = 0; j < resource_size(r) / segsz ; j++) + clear_bit(start + j, pe_alloc); + } + } + + /* + * the current bus might not own M64 window and that's all + * contributed by its child buses. For the case, we needn't + * pick M64 dependent PE#. + */ + if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) { + kfree(pe_alloc); + return IODA_INVALID_PE; + } + + /* + * Figure out the master PE and put all slave PEs to master + * PE's list to form compound PE. + */ +done: + master_pe = NULL; + i = -1; + while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) < + phb->ioda.total_pe) { + pe = &phb->ioda.pe_array[i]; + + if (!master_pe) { + pe->flags |= PNV_IODA_PE_MASTER; + INIT_LIST_HEAD(&pe->slaves); + master_pe = pe; + } else { + pe->flags |= PNV_IODA_PE_SLAVE; + pe->master = master_pe; + list_add_tail(&pe->list, &master_pe->slaves); + } + } + + kfree(pe_alloc); + return master_pe->pe_number; +} + +static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) +{ + struct pci_controller *hose = phb->hose; + struct device_node *dn = hose->dn; + struct resource *res; + const u32 *r; + u64 pci_addr; + + /* FIXME: Support M64 for P7IOC */ + if (phb->type != PNV_PHB_IODA2) { + pr_info(" Not support M64 window\n"); + return; + } + + if (!firmware_has_feature(FW_FEATURE_OPALv3)) { + pr_info(" Firmware too old to support M64 window\n"); + return; + } + + r = of_get_property(dn, "ibm,opal-m64-window", NULL); + if (!r) { + pr_info(" No <ibm,opal-m64-window> on %s\n", + dn->full_name); + return; + } + + res = &hose->mem_resources[1]; + res->start = of_translate_address(dn, r + 2); + res->end = res->start + of_read_number(r + 4, 2) - 1; + res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); + pci_addr = of_read_number(r, 2); + hose->mem_offset[1] = res->start - pci_addr; + + phb->ioda.m64_size = resource_size(res); + phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe; + phb->ioda.m64_base = pci_addr; + + pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n", + res->start, res->end, pci_addr); + + /* Use last M64 BAR to cover M64 window */ + phb->ioda.m64_bar_idx = 15; + phb->init_m64 = pnv_ioda2_init_m64; + phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe; + phb->pick_m64_pe = pnv_ioda2_pick_m64_pe; +} + +static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no) +{ + struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no]; + struct pnv_ioda_pe *slave; + s64 rc; + + /* Fetch master PE */ + if (pe->flags & PNV_IODA_PE_SLAVE) { + pe = pe->master; + if (WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER))) + return; + + pe_no = pe->pe_number; + } + + /* Freeze master PE */ + rc = opal_pci_eeh_freeze_set(phb->opal_id, + pe_no, + OPAL_EEH_ACTION_SET_FREEZE_ALL); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n", + __func__, rc, phb->hose->global_number, pe_no); + return; + } + + /* Freeze slave PEs */ + if (!(pe->flags & PNV_IODA_PE_MASTER)) + return; + + list_for_each_entry(slave, &pe->slaves, list) { + rc = opal_pci_eeh_freeze_set(phb->opal_id, + slave->pe_number, + OPAL_EEH_ACTION_SET_FREEZE_ALL); + if (rc != OPAL_SUCCESS) + pr_warn("%s: Failure %lld freezing PHB#%x-PE#%x\n", + __func__, rc, phb->hose->global_number, + slave->pe_number); + } +} + +static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt) +{ + struct pnv_ioda_pe *pe, *slave; + s64 rc; + + /* Find master PE */ + pe = &phb->ioda.pe_array[pe_no]; + if (pe->flags & PNV_IODA_PE_SLAVE) { + pe = pe->master; + WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)); + pe_no = pe->pe_number; + } + + /* Clear frozen state for master PE */ + rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no, opt); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n", + __func__, rc, opt, phb->hose->global_number, pe_no); + return -EIO; + } + + if (!(pe->flags & PNV_IODA_PE_MASTER)) + return 0; + + /* Clear frozen state for slave PEs */ + list_for_each_entry(slave, &pe->slaves, list) { + rc = opal_pci_eeh_freeze_clear(phb->opal_id, + slave->pe_number, + opt); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld clear %d on PHB#%x-PE#%x\n", + __func__, rc, opt, phb->hose->global_number, + slave->pe_number); + return -EIO; + } + } + + return 0; +} + +static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) +{ + struct pnv_ioda_pe *slave, *pe; + u8 fstate, state; + __be16 pcierr; + s64 rc; + + /* Sanity check on PE number */ + if (pe_no < 0 || pe_no >= phb->ioda.total_pe) + return OPAL_EEH_STOPPED_PERM_UNAVAIL; + + /* + * Fetch the master PE and the PE instance might be + * not initialized yet. + */ + pe = &phb->ioda.pe_array[pe_no]; + if (pe->flags & PNV_IODA_PE_SLAVE) { + pe = pe->master; + WARN_ON(!pe || !(pe->flags & PNV_IODA_PE_MASTER)); + pe_no = pe->pe_number; + } + + /* Check the master PE */ + rc = opal_pci_eeh_freeze_status(phb->opal_id, pe_no, + &state, &pcierr, NULL); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld getting " + "PHB#%x-PE#%x state\n", + __func__, rc, + phb->hose->global_number, pe_no); + return OPAL_EEH_STOPPED_TEMP_UNAVAIL; + } + + /* Check the slave PE */ + if (!(pe->flags & PNV_IODA_PE_MASTER)) + return state; + + list_for_each_entry(slave, &pe->slaves, list) { + rc = opal_pci_eeh_freeze_status(phb->opal_id, + slave->pe_number, + &fstate, + &pcierr, + NULL); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld getting " + "PHB#%x-PE#%x state\n", + __func__, rc, + phb->hose->global_number, slave->pe_number); + return OPAL_EEH_STOPPED_TEMP_UNAVAIL; + } + + /* + * Override the result based on the ascending + * priority. + */ + if (fstate > state) + state = fstate; + } + + return state; +} + +/* Currently those 2 are only used when MSIs are enabled, this will change + * but in the meantime, we need to protect them to avoid warnings + */ +#ifdef CONFIG_PCI_MSI +static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pci_get_pdn(dev); + + if (!pdn) + return NULL; + if (pdn->pe_number == IODA_INVALID_PE) + return NULL; + return &phb->ioda.pe_array[pdn->pe_number]; +} +#endif /* CONFIG_PCI_MSI */ + +static int pnv_ioda_set_one_peltv(struct pnv_phb *phb, + struct pnv_ioda_pe *parent, + struct pnv_ioda_pe *child, + bool is_add) +{ + const char *desc = is_add ? "adding" : "removing"; + uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN : + OPAL_REMOVE_PE_FROM_DOMAIN; + struct pnv_ioda_pe *slave; + long rc; + + /* Parent PE affects child PE */ + rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, + child->pe_number, op); + if (rc != OPAL_SUCCESS) { + pe_warn(child, "OPAL error %ld %s to parent PELTV\n", + rc, desc); + return -ENXIO; + } + + if (!(child->flags & PNV_IODA_PE_MASTER)) + return 0; + + /* Compound case: parent PE affects slave PEs */ + list_for_each_entry(slave, &child->slaves, list) { + rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number, + slave->pe_number, op); + if (rc != OPAL_SUCCESS) { + pe_warn(slave, "OPAL error %ld %s to parent PELTV\n", + rc, desc); + return -ENXIO; + } + } + + return 0; +} + +static int pnv_ioda_set_peltv(struct pnv_phb *phb, + struct pnv_ioda_pe *pe, + bool is_add) +{ + struct pnv_ioda_pe *slave; + struct pci_dev *pdev = NULL; + int ret; + + /* + * Clear PE frozen state. If it's master PE, we need + * clear slave PE frozen state as well. + */ + if (is_add) { + opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + if (pe->flags & PNV_IODA_PE_MASTER) { + list_for_each_entry(slave, &pe->slaves, list) + opal_pci_eeh_freeze_clear(phb->opal_id, + slave->pe_number, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + } + } + + /* + * Associate PE in PELT. We need add the PE into the + * corresponding PELT-V as well. Otherwise, the error + * originated from the PE might contribute to other + * PEs. + */ + ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add); + if (ret) + return ret; + + /* For compound PEs, any one affects all of them */ + if (pe->flags & PNV_IODA_PE_MASTER) { + list_for_each_entry(slave, &pe->slaves, list) { + ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add); + if (ret) + return ret; + } + } + + if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS)) + pdev = pe->pbus->self; + else if (pe->flags & PNV_IODA_PE_DEV) + pdev = pe->pdev->bus->self; +#ifdef CONFIG_PCI_IOV + else if (pe->flags & PNV_IODA_PE_VF) + pdev = pe->parent_dev->bus->self; +#endif /* CONFIG_PCI_IOV */ + while (pdev) { + struct pci_dn *pdn = pci_get_pdn(pdev); + struct pnv_ioda_pe *parent; + + if (pdn && pdn->pe_number != IODA_INVALID_PE) { + parent = &phb->ioda.pe_array[pdn->pe_number]; + ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add); + if (ret) + return ret; + } + + pdev = pdev->bus->self; + } + + return 0; +} + +#ifdef CONFIG_PCI_IOV +static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +{ + struct pci_dev *parent; + uint8_t bcomp, dcomp, fcomp; + int64_t rc; + long rid_end, rid; + + /* Currently, we just deconfigure VF PE. Bus PE will always there.*/ + if (pe->pbus) { + int count; + + dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; + fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; + parent = pe->pbus->self; + if (pe->flags & PNV_IODA_PE_BUS_ALL) + count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + else + count = 1; + + switch(count) { + case 1: bcomp = OpalPciBusAll; break; + case 2: bcomp = OpalPciBus7Bits; break; + case 4: bcomp = OpalPciBus6Bits; break; + case 8: bcomp = OpalPciBus5Bits; break; + case 16: bcomp = OpalPciBus4Bits; break; + case 32: bcomp = OpalPciBus3Bits; break; + default: + dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", + count); + /* Do an exact match only */ + bcomp = OpalPciBusAll; + } + rid_end = pe->rid + (count << 8); + } else { + if (pe->flags & PNV_IODA_PE_VF) + parent = pe->parent_dev; + else + parent = pe->pdev->bus->self; + bcomp = OpalPciBusAll; + dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; + fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; + rid_end = pe->rid + 1; + } + + /* Clear the reverse map */ + for (rid = pe->rid; rid < rid_end; rid++) + phb->ioda.pe_rmap[rid] = 0; + + /* Release from all parents PELT-V */ + while (parent) { + struct pci_dn *pdn = pci_get_pdn(parent); + if (pdn && pdn->pe_number != IODA_INVALID_PE) { + rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, + pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); + /* XXX What to do in case of error ? */ + } + parent = parent->bus->self; + } + + opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + /* Disassociate PE in PELT */ + rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, + pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); + if (rc) + pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); + rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, + bcomp, dcomp, fcomp, OPAL_UNMAP_PE); + if (rc) + pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); + + pe->pbus = NULL; + pe->pdev = NULL; + pe->parent_dev = NULL; + + return 0; +} +#endif /* CONFIG_PCI_IOV */ + +static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +{ + struct pci_dev *parent; + uint8_t bcomp, dcomp, fcomp; + long rc, rid_end, rid; + + /* Bus validation ? */ + if (pe->pbus) { + int count; + + dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; + fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; + parent = pe->pbus->self; + if (pe->flags & PNV_IODA_PE_BUS_ALL) + count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + else + count = 1; + + switch(count) { + case 1: bcomp = OpalPciBusAll; break; + case 2: bcomp = OpalPciBus7Bits; break; + case 4: bcomp = OpalPciBus6Bits; break; + case 8: bcomp = OpalPciBus5Bits; break; + case 16: bcomp = OpalPciBus4Bits; break; + case 32: bcomp = OpalPciBus3Bits; break; + default: + dev_err(&pe->pbus->dev, "Number of subordinate buses %d unsupported\n", + count); + /* Do an exact match only */ + bcomp = OpalPciBusAll; + } + rid_end = pe->rid + (count << 8); + } else { +#ifdef CONFIG_PCI_IOV + if (pe->flags & PNV_IODA_PE_VF) + parent = pe->parent_dev; + else +#endif /* CONFIG_PCI_IOV */ + parent = pe->pdev->bus->self; + bcomp = OpalPciBusAll; + dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; + fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; + rid_end = pe->rid + 1; + } + + /* + * Associate PE in PELT. We need add the PE into the + * corresponding PELT-V as well. Otherwise, the error + * originated from the PE might contribute to other + * PEs. + */ + rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, + bcomp, dcomp, fcomp, OPAL_MAP_PE); + if (rc) { + pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); + return -ENXIO; + } + + /* Configure PELTV */ + pnv_ioda_set_peltv(phb, pe, true); + + /* Setup reverse map */ + for (rid = pe->rid; rid < rid_end; rid++) + phb->ioda.pe_rmap[rid] = pe->pe_number; + + /* Setup one MVTs on IODA1 */ + if (phb->type != PNV_PHB_IODA1) { + pe->mve_number = 0; + goto out; + } + + pe->mve_number = pe->pe_number; + rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number); + if (rc != OPAL_SUCCESS) { + pe_err(pe, "OPAL error %ld setting up MVE %d\n", + rc, pe->mve_number); + pe->mve_number = -1; + } else { + rc = opal_pci_set_mve_enable(phb->opal_id, + pe->mve_number, OPAL_ENABLE_MVE); + if (rc) { + pe_err(pe, "OPAL error %ld enabling MVE %d\n", + rc, pe->mve_number); + pe->mve_number = -1; + } + } + +out: + return 0; +} + +static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + struct pnv_ioda_pe *lpe; + + list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) { + if (lpe->dma_weight < pe->dma_weight) { + list_add_tail(&pe->dma_link, &lpe->dma_link); + return; + } + } + list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list); +} + +static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) +{ + /* This is quite simplistic. The "base" weight of a device + * is 10. 0 means no DMA is to be accounted for it. + */ + + /* If it's a bridge, no DMA */ + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return 0; + + /* Reduce the weight of slow USB controllers */ + if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || + dev->class == PCI_CLASS_SERIAL_USB_OHCI || + dev->class == PCI_CLASS_SERIAL_USB_EHCI) + return 3; + + /* Increase the weight of RAID (includes Obsidian) */ + if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) + return 15; + + /* Default */ + return 10; +} + +#ifdef CONFIG_PCI_IOV +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) +{ + struct pci_dn *pdn = pci_get_pdn(dev); + int i; + struct resource *res, res2; + resource_size_t size; + u16 num_vfs; + + if (!dev->is_physfn) + return -EINVAL; + + /* + * "offset" is in VFs. The M64 windows are sized so that when they + * are segmented, each segment is the same size as the IOV BAR. + * Each segment is in a separate PE, and the high order bits of the + * address are the PE number. Therefore, each VF's BAR is in a + * separate PE, and changing the IOV BAR start address changes the + * range of PEs the VFs are in. + */ + num_vfs = pdn->num_vfs; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + if (!pnv_pci_is_mem_pref_64(res->flags)) + continue; + + /* + * The actual IOV BAR range is determined by the start address + * and the actual size for num_vfs VFs BAR. This check is to + * make sure that after shifting, the range will not overlap + * with another device. + */ + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2.flags = res->flags; + res2.start = res->start + (size * offset); + res2.end = res2.start + (size * num_vfs) - 1; + + if (res2.end > res->end) { + dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", + i, &res2, res, num_vfs, offset); + return -EBUSY; + } + } + + /* + * After doing so, there would be a "hole" in the /proc/iomem when + * offset is a positive value. It looks like the device return some + * mmio back to the system, which actually no one could use it. + */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + if (!pnv_pci_is_mem_pref_64(res->flags)) + continue; + + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2 = *res; + res->start += size * offset; + + dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (enabling %d VFs shifted by %d)\n", + i, &res2, res, num_vfs, offset); + pci_update_resource(dev, i + PCI_IOV_RESOURCES); + } + return 0; +} +#endif /* CONFIG_PCI_IOV */ + +#if 0 +static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pci_get_pdn(dev); + struct pnv_ioda_pe *pe; + int pe_num; + + if (!pdn) { + pr_err("%s: Device tree node not associated properly\n", + pci_name(dev)); + return NULL; + } + if (pdn->pe_number != IODA_INVALID_PE) + return NULL; + + /* PE#0 has been pre-set */ + if (dev->bus->number == 0) + pe_num = 0; + else + pe_num = pnv_ioda_alloc_pe(phb); + if (pe_num == IODA_INVALID_PE) { + pr_warning("%s: Not enough PE# available, disabling device\n", + pci_name(dev)); + return NULL; + } + + /* NOTE: We get only one ref to the pci_dev for the pdn, not for the + * pointer in the PE data structure, both should be destroyed at the + * same time. However, this needs to be looked at more closely again + * once we actually start removing things (Hotplug, SR-IOV, ...) + * + * At some point we want to remove the PDN completely anyways + */ + pe = &phb->ioda.pe_array[pe_num]; + pci_dev_get(dev); + pdn->pcidev = dev; + pdn->pe_number = pe_num; + pe->pdev = dev; + pe->pbus = NULL; + pe->tce32_seg = -1; + pe->mve_number = -1; + pe->rid = dev->bus->number << 8 | pdn->devfn; + + pe_info(pe, "Associated device to PE\n"); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + if (pe_num) + pnv_ioda_free_pe(phb, pe_num); + pdn->pe_number = IODA_INVALID_PE; + pe->pdev = NULL; + pci_dev_put(dev); + return NULL; + } + + /* Assign a DMA weight to the device */ + pe->dma_weight = pnv_ioda_dma_weight(dev); + if (pe->dma_weight != 0) { + phb->ioda.dma_weight += pe->dma_weight; + phb->ioda.dma_pe_count++; + } + + /* Link the PE */ + pnv_ioda_link_pe_by_weight(phb, pe); + + return pe; +} +#endif /* Useful for SRIOV case */ + +static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + struct pci_dn *pdn = pci_get_pdn(dev); + + if (pdn == NULL) { + pr_warn("%s: No device node associated with device !\n", + pci_name(dev)); + continue; + } + pdn->pe_number = pe->pe_number; + pe->dma_weight += pnv_ioda_dma_weight(dev); + if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) + pnv_ioda_setup_same_PE(dev->subordinate, pe); + } +} + +/* + * There're 2 types of PCI bus sensitive PEs: One that is compromised of + * single PCI bus. Another one that contains the primary PCI bus and its + * subordinate PCI devices and buses. The second type of PE is normally + * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports. + */ +static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pnv_phb *phb = hose->private_data; + struct pnv_ioda_pe *pe; + int pe_num = IODA_INVALID_PE; + + /* Check if PE is determined by M64 */ + if (phb->pick_m64_pe) + pe_num = phb->pick_m64_pe(phb, bus, all); + + /* The PE number isn't pinned by M64 */ + if (pe_num == IODA_INVALID_PE) + pe_num = pnv_ioda_alloc_pe(phb); + + if (pe_num == IODA_INVALID_PE) { + pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", + __func__, pci_domain_nr(bus), bus->number); + return; + } + + pe = &phb->ioda.pe_array[pe_num]; + pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); + pe->pbus = bus; + pe->pdev = NULL; + pe->tce32_seg = -1; + pe->mve_number = -1; + pe->rid = bus->busn_res.start << 8; + pe->dma_weight = 0; + + if (all) + pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n", + bus->busn_res.start, bus->busn_res.end, pe_num); + else + pe_info(pe, "Secondary bus %d associated with PE#%d\n", + bus->busn_res.start, pe_num); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + if (pe_num) + pnv_ioda_free_pe(phb, pe_num); + pe->pbus = NULL; + return; + } + + pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), + GFP_KERNEL, hose->node); + pe->tce32_table->data = pe; + + /* Associate it with all child devices */ + pnv_ioda_setup_same_PE(bus, pe); + + /* Put PE to the list */ + list_add_tail(&pe->list, &phb->ioda.pe_list); + + /* Account for one DMA PE if at least one DMA capable device exist + * below the bridge + */ + if (pe->dma_weight != 0) { + phb->ioda.dma_weight += pe->dma_weight; + phb->ioda.dma_pe_count++; + } + + /* Link the PE */ + pnv_ioda_link_pe_by_weight(phb, pe); +} + +static void pnv_ioda_setup_PEs(struct pci_bus *bus) +{ + struct pci_dev *dev; + + pnv_ioda_setup_bus_PE(bus, 0); + + list_for_each_entry(dev, &bus->devices, bus_list) { + if (dev->subordinate) { + if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE) + pnv_ioda_setup_bus_PE(dev->subordinate, 1); + else + pnv_ioda_setup_PEs(dev->subordinate); + } + } +} + +/* + * Configure PEs so that the downstream PCI buses and devices + * could have their associated PE#. Unfortunately, we didn't + * figure out the way to identify the PLX bridge yet. So we + * simply put the PCI bus and the subordinate behind the root + * port to PE# here. The game rule here is expected to be changed + * as soon as we can detected PLX bridge correctly. + */ +static void pnv_pci_ioda_setup_PEs(void) +{ + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + phb = hose->private_data; + + /* M64 layout might affect PE allocation */ + if (phb->reserve_m64_pe) + phb->reserve_m64_pe(phb); + + pnv_ioda_setup_PEs(hose->bus); + } +} + +#ifdef CONFIG_PCI_IOV +static int pnv_pci_vf_release_m64(struct pci_dev *pdev) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + int i, j; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) + for (j = 0; j < M64_PER_IOV; j++) { + if (pdn->m64_wins[i][j] == IODA_INVALID_M64) + continue; + opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0); + clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc); + pdn->m64_wins[i][j] = IODA_INVALID_M64; + } + + return 0; +} + +static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + unsigned int win; + struct resource *res; + int i, j; + int64_t rc; + int total_vfs; + resource_size_t size, start; + int pe_num; + int vf_groups; + int vf_per_group; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + total_vfs = pci_sriov_get_totalvfs(pdev); + + /* Initialize the m64_wins to IODA_INVALID_M64 */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) + for (j = 0; j < M64_PER_IOV; j++) + pdn->m64_wins[i][j] = IODA_INVALID_M64; + + if (pdn->m64_per_iov == M64_PER_IOV) { + vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV; + vf_per_group = (num_vfs <= M64_PER_IOV)? 1: + roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; + } else { + vf_groups = 1; + vf_per_group = 1; + } + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + if (!pnv_pci_is_mem_pref_64(res->flags)) + continue; + + for (j = 0; j < vf_groups; j++) { + do { + win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, + phb->ioda.m64_bar_idx + 1, 0); + + if (win >= phb->ioda.m64_bar_idx + 1) + goto m64_failed; + } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); + + pdn->m64_wins[i][j] = win; + + if (pdn->m64_per_iov == M64_PER_IOV) { + size = pci_iov_resource_size(pdev, + PCI_IOV_RESOURCES + i); + size = size * vf_per_group; + start = res->start + size * j; + } else { + size = resource_size(res); + start = res->start; + } + + /* Map the M64 here */ + if (pdn->m64_per_iov == M64_PER_IOV) { + pe_num = pdn->offset + j; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe_num, OPAL_M64_WINDOW_TYPE, + pdn->m64_wins[i][j], 0); + } + + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + pdn->m64_wins[i][j], + start, + 0, /* unused */ + size); + + + if (rc != OPAL_SUCCESS) { + dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n", + win, rc); + goto m64_failed; + } + + if (pdn->m64_per_iov == M64_PER_IOV) + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2); + else + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1); + + if (rc != OPAL_SUCCESS) { + dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", + win, rc); + goto m64_failed; + } + } + } + return 0; + +m64_failed: + pnv_pci_vf_release_m64(pdev); + return -EBUSY; +} + +static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct iommu_table *tbl; + unsigned long addr; + int64_t rc; + + bus = dev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + tbl = pe->tce32_table; + addr = tbl->it_base; + + opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, + pe->pe_number << 1, 1, __pa(addr), + 0, 0x1000); + + rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, + pe->pe_number, + (pe->pe_number << 1) + 1, + pe->tce_bypass_base, + 0); + if (rc) + pe_warn(pe, "OPAL error %ld release DMA window\n", rc); + + iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); + free_pages(addr, get_order(TCE32_TABLE_SIZE)); + pe->tce32_table = NULL; +} + +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe, *pe_n; + struct pci_dn *pdn; + u16 vf_index; + int64_t rc; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + if (!pdev->is_physfn) + return; + + if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { + int vf_group; + int vf_per_group; + int vf_index1; + + vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; + + for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) + for (vf_index = vf_group * vf_per_group; + vf_index < (vf_group + 1) * vf_per_group && + vf_index < num_vfs; + vf_index++) + for (vf_index1 = vf_group * vf_per_group; + vf_index1 < (vf_group + 1) * vf_per_group && + vf_index1 < num_vfs; + vf_index1++){ + + rc = opal_pci_set_peltv(phb->opal_id, + pdn->offset + vf_index, + pdn->offset + vf_index1, + OPAL_REMOVE_PE_FROM_DOMAIN); + + if (rc) + dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n", + __func__, + pdn->offset + vf_index1, rc); + } + } + + list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { + if (pe->parent_dev != pdev) + continue; + + pnv_pci_ioda2_release_dma_pe(pdev, pe); + + /* Remove from list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_del(&pe->list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + pnv_ioda_deconfigure_pe(phb, pe); + + pnv_ioda_free_pe(phb, pe->pe_number); + } +} + +void pnv_pci_sriov_disable(struct pci_dev *pdev) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + struct pci_sriov *iov; + u16 num_vfs; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + iov = pdev->sriov; + num_vfs = pdn->num_vfs; + + /* Release VF PEs */ + pnv_ioda_release_vf_PE(pdev, num_vfs); + + if (phb->type == PNV_PHB_IODA2) { + if (pdn->m64_per_iov == 1) + pnv_pci_vf_resource_shift(pdev, -pdn->offset); + + /* Release M64 windows */ + pnv_pci_vf_release_m64(pdev); + + /* Release PE numbers */ + bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); + pdn->offset = 0; + } +} + +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe); +static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; + int pe_num; + u16 vf_index; + struct pci_dn *pdn; + int64_t rc; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + if (!pdev->is_physfn) + return; + + /* Reserve PE for each VF */ + for (vf_index = 0; vf_index < num_vfs; vf_index++) { + pe_num = pdn->offset + vf_index; + + pe = &phb->ioda.pe_array[pe_num]; + pe->pe_number = pe_num; + pe->phb = phb; + pe->flags = PNV_IODA_PE_VF; + pe->pbus = NULL; + pe->parent_dev = pdev; + pe->tce32_seg = -1; + pe->mve_number = -1; + pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | + pci_iov_virtfn_devfn(pdev, vf_index); + + pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n", + hose->global_number, pdev->bus->number, + PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)), + PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + if (pe_num) + pnv_ioda_free_pe(phb, pe_num); + pe->pdev = NULL; + continue; + } + + pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), + GFP_KERNEL, hose->node); + pe->tce32_table->data = pe; + + /* Put PE to the list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_add_tail(&pe->list, &phb->ioda.pe_list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + pnv_pci_ioda2_setup_dma_pe(phb, pe); + } + + if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) { + int vf_group; + int vf_per_group; + int vf_index1; + + vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov; + + for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) { + for (vf_index = vf_group * vf_per_group; + vf_index < (vf_group + 1) * vf_per_group && + vf_index < num_vfs; + vf_index++) { + for (vf_index1 = vf_group * vf_per_group; + vf_index1 < (vf_group + 1) * vf_per_group && + vf_index1 < num_vfs; + vf_index1++) { + + rc = opal_pci_set_peltv(phb->opal_id, + pdn->offset + vf_index, + pdn->offset + vf_index1, + OPAL_ADD_PE_TO_DOMAIN); + + if (rc) + dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n", + __func__, + pdn->offset + vf_index1, rc); + } + } + } + } +} + +int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + struct pci_bus *bus; + struct pci_controller *hose; + struct pnv_phb *phb; + struct pci_dn *pdn; + int ret; + + bus = pdev->bus; + hose = pci_bus_to_host(bus); + phb = hose->private_data; + pdn = pci_get_pdn(pdev); + + if (phb->type == PNV_PHB_IODA2) { + /* Calculate available PE for required VFs */ + mutex_lock(&phb->ioda.pe_alloc_mutex); + pdn->offset = bitmap_find_next_zero_area( + phb->ioda.pe_alloc, phb->ioda.total_pe, + 0, num_vfs, 0); + if (pdn->offset >= phb->ioda.total_pe) { + mutex_unlock(&phb->ioda.pe_alloc_mutex); + dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); + pdn->offset = 0; + return -EBUSY; + } + bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs); + pdn->num_vfs = num_vfs; + mutex_unlock(&phb->ioda.pe_alloc_mutex); + + /* Assign M64 window accordingly */ + ret = pnv_pci_vf_assign_m64(pdev, num_vfs); + if (ret) { + dev_info(&pdev->dev, "Not enough M64 window resources\n"); + goto m64_failed; + } + + /* + * When using one M64 BAR to map one IOV BAR, we need to shift + * the IOV BAR according to the PE# allocated to the VFs. + * Otherwise, the PE# for the VF will conflict with others. + */ + if (pdn->m64_per_iov == 1) { + ret = pnv_pci_vf_resource_shift(pdev, pdn->offset); + if (ret) + goto m64_failed; + } + } + + /* Setup VF PEs */ + pnv_ioda_setup_vf_PE(pdev, num_vfs); + + return 0; + +m64_failed: + bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs); + pdn->offset = 0; + + return ret; +} + +int pcibios_sriov_disable(struct pci_dev *pdev) +{ + pnv_pci_sriov_disable(pdev); + + /* Release PCI data */ + remove_dev_pci_data(pdev); + return 0; +} + +int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + /* Allocate PCI data */ + add_dev_pci_data(pdev); + + pnv_pci_sriov_enable(pdev, num_vfs); + return 0; +} +#endif /* CONFIG_PCI_IOV */ + +static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) +{ + struct pci_dn *pdn = pci_get_pdn(pdev); + struct pnv_ioda_pe *pe; + + /* + * The function can be called while the PE# + * hasn't been assigned. Do nothing for the + * case. + */ + if (!pdn || pdn->pe_number == IODA_INVALID_PE) + return; + + pe = &phb->ioda.pe_array[pdn->pe_number]; + WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); + set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table); +} + +static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, + struct pci_dev *pdev, u64 dma_mask) +{ + struct pci_dn *pdn = pci_get_pdn(pdev); + struct pnv_ioda_pe *pe; + uint64_t top; + bool bypass = false; + + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return -ENODEV;; + + pe = &phb->ioda.pe_array[pdn->pe_number]; + if (pe->tce_bypass_enabled) { + top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; + bypass = (dma_mask >= top); + } + + if (bypass) { + dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); + set_dma_ops(&pdev->dev, &dma_direct_ops); + set_dma_offset(&pdev->dev, pe->tce_bypass_base); + } else { + dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); + set_dma_ops(&pdev->dev, &dma_iommu_ops); + set_iommu_table_base(&pdev->dev, pe->tce32_table); + } + *pdev->dev.dma_mask = dma_mask; + return 0; +} + +static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb, + struct pci_dev *pdev) +{ + struct pci_dn *pdn = pci_get_pdn(pdev); + struct pnv_ioda_pe *pe; + u64 end, mask; + + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return 0; + + pe = &phb->ioda.pe_array[pdn->pe_number]; + if (!pe->tce_bypass_enabled) + return __dma_get_required_mask(&pdev->dev); + + + end = pe->tce_bypass_base + memblock_end_of_DRAM(); + mask = 1ULL << (fls64(end) - 1); + mask += mask - 1; + + return mask; +} + +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, + struct pci_bus *bus, + bool add_to_iommu_group) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + if (add_to_iommu_group) + set_iommu_table_base_and_group(&dev->dev, + pe->tce32_table); + else + set_iommu_table_base(&dev->dev, pe->tce32_table); + + if (dev->subordinate) + pnv_ioda_setup_bus_dma(pe, dev->subordinate, + add_to_iommu_group); + } +} + +static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, + struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm) +{ + __be64 __iomem *invalidate = rm ? + (__be64 __iomem *)pe->tce_inval_reg_phys : + (__be64 __iomem *)tbl->it_index; + unsigned long start, end, inc; + const unsigned shift = tbl->it_page_shift; + + start = __pa(startp); + end = __pa(endp); + + /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ + if (tbl->it_busno) { + start <<= shift; + end <<= shift; + inc = 128ull << shift; + start |= tbl->it_busno; + end |= tbl->it_busno; + } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { + /* p7ioc-style invalidation, 2 TCEs per write */ + start |= (1ull << 63); + end |= (1ull << 63); + inc = 16; + } else { + /* Default (older HW) */ + inc = 128; + } + + end |= inc - 1; /* round up end to be different than start */ + + mb(); /* Ensure above stores are visible */ + while (start <= end) { + if (rm) + __raw_rm_writeq(cpu_to_be64(start), invalidate); + else + __raw_writeq(cpu_to_be64(start), invalidate); + start += inc; + } + + /* + * The iommu layer will do another mb() for us on build() + * and we don't care on free() + */ +} + +static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, + struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm) +{ + unsigned long start, end, inc; + __be64 __iomem *invalidate = rm ? + (__be64 __iomem *)pe->tce_inval_reg_phys : + (__be64 __iomem *)tbl->it_index; + const unsigned shift = tbl->it_page_shift; + + /* We'll invalidate DMA address in PE scope */ + start = 0x2ull << 60; + start |= (pe->pe_number & 0xFF); + end = start; + + /* Figure out the start, end and step */ + inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64)); + start |= (inc << shift); + inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64)); + end |= (inc << shift); + inc = (0x1ull << shift); + mb(); + + while (start <= end) { + if (rm) + __raw_rm_writeq(cpu_to_be64(start), invalidate); + else + __raw_writeq(cpu_to_be64(start), invalidate); + start += inc; + } +} + +void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm) +{ + struct pnv_ioda_pe *pe = tbl->data; + struct pnv_phb *phb = pe->phb; + + if (phb->type == PNV_PHB_IODA1) + pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); + else + pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm); +} + +static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe, unsigned int base, + unsigned int segs) +{ + + struct page *tce_mem = NULL; + const __be64 *swinvp; + struct iommu_table *tbl; + unsigned int i; + int64_t rc; + void *addr; + + /* XXX FIXME: Handle 64-bit only DMA devices */ + /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ + /* XXX FIXME: Allocate multi-level tables on PHB3 */ + + /* We shouldn't already have a 32-bit DMA associated */ + if (WARN_ON(pe->tce32_seg >= 0)) + return; + + /* Grab a 32-bit TCE table */ + pe->tce32_seg = base; + pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", + (base << 28), ((base + segs) << 28) - 1); + + /* XXX Currently, we allocate one big contiguous table for the + * TCEs. We only really need one chunk per 256M of TCE space + * (ie per segment) but that's an optimization for later, it + * requires some added smarts with our get/put_tce implementation + */ + tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, + get_order(TCE32_TABLE_SIZE * segs)); + if (!tce_mem) { + pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); + goto fail; + } + addr = page_address(tce_mem); + memset(addr, 0, TCE32_TABLE_SIZE * segs); + + /* Configure HW */ + for (i = 0; i < segs; i++) { + rc = opal_pci_map_pe_dma_window(phb->opal_id, + pe->pe_number, + base + i, 1, + __pa(addr) + TCE32_TABLE_SIZE * i, + TCE32_TABLE_SIZE, 0x1000); + if (rc) { + pe_err(pe, " Failed to configure 32-bit TCE table," + " err %ld\n", rc); + goto fail; + } + } + + /* Setup linux iommu table */ + tbl = pe->tce32_table; + pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, + base << 28, IOMMU_PAGE_SHIFT_4K); + + /* OPAL variant of P7IOC SW invalidated TCEs */ + swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); + if (swinvp) { + /* We need a couple more fields -- an address and a data + * to or. Since the bus is only printed out on table free + * errors, and on the first pass the data will be a relative + * bus number, print that out instead. + */ + pe->tce_inval_reg_phys = be64_to_cpup(swinvp); + tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, + 8); + tbl->it_type |= (TCE_PCI_SWINV_CREATE | + TCE_PCI_SWINV_FREE | + TCE_PCI_SWINV_PAIR); + } + iommu_init_table(tbl, phb->hose->node); + + if (pe->flags & PNV_IODA_PE_DEV) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); + set_iommu_table_base_and_group(&pe->pdev->dev, tbl); + } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); + pnv_ioda_setup_bus_dma(pe, pe->pbus, true); + } else if (pe->flags & PNV_IODA_PE_VF) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); + } + + return; + fail: + /* XXX Failure: Try to fallback to 64-bit only ? */ + if (pe->tce32_seg >= 0) + pe->tce32_seg = -1; + if (tce_mem) + __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); +} + +static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +{ + struct pnv_ioda_pe *pe = tbl->data; + uint16_t window_id = (pe->pe_number << 1 ) + 1; + int64_t rc; + + pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis"); + if (enable) { + phys_addr_t top = memblock_end_of_DRAM(); + + top = roundup_pow_of_two(top); + rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, + pe->pe_number, + window_id, + pe->tce_bypass_base, + top); + } else { + rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, + pe->pe_number, + window_id, + pe->tce_bypass_base, + 0); + + /* + * EEH needs the mapping between IOMMU table and group + * of those VFIO/KVM pass-through devices. We can postpone + * resetting DMA ops until the DMA mask is configured in + * host side. + */ + if (pe->pdev) + set_iommu_table_base(&pe->pdev->dev, tbl); + else + pnv_ioda_setup_bus_dma(pe, pe->pbus, false); + } + if (rc) + pe_err(pe, "OPAL error %lld configuring bypass window\n", rc); + else + pe->tce_bypass_enabled = enable; +} + +static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + /* TVE #1 is selected by PCI address bit 59 */ + pe->tce_bypass_base = 1ull << 59; + + /* Install set_bypass callback for VFIO */ + pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass; + + /* Enable bypass by default */ + pnv_pci_ioda2_set_bypass(pe->tce32_table, true); +} + +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + struct page *tce_mem = NULL; + void *addr; + const __be64 *swinvp; + struct iommu_table *tbl; + unsigned int tce_table_size, end; + int64_t rc; + + /* We shouldn't already have a 32-bit DMA associated */ + if (WARN_ON(pe->tce32_seg >= 0)) + return; + + /* The PE will reserve all possible 32-bits space */ + pe->tce32_seg = 0; + end = (1 << ilog2(phb->ioda.m32_pci_base)); + tce_table_size = (end / 0x1000) * 8; + pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", + end); + + /* Allocate TCE table */ + tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, + get_order(tce_table_size)); + if (!tce_mem) { + pe_err(pe, "Failed to allocate a 32-bit TCE memory\n"); + goto fail; + } + addr = page_address(tce_mem); + memset(addr, 0, tce_table_size); + + /* + * Map TCE table through TVT. The TVE index is the PE number + * shifted by 1 bit for 32-bits DMA space. + */ + rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, + pe->pe_number << 1, 1, __pa(addr), + tce_table_size, 0x1000); + if (rc) { + pe_err(pe, "Failed to configure 32-bit TCE table," + " err %ld\n", rc); + goto fail; + } + + /* Setup linux iommu table */ + tbl = pe->tce32_table; + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, + IOMMU_PAGE_SHIFT_4K); + + /* OPAL variant of PHB3 invalidated TCEs */ + swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); + if (swinvp) { + /* We need a couple more fields -- an address and a data + * to or. Since the bus is only printed out on table free + * errors, and on the first pass the data will be a relative + * bus number, print that out instead. + */ + pe->tce_inval_reg_phys = be64_to_cpup(swinvp); + tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, + 8); + tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + } + iommu_init_table(tbl, phb->hose->node); + + if (pe->flags & PNV_IODA_PE_DEV) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); + set_iommu_table_base_and_group(&pe->pdev->dev, tbl); + } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); + pnv_ioda_setup_bus_dma(pe, pe->pbus, true); + } else if (pe->flags & PNV_IODA_PE_VF) { + iommu_register_group(tbl, phb->hose->global_number, + pe->pe_number); + } + + /* Also create a bypass window */ + if (!pnv_iommu_bypass_disabled) + pnv_pci_ioda2_setup_bypass_pe(phb, pe); + + return; +fail: + if (pe->tce32_seg >= 0) + pe->tce32_seg = -1; + if (tce_mem) + __free_pages(tce_mem, get_order(tce_table_size)); +} + +static void pnv_ioda_setup_dma(struct pnv_phb *phb) +{ + struct pci_controller *hose = phb->hose; + unsigned int residual, remaining, segs, tw, base; + struct pnv_ioda_pe *pe; + + /* If we have more PE# than segments available, hand out one + * per PE until we run out and let the rest fail. If not, + * then we assign at least one segment per PE, plus more based + * on the amount of devices under that PE + */ + if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) + residual = 0; + else + residual = phb->ioda.tce32_count - + phb->ioda.dma_pe_count; + + pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", + hose->global_number, phb->ioda.tce32_count); + pr_info("PCI: %d PE# for a total weight of %d\n", + phb->ioda.dma_pe_count, phb->ioda.dma_weight); + + /* Walk our PE list and configure their DMA segments, hand them + * out one base segment plus any residual segments based on + * weight + */ + remaining = phb->ioda.tce32_count; + tw = phb->ioda.dma_weight; + base = 0; + list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { + if (!pe->dma_weight) + continue; + if (!remaining) { + pe_warn(pe, "No DMA32 resources available\n"); + continue; + } + segs = 1; + if (residual) { + segs += ((pe->dma_weight * residual) + (tw / 2)) / tw; + if (segs > remaining) + segs = remaining; + } + + /* + * For IODA2 compliant PHB3, we needn't care about the weight. + * The all available 32-bits DMA space will be assigned to + * the specific PE. + */ + if (phb->type == PNV_PHB_IODA1) { + pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", + pe->dma_weight, segs); + pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); + } else { + pe_info(pe, "Assign DMA32 space\n"); + segs = 0; + pnv_pci_ioda2_setup_dma_pe(phb, pe); + } + + remaining -= segs; + base += segs; + } +} + +#ifdef CONFIG_PCI_MSI +static void pnv_ioda2_msi_eoi(struct irq_data *d) +{ + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + struct irq_chip *chip = irq_data_get_irq_chip(d); + struct pnv_phb *phb = container_of(chip, struct pnv_phb, + ioda.irq_chip); + int64_t rc; + + rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); + WARN_ON_ONCE(rc); + + icp_native_eoi(d); +} + + +static void set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) +{ + struct irq_data *idata; + struct irq_chip *ichip; + + if (phb->type != PNV_PHB_IODA2) + return; + + if (!phb->ioda.irq_chip_init) { + /* + * First time we setup an MSI IRQ, we need to setup the + * corresponding IRQ chip to route correctly. + */ + idata = irq_get_irq_data(virq); + ichip = irq_data_get_irq_chip(idata); + phb->ioda.irq_chip_init = 1; + phb->ioda.irq_chip = *ichip; + phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; + } + irq_set_chip(virq, &phb->ioda.irq_chip); +} + +#ifdef CONFIG_CXL_BASE + +struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + + return of_node_get(hose->dn); +} +EXPORT_SYMBOL(pnv_pci_get_phb_node); + +int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pnv_ioda_pe *pe; + int rc; + + pe = pnv_ioda_get_pe(dev); + if (!pe) + return -ENODEV; + + pe_info(pe, "Switching PHB to CXL\n"); + + rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number); + if (rc) + dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc); + + return rc; +} +EXPORT_SYMBOL(pnv_phb_to_cxl_mode); + +/* Find PHB for cxl dev and allocate MSI hwirqs? + * Returns the absolute hardware IRQ number + */ +int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num); + + if (hwirq < 0) { + dev_warn(&dev->dev, "Failed to find a free MSI\n"); + return -ENOSPC; + } + + return phb->msi_base + hwirq; +} +EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs); + +void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num); +} +EXPORT_SYMBOL(pnv_cxl_release_hwirqs); + +void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs, + struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + int i, hwirq; + + for (i = 1; i < CXL_IRQ_RANGES; i++) { + if (!irqs->range[i]) + continue; + pr_devel("cxl release irq range 0x%x: offset: 0x%lx limit: %ld\n", + i, irqs->offset[i], + irqs->range[i]); + hwirq = irqs->offset[i] - phb->msi_base; + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, + irqs->range[i]); + } +} +EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges); + +int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs, + struct pci_dev *dev, int num) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + int i, hwirq, try; + + memset(irqs, 0, sizeof(struct cxl_irq_ranges)); + + /* 0 is reserved for the multiplexed PSL DSI interrupt */ + for (i = 1; i < CXL_IRQ_RANGES && num; i++) { + try = num; + while (try) { + hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try); + if (hwirq >= 0) + break; + try /= 2; + } + if (!try) + goto fail; + + irqs->offset[i] = phb->msi_base + hwirq; + irqs->range[i] = try; + pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx limit: %li\n", + i, irqs->offset[i], irqs->range[i]); + num -= try; + } + if (num) + goto fail; + + return 0; +fail: + pnv_cxl_release_hwirq_ranges(irqs, dev); + return -ENOSPC; +} +EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges); + +int pnv_cxl_get_irq_count(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + + return phb->msi_bmp.irq_count; +} +EXPORT_SYMBOL(pnv_cxl_get_irq_count); + +int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq, + unsigned int virq) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + unsigned int xive_num = hwirq - phb->msi_base; + struct pnv_ioda_pe *pe; + int rc; + + if (!(pe = pnv_ioda_get_pe(dev))) + return -ENODEV; + + /* Assign XIVE to PE */ + rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); + if (rc) { + pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x " + "hwirq 0x%x XIVE 0x%x PE\n", + pci_name(dev), rc, phb->msi_base, hwirq, xive_num); + return -EIO; + } + set_msi_irq_chip(phb, virq); + + return 0; +} +EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup); +#endif + +static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int hwirq, unsigned int virq, + unsigned int is_64, struct msi_msg *msg) +{ + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); + unsigned int xive_num = hwirq - phb->msi_base; + __be32 data; + int rc; + + /* No PE assigned ? bail out ... no MSI for you ! */ + if (pe == NULL) + return -ENXIO; + + /* Check if we have an MVE */ + if (pe->mve_number < 0) + return -ENXIO; + + /* Force 32-bit MSI on some broken devices */ + if (dev->no_64bit_msi) + is_64 = 0; + + /* Assign XIVE to PE */ + rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); + if (rc) { + pr_warn("%s: OPAL error %d setting XIVE %d PE\n", + pci_name(dev), rc, xive_num); + return -EIO; + } + + if (is_64) { + __be64 addr64; + + rc = opal_get_msi_64(phb->opal_id, pe->mve_number, xive_num, 1, + &addr64, &data); + if (rc) { + pr_warn("%s: OPAL error %d getting 64-bit MSI data\n", + pci_name(dev), rc); + return -EIO; + } + msg->address_hi = be64_to_cpu(addr64) >> 32; + msg->address_lo = be64_to_cpu(addr64) & 0xfffffffful; + } else { + __be32 addr32; + + rc = opal_get_msi_32(phb->opal_id, pe->mve_number, xive_num, 1, + &addr32, &data); + if (rc) { + pr_warn("%s: OPAL error %d getting 32-bit MSI data\n", + pci_name(dev), rc); + return -EIO; + } + msg->address_hi = 0; + msg->address_lo = be32_to_cpu(addr32); + } + msg->data = be32_to_cpu(data); + + set_msi_irq_chip(phb, virq); + + pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," + " address=%x_%08x data=%x PE# %d\n", + pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, + msg->address_hi, msg->address_lo, data, pe->pe_number); + + return 0; +} + +static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) +{ + unsigned int count; + const __be32 *prop = of_get_property(phb->hose->dn, + "ibm,opal-msi-ranges", NULL); + if (!prop) { + /* BML Fallback */ + prop = of_get_property(phb->hose->dn, "msi-ranges", NULL); + } + if (!prop) + return; + + phb->msi_base = be32_to_cpup(prop); + count = be32_to_cpup(prop + 1); + if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) { + pr_err("PCI %d: Failed to allocate MSI bitmap !\n", + phb->hose->global_number); + return; + } + + phb->msi_setup = pnv_pci_ioda_msi_setup; + phb->msi32_support = 1; + pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", + count, phb->msi_base); +} +#else +static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } +#endif /* CONFIG_PCI_MSI */ + +#ifdef CONFIG_PCI_IOV +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + struct resource *res; + int i; + resource_size_t size; + struct pci_dn *pdn; + int mul, total_vfs; + + if (!pdev->is_physfn || pdev->is_added) + return; + + hose = pci_bus_to_host(pdev->bus); + phb = hose->private_data; + + pdn = pci_get_pdn(pdev); + pdn->vfs_expanded = 0; + + total_vfs = pci_sriov_get_totalvfs(pdev); + pdn->m64_per_iov = 1; + mul = phb->ioda.total_pe; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || res->parent) + continue; + if (!pnv_pci_is_mem_pref_64(res->flags)) { + dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n", + i, res); + continue; + } + + size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); + + /* bigger than 64M */ + if (size > (1 << 26)) { + dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n", + i, res); + pdn->m64_per_iov = M64_PER_IOV; + mul = roundup_pow_of_two(total_vfs); + break; + } + } + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || res->parent) + continue; + if (!pnv_pci_is_mem_pref_64(res->flags)) { + dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n", + i, res); + continue; + } + + dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); + size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); + res->end = res->start + size * mul - 1; + dev_dbg(&pdev->dev, " %pR\n", res); + dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)", + i, res, mul); + } + pdn->vfs_expanded = mul; +} +#endif /* CONFIG_PCI_IOV */ + +/* + * This function is supposed to be called on basis of PE from top + * to bottom style. So the the I/O or MMIO segment assigned to + * parent PE could be overrided by its child PEs if necessary. + */ +static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, + struct pnv_ioda_pe *pe) +{ + struct pnv_phb *phb = hose->private_data; + struct pci_bus_region region; + struct resource *res; + int i, index; + int rc; + + /* + * NOTE: We only care PCI bus based PE for now. For PCI + * device based PE, for example SRIOV sensitive VF should + * be figured out later. + */ + BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); + + pci_bus_for_each_resource(pe->pbus, res, i) { + if (!res || !res->flags || + res->start > res->end) + continue; + + if (res->flags & IORESOURCE_IO) { + region.start = res->start - phb->ioda.io_pci_base; + region.end = res->end - phb->ioda.io_pci_base; + index = region.start / phb->ioda.io_segsize; + + while (index < phb->ioda.total_pe && + region.start <= region.end) { + phb->ioda.io_segmap[index] = pe->pe_number; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); + if (rc != OPAL_SUCCESS) { + pr_err("%s: OPAL error %d when mapping IO " + "segment #%d to PE#%d\n", + __func__, rc, index, pe->pe_number); + break; + } + + region.start += phb->ioda.io_segsize; + index++; + } + } else if ((res->flags & IORESOURCE_MEM) && + !pnv_pci_is_mem_pref_64(res->flags)) { + region.start = res->start - + hose->mem_offset[0] - + phb->ioda.m32_pci_base; + region.end = res->end - + hose->mem_offset[0] - + phb->ioda.m32_pci_base; + index = region.start / phb->ioda.m32_segsize; + + while (index < phb->ioda.total_pe && + region.start <= region.end) { + phb->ioda.m32_segmap[index] = pe->pe_number; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); + if (rc != OPAL_SUCCESS) { + pr_err("%s: OPAL error %d when mapping M32 " + "segment#%d to PE#%d", + __func__, rc, index, pe->pe_number); + break; + } + + region.start += phb->ioda.m32_segsize; + index++; + } + } + } +} + +static void pnv_pci_ioda_setup_seg(void) +{ + struct pci_controller *tmp, *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + phb = hose->private_data; + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + pnv_ioda_setup_pe_seg(hose, pe); + } + } +} + +static void pnv_pci_ioda_setup_DMA(void) +{ + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + pnv_ioda_setup_dma(hose->private_data); + + /* Mark the PHB initialization done */ + phb = hose->private_data; + phb->initialized = 1; + } +} + +static void pnv_pci_ioda_create_dbgfs(void) +{ +#ifdef CONFIG_DEBUG_FS + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; + char name[16]; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + phb = hose->private_data; + + sprintf(name, "PCI%04x", hose->global_number); + phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); + if (!phb->dbgfs) + pr_warning("%s: Error on creating debugfs on PHB#%x\n", + __func__, hose->global_number); + } +#endif /* CONFIG_DEBUG_FS */ +} + +static void pnv_pci_ioda_fixup(void) +{ + pnv_pci_ioda_setup_PEs(); + pnv_pci_ioda_setup_seg(); + pnv_pci_ioda_setup_DMA(); + + pnv_pci_ioda_create_dbgfs(); + +#ifdef CONFIG_EEH + eeh_init(); + eeh_addr_cache_build(); +#endif +} + +/* + * Returns the alignment for I/O or memory windows for P2P + * bridges. That actually depends on how PEs are segmented. + * For now, we return I/O or M32 segment size for PE sensitive + * P2P bridges. Otherwise, the default values (4KiB for I/O, + * 1MiB for memory) will be returned. + * + * The current PCI bus might be put into one PE, which was + * create against the parent PCI bridge. For that case, we + * needn't enlarge the alignment so that we can save some + * resources. + */ +static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, + unsigned long type) +{ + struct pci_dev *bridge; + struct pci_controller *hose = pci_bus_to_host(bus); + struct pnv_phb *phb = hose->private_data; + int num_pci_bridges = 0; + + bridge = bus->self; + while (bridge) { + if (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE) { + num_pci_bridges++; + if (num_pci_bridges >= 2) + return 1; + } + + bridge = bridge->bus->self; + } + + /* We fail back to M32 if M64 isn't supported */ + if (phb->ioda.m64_segsize && + pnv_pci_is_mem_pref_64(type)) + return phb->ioda.m64_segsize; + if (type & IORESOURCE_MEM) + return phb->ioda.m32_segsize; + + return phb->ioda.io_segsize; +} + +#ifdef CONFIG_PCI_IOV +static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, + int resno) +{ + struct pci_dn *pdn = pci_get_pdn(pdev); + resource_size_t align, iov_align; + + iov_align = resource_size(&pdev->resource[resno]); + if (iov_align) + return iov_align; + + align = pci_iov_resource_size(pdev, resno); + if (pdn->vfs_expanded) + return pdn->vfs_expanded * align; + + return align; +} +#endif /* CONFIG_PCI_IOV */ + +/* Prevent enabling devices for which we couldn't properly + * assign a PE + */ +static bool pnv_pci_enable_device_hook(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn; + + /* The function is probably called while the PEs have + * not be created yet. For example, resource reassignment + * during PCI probe period. We just skip the check if + * PEs isn't ready. + */ + if (!phb->initialized) + return true; + + pdn = pci_get_pdn(dev); + if (!pdn || pdn->pe_number == IODA_INVALID_PE) + return false; + + return true; +} + +static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, + u32 devfn) +{ + return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; +} + +static void pnv_pci_ioda_shutdown(struct pnv_phb *phb) +{ + opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE, + OPAL_ASSERT_RESET); +} + +static void __init pnv_pci_init_ioda_phb(struct device_node *np, + u64 hub_id, int ioda_type) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + unsigned long size, m32map_off, pemap_off, iomap_off = 0; + const __be64 *prop64; + const __be32 *prop32; + int len; + u64 phb_id; + void *aux; + long rc; + + pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name); + + prop64 = of_get_property(np, "ibm,opal-phbid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-phbid\" property !\n"); + return; + } + phb_id = be64_to_cpup(prop64); + pr_debug(" PHB-ID : 0x%016llx\n", phb_id); + + phb = memblock_virt_alloc(sizeof(struct pnv_phb), 0); + + /* Allocate PCI controller */ + phb->hose = hose = pcibios_alloc_controller(np); + if (!phb->hose) { + pr_err(" Can't allocate PCI controller for %s\n", + np->full_name); + memblock_free(__pa(phb), sizeof(struct pnv_phb)); + return; + } + + spin_lock_init(&phb->lock); + prop32 = of_get_property(np, "bus-range", &len); + if (prop32 && len == 8) { + hose->first_busno = be32_to_cpu(prop32[0]); + hose->last_busno = be32_to_cpu(prop32[1]); + } else { + pr_warn(" Broken <bus-range> on %s\n", np->full_name); + hose->first_busno = 0; + hose->last_busno = 0xff; + } + hose->private_data = phb; + phb->hub_id = hub_id; + phb->opal_id = phb_id; + phb->type = ioda_type; + mutex_init(&phb->ioda.pe_alloc_mutex); + + /* Detect specific models for error handling */ + if (of_device_is_compatible(np, "ibm,p7ioc-pciex")) + phb->model = PNV_PHB_MODEL_P7IOC; + else if (of_device_is_compatible(np, "ibm,power8-pciex")) + phb->model = PNV_PHB_MODEL_PHB3; + else + phb->model = PNV_PHB_MODEL_UNKNOWN; + + /* Parse 32-bit and IO ranges (if any) */ + pci_process_bridge_OF_ranges(hose, np, !hose->global_number); + + /* Get registers */ + phb->regs = of_iomap(np, 0); + if (phb->regs == NULL) + pr_err(" Failed to map registers !\n"); + + /* Initialize more IODA stuff */ + phb->ioda.total_pe = 1; + prop32 = of_get_property(np, "ibm,opal-num-pes", NULL); + if (prop32) + phb->ioda.total_pe = be32_to_cpup(prop32); + prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL); + if (prop32) + phb->ioda.reserved_pe = be32_to_cpup(prop32); + + /* Parse 64-bit MMIO range */ + pnv_ioda_parse_m64_window(phb); + + phb->ioda.m32_size = resource_size(&hose->mem_resources[0]); + /* FW Has already off top 64k of M32 space (MSI space) */ + phb->ioda.m32_size += 0x10000; + + phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; + phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0]; + phb->ioda.io_size = hose->pci_io_size; + phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; + phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ + + /* Allocate aux data & arrays. We don't have IO ports on PHB3 */ + size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); + m32map_off = size; + size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); + if (phb->type == PNV_PHB_IODA1) { + iomap_off = size; + size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); + } + pemap_off = size; + size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); + aux = memblock_virt_alloc(size, 0); + phb->ioda.pe_alloc = aux; + phb->ioda.m32_segmap = aux + m32map_off; + if (phb->type == PNV_PHB_IODA1) + phb->ioda.io_segmap = aux + iomap_off; + phb->ioda.pe_array = aux + pemap_off; + set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc); + + INIT_LIST_HEAD(&phb->ioda.pe_dma_list); + INIT_LIST_HEAD(&phb->ioda.pe_list); + mutex_init(&phb->ioda.pe_list_mutex); + + /* Calculate how many 32-bit TCE segments we have */ + phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; + +#if 0 /* We should really do that ... */ + rc = opal_pci_set_phb_mem_window(opal->phb_id, + window_type, + window_num, + starting_real_address, + starting_pci_address, + segment_size); +#endif + + pr_info(" %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n", + phb->ioda.total_pe, phb->ioda.reserved_pe, + phb->ioda.m32_size, phb->ioda.m32_segsize); + if (phb->ioda.m64_size) + pr_info(" M64: 0x%lx [segment=0x%lx]\n", + phb->ioda.m64_size, phb->ioda.m64_segsize); + if (phb->ioda.io_size) + pr_info(" IO: 0x%x [segment=0x%x]\n", + phb->ioda.io_size, phb->ioda.io_segsize); + + + phb->hose->ops = &pnv_pci_ops; + phb->get_pe_state = pnv_ioda_get_pe_state; + phb->freeze_pe = pnv_ioda_freeze_pe; + phb->unfreeze_pe = pnv_ioda_unfreeze_pe; + + /* Setup RID -> PE mapping function */ + phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; + + /* Setup TCEs */ + phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; + phb->dma_set_mask = pnv_pci_ioda_dma_set_mask; + phb->dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask; + + /* Setup shutdown function for kexec */ + phb->shutdown = pnv_pci_ioda_shutdown; + + /* Setup MSI support */ + pnv_pci_init_ioda_msis(phb); + + /* + * We pass the PCI probe flag PCI_REASSIGN_ALL_RSRC here + * to let the PCI core do resource assignment. It's supposed + * that the PCI core will do correct I/O and MMIO alignment + * for the P2P bridge bars so that each PCI bus (excluding + * the child P2P bridges) can form individual PE. + */ + ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; + pnv_pci_controller_ops.enable_device_hook = pnv_pci_enable_device_hook; + pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment; + pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus; + hose->controller_ops = pnv_pci_controller_ops; + +#ifdef CONFIG_PCI_IOV + ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; + ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment; +#endif + + pci_add_flags(PCI_REASSIGN_ALL_RSRC); + + /* Reset IODA tables to a clean state */ + rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET); + if (rc) + pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc); + + /* If we're running in kdump kerenl, the previous kerenl never + * shutdown PCI devices correctly. We already got IODA table + * cleaned out. So we have to issue PHB reset to stop all PCI + * transactions from previous kerenl. + */ + if (is_kdump_kernel()) { + pr_info(" Issue PHB reset ...\n"); + pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); + pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE); + } + + /* Remove M64 resource if we can't configure it successfully */ + if (!phb->init_m64 || phb->init_m64(phb)) + hose->mem_resources[1].flags = 0; +} + +void __init pnv_pci_init_ioda2_phb(struct device_node *np) +{ + pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2); +} + +void __init pnv_pci_init_ioda_hub(struct device_node *np) +{ + struct device_node *phbn; + const __be64 *prop64; + u64 hub_id; + + pr_info("Probing IODA IO-Hub %s\n", np->full_name); + + prop64 = of_get_property(np, "ibm,opal-hubid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-hubid\" property !\n"); + return; + } + hub_id = be64_to_cpup(prop64); + pr_devel(" HUB-ID : 0x%016llx\n", hub_id); + + /* Count child PHBs */ + for_each_child_of_node(np, phbn) { + /* Look for IODA1 PHBs */ + if (of_device_is_compatible(phbn, "ibm,ioda-phb")) + pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1); + } +} diff --git a/kernel/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/kernel/arch/powerpc/platforms/powernv/pci-p5ioc2.c new file mode 100644 index 000000000..4729ca793 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -0,0 +1,236 @@ +/* + * Support PCI/PCIe on PowerNV platforms + * + * Currently supports only P5IOC2 + * + * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/irq.h> +#include <linux/io.h> +#include <linux/msi.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/msi_bitmap.h> +#include <asm/ppc-pci.h> +#include <asm/opal.h> +#include <asm/iommu.h> +#include <asm/tce.h> + +#include "powernv.h" +#include "pci.h" + +/* For now, use a fixed amount of TCE memory for each p5ioc2 + * hub, 16M will do + */ +#define P5IOC2_TCE_MEMORY 0x01000000 + +#ifdef CONFIG_PCI_MSI +static int pnv_pci_p5ioc2_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int hwirq, unsigned int virq, + unsigned int is_64, struct msi_msg *msg) +{ + if (WARN_ON(!is_64)) + return -ENXIO; + msg->data = hwirq - phb->msi_base; + msg->address_hi = 0x10000000; + msg->address_lo = 0; + + return 0; +} + +static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) +{ + unsigned int count; + const __be32 *prop = of_get_property(phb->hose->dn, + "ibm,opal-msi-ranges", NULL); + if (!prop) + return; + + /* Don't do MSI's on p5ioc2 PCI-X are they are not properly + * verified in HW + */ + if (of_device_is_compatible(phb->hose->dn, "ibm,p5ioc2-pcix")) + return; + phb->msi_base = be32_to_cpup(prop); + count = be32_to_cpup(prop + 1); + if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) { + pr_err("PCI %d: Failed to allocate MSI bitmap !\n", + phb->hose->global_number); + return; + } + phb->msi_setup = pnv_pci_p5ioc2_msi_setup; + phb->msi32_support = 0; + pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", + count, phb->msi_base); +} +#else +static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { } +#endif /* CONFIG_PCI_MSI */ + +static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, + struct pci_dev *pdev) +{ + if (phb->p5ioc2.iommu_table.it_map == NULL) { + iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node); + iommu_register_group(&phb->p5ioc2.iommu_table, + pci_domain_nr(phb->hose->bus), phb->opal_id); + } + + set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table); +} + +static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, + void *tce_mem, u64 tce_size) +{ + struct pnv_phb *phb; + const __be64 *prop64; + u64 phb_id; + int64_t rc; + static int primary = 1; + + pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name); + + prop64 = of_get_property(np, "ibm,opal-phbid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-phbid\" property !\n"); + return; + } + phb_id = be64_to_cpup(prop64); + pr_devel(" PHB-ID : 0x%016llx\n", phb_id); + pr_devel(" TCE AT : 0x%016lx\n", __pa(tce_mem)); + pr_devel(" TCE SZ : 0x%016llx\n", tce_size); + + rc = opal_pci_set_phb_tce_memory(phb_id, __pa(tce_mem), tce_size); + if (rc != OPAL_SUCCESS) { + pr_err(" Failed to set TCE memory, OPAL error %lld\n", rc); + return; + } + + phb = memblock_virt_alloc(sizeof(struct pnv_phb), 0); + phb->hose = pcibios_alloc_controller(np); + if (!phb->hose) { + pr_err(" Failed to allocate PCI controller\n"); + return; + } + + spin_lock_init(&phb->lock); + phb->hose->first_busno = 0; + phb->hose->last_busno = 0xff; + phb->hose->private_data = phb; + phb->hose->controller_ops = pnv_pci_controller_ops; + phb->hub_id = hub_id; + phb->opal_id = phb_id; + phb->type = PNV_PHB_P5IOC2; + phb->model = PNV_PHB_MODEL_P5IOC2; + + phb->regs = of_iomap(np, 0); + + if (phb->regs == NULL) + pr_err(" Failed to map registers !\n"); + else { + pr_devel(" P_BUID = 0x%08x\n", in_be32(phb->regs + 0x100)); + pr_devel(" P_IOSZ = 0x%08x\n", in_be32(phb->regs + 0x1b0)); + pr_devel(" P_IO_ST = 0x%08x\n", in_be32(phb->regs + 0x1e0)); + pr_devel(" P_MEM1_H = 0x%08x\n", in_be32(phb->regs + 0x1a0)); + pr_devel(" P_MEM1_L = 0x%08x\n", in_be32(phb->regs + 0x190)); + pr_devel(" P_MSZ1_L = 0x%08x\n", in_be32(phb->regs + 0x1c0)); + pr_devel(" P_MEM_ST = 0x%08x\n", in_be32(phb->regs + 0x1d0)); + pr_devel(" P_MEM2_H = 0x%08x\n", in_be32(phb->regs + 0x2c0)); + pr_devel(" P_MEM2_L = 0x%08x\n", in_be32(phb->regs + 0x2b0)); + pr_devel(" P_MSZ2_H = 0x%08x\n", in_be32(phb->regs + 0x2d0)); + pr_devel(" P_MSZ2_L = 0x%08x\n", in_be32(phb->regs + 0x2e0)); + } + + /* Interpret the "ranges" property */ + /* This also maps the I/O region and sets isa_io/mem_base */ + pci_process_bridge_OF_ranges(phb->hose, np, primary); + primary = 0; + + phb->hose->ops = &pnv_pci_ops; + + /* Setup MSI support */ + pnv_pci_init_p5ioc2_msis(phb); + + /* Setup TCEs */ + phb->dma_dev_setup = pnv_pci_p5ioc2_dma_dev_setup; + pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table, + tce_mem, tce_size, 0, + IOMMU_PAGE_SHIFT_4K); +} + +void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) +{ + struct device_node *phbn; + const __be64 *prop64; + u64 hub_id; + void *tce_mem; + uint64_t tce_per_phb; + int64_t rc; + int phb_count = 0; + + pr_info("Probing p5ioc2 IO-Hub %s\n", np->full_name); + + prop64 = of_get_property(np, "ibm,opal-hubid", NULL); + if (!prop64) { + pr_err(" Missing \"ibm,opal-hubid\" property !\n"); + return; + } + hub_id = be64_to_cpup(prop64); + pr_info(" HUB-ID : 0x%016llx\n", hub_id); + + /* Count child PHBs and calculate TCE space per PHB */ + for_each_child_of_node(np, phbn) { + if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") || + of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) + phb_count++; + } + + if (phb_count <= 0) { + pr_info(" No PHBs for Hub %s\n", np->full_name); + return; + } + + tce_per_phb = __rounddown_pow_of_two(P5IOC2_TCE_MEMORY / phb_count); + pr_info(" Allocating %lld MB of TCE memory per PHB\n", + tce_per_phb >> 20); + + /* Currently allocate 16M of TCE memory for every Hub + * + * XXX TODO: Make it chip local if possible + */ + tce_mem = memblock_virt_alloc(P5IOC2_TCE_MEMORY, P5IOC2_TCE_MEMORY); + pr_debug(" TCE : 0x%016lx..0x%016lx\n", + __pa(tce_mem), __pa(tce_mem) + P5IOC2_TCE_MEMORY - 1); + rc = opal_pci_set_hub_tce_memory(hub_id, __pa(tce_mem), + P5IOC2_TCE_MEMORY); + if (rc != OPAL_SUCCESS) { + pr_err(" Failed to allocate TCE memory, OPAL error %lld\n", rc); + return; + } + + /* Initialize PHBs */ + for_each_child_of_node(np, phbn) { + if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") || + of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) { + pnv_pci_init_p5ioc2_phb(phbn, hub_id, + tce_mem, tce_per_phb); + tce_mem += tce_per_phb; + } + } +} diff --git a/kernel/arch/powerpc/platforms/powernv/pci.c b/kernel/arch/powerpc/platforms/powernv/pci.c new file mode 100644 index 000000000..bca2aeb6e --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/pci.c @@ -0,0 +1,783 @@ +/* + * Support PCI/PCIe on PowerNV platforms + * + * Currently supports only P5IOC2 + * + * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/io.h> +#include <linux/msi.h> +#include <linux/iommu.h> + +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/msi_bitmap.h> +#include <asm/ppc-pci.h> +#include <asm/opal.h> +#include <asm/iommu.h> +#include <asm/tce.h> +#include <asm/firmware.h> +#include <asm/eeh_event.h> +#include <asm/eeh.h> + +#include "powernv.h" +#include "pci.h" + +/* Delay in usec */ +#define PCI_RESET_DELAY_US 3000000 + +#define cfg_dbg(fmt...) do { } while(0) +//#define cfg_dbg(fmt...) printk(fmt) + +#ifdef CONFIG_PCI_MSI +static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + struct msi_desc *entry; + struct msi_msg msg; + int hwirq; + unsigned int virq; + int rc; + + if (WARN_ON(!phb) || !phb->msi_bmp.bitmap) + return -ENODEV; + + if (pdev->no_64bit_msi && !phb->msi32_support) + return -ENODEV; + + list_for_each_entry(entry, &pdev->msi_list, list) { + if (!entry->msi_attrib.is_64 && !phb->msi32_support) { + pr_warn("%s: Supports only 64-bit MSIs\n", + pci_name(pdev)); + return -ENXIO; + } + hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1); + if (hwirq < 0) { + pr_warn("%s: Failed to find a free MSI\n", + pci_name(pdev)); + return -ENOSPC; + } + virq = irq_create_mapping(NULL, phb->msi_base + hwirq); + if (virq == NO_IRQ) { + pr_warn("%s: Failed to map MSI to linux irq\n", + pci_name(pdev)); + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); + return -ENOMEM; + } + rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq, + virq, entry->msi_attrib.is_64, &msg); + if (rc) { + pr_warn("%s: Failed to setup MSI\n", pci_name(pdev)); + irq_dispose_mapping(virq); + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); + return rc; + } + irq_set_msi_desc(virq, entry); + pci_write_msi_msg(virq, &msg); + } + return 0; +} + +static void pnv_teardown_msi_irqs(struct pci_dev *pdev) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + struct msi_desc *entry; + + if (WARN_ON(!phb)) + return; + + list_for_each_entry(entry, &pdev->msi_list, list) { + if (entry->irq == NO_IRQ) + continue; + irq_set_msi_desc(entry->irq, NULL); + msi_bitmap_free_hwirqs(&phb->msi_bmp, + virq_to_hw(entry->irq) - phb->msi_base, 1); + irq_dispose_mapping(entry->irq); + } +} +#endif /* CONFIG_PCI_MSI */ + +static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose, + struct OpalIoPhbErrorCommon *common) +{ + struct OpalIoP7IOCPhbErrorData *data; + int i; + + data = (struct OpalIoP7IOCPhbErrorData *)common; + pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n", + hose->global_number, be32_to_cpu(common->version)); + + if (data->brdgCtl) + pr_info("brdgCtl: %08x\n", + be32_to_cpu(data->brdgCtl)); + if (data->portStatusReg || data->rootCmplxStatus || + data->busAgentStatus) + pr_info("UtlSts: %08x %08x %08x\n", + be32_to_cpu(data->portStatusReg), + be32_to_cpu(data->rootCmplxStatus), + be32_to_cpu(data->busAgentStatus)); + if (data->deviceStatus || data->slotStatus || + data->linkStatus || data->devCmdStatus || + data->devSecStatus) + pr_info("RootSts: %08x %08x %08x %08x %08x\n", + be32_to_cpu(data->deviceStatus), + be32_to_cpu(data->slotStatus), + be32_to_cpu(data->linkStatus), + be32_to_cpu(data->devCmdStatus), + be32_to_cpu(data->devSecStatus)); + if (data->rootErrorStatus || data->uncorrErrorStatus || + data->corrErrorStatus) + pr_info("RootErrSts: %08x %08x %08x\n", + be32_to_cpu(data->rootErrorStatus), + be32_to_cpu(data->uncorrErrorStatus), + be32_to_cpu(data->corrErrorStatus)); + if (data->tlpHdr1 || data->tlpHdr2 || + data->tlpHdr3 || data->tlpHdr4) + pr_info("RootErrLog: %08x %08x %08x %08x\n", + be32_to_cpu(data->tlpHdr1), + be32_to_cpu(data->tlpHdr2), + be32_to_cpu(data->tlpHdr3), + be32_to_cpu(data->tlpHdr4)); + if (data->sourceId || data->errorClass || + data->correlator) + pr_info("RootErrLog1: %08x %016llx %016llx\n", + be32_to_cpu(data->sourceId), + be64_to_cpu(data->errorClass), + be64_to_cpu(data->correlator)); + if (data->p7iocPlssr || data->p7iocCsr) + pr_info("PhbSts: %016llx %016llx\n", + be64_to_cpu(data->p7iocPlssr), + be64_to_cpu(data->p7iocCsr)); + if (data->lemFir) + pr_info("Lem: %016llx %016llx %016llx\n", + be64_to_cpu(data->lemFir), + be64_to_cpu(data->lemErrorMask), + be64_to_cpu(data->lemWOF)); + if (data->phbErrorStatus) + pr_info("PhbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbErrorStatus), + be64_to_cpu(data->phbFirstErrorStatus), + be64_to_cpu(data->phbErrorLog0), + be64_to_cpu(data->phbErrorLog1)); + if (data->mmioErrorStatus) + pr_info("OutErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->mmioErrorStatus), + be64_to_cpu(data->mmioFirstErrorStatus), + be64_to_cpu(data->mmioErrorLog0), + be64_to_cpu(data->mmioErrorLog1)); + if (data->dma0ErrorStatus) + pr_info("InAErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->dma0ErrorStatus), + be64_to_cpu(data->dma0FirstErrorStatus), + be64_to_cpu(data->dma0ErrorLog0), + be64_to_cpu(data->dma0ErrorLog1)); + if (data->dma1ErrorStatus) + pr_info("InBErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->dma1ErrorStatus), + be64_to_cpu(data->dma1FirstErrorStatus), + be64_to_cpu(data->dma1ErrorLog0), + be64_to_cpu(data->dma1ErrorLog1)); + + for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) { + if ((data->pestA[i] >> 63) == 0 && + (data->pestB[i] >> 63) == 0) + continue; + + pr_info("PE[%3d] A/B: %016llx %016llx\n", + i, be64_to_cpu(data->pestA[i]), + be64_to_cpu(data->pestB[i])); + } +} + +static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose, + struct OpalIoPhbErrorCommon *common) +{ + struct OpalIoPhb3ErrorData *data; + int i; + + data = (struct OpalIoPhb3ErrorData*)common; + pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n", + hose->global_number, be32_to_cpu(common->version)); + if (data->brdgCtl) + pr_info("brdgCtl: %08x\n", + be32_to_cpu(data->brdgCtl)); + if (data->portStatusReg || data->rootCmplxStatus || + data->busAgentStatus) + pr_info("UtlSts: %08x %08x %08x\n", + be32_to_cpu(data->portStatusReg), + be32_to_cpu(data->rootCmplxStatus), + be32_to_cpu(data->busAgentStatus)); + if (data->deviceStatus || data->slotStatus || + data->linkStatus || data->devCmdStatus || + data->devSecStatus) + pr_info("RootSts: %08x %08x %08x %08x %08x\n", + be32_to_cpu(data->deviceStatus), + be32_to_cpu(data->slotStatus), + be32_to_cpu(data->linkStatus), + be32_to_cpu(data->devCmdStatus), + be32_to_cpu(data->devSecStatus)); + if (data->rootErrorStatus || data->uncorrErrorStatus || + data->corrErrorStatus) + pr_info("RootErrSts: %08x %08x %08x\n", + be32_to_cpu(data->rootErrorStatus), + be32_to_cpu(data->uncorrErrorStatus), + be32_to_cpu(data->corrErrorStatus)); + if (data->tlpHdr1 || data->tlpHdr2 || + data->tlpHdr3 || data->tlpHdr4) + pr_info("RootErrLog: %08x %08x %08x %08x\n", + be32_to_cpu(data->tlpHdr1), + be32_to_cpu(data->tlpHdr2), + be32_to_cpu(data->tlpHdr3), + be32_to_cpu(data->tlpHdr4)); + if (data->sourceId || data->errorClass || + data->correlator) + pr_info("RootErrLog1: %08x %016llx %016llx\n", + be32_to_cpu(data->sourceId), + be64_to_cpu(data->errorClass), + be64_to_cpu(data->correlator)); + if (data->nFir) + pr_info("nFir: %016llx %016llx %016llx\n", + be64_to_cpu(data->nFir), + be64_to_cpu(data->nFirMask), + be64_to_cpu(data->nFirWOF)); + if (data->phbPlssr || data->phbCsr) + pr_info("PhbSts: %016llx %016llx\n", + be64_to_cpu(data->phbPlssr), + be64_to_cpu(data->phbCsr)); + if (data->lemFir) + pr_info("Lem: %016llx %016llx %016llx\n", + be64_to_cpu(data->lemFir), + be64_to_cpu(data->lemErrorMask), + be64_to_cpu(data->lemWOF)); + if (data->phbErrorStatus) + pr_info("PhbErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->phbErrorStatus), + be64_to_cpu(data->phbFirstErrorStatus), + be64_to_cpu(data->phbErrorLog0), + be64_to_cpu(data->phbErrorLog1)); + if (data->mmioErrorStatus) + pr_info("OutErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->mmioErrorStatus), + be64_to_cpu(data->mmioFirstErrorStatus), + be64_to_cpu(data->mmioErrorLog0), + be64_to_cpu(data->mmioErrorLog1)); + if (data->dma0ErrorStatus) + pr_info("InAErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->dma0ErrorStatus), + be64_to_cpu(data->dma0FirstErrorStatus), + be64_to_cpu(data->dma0ErrorLog0), + be64_to_cpu(data->dma0ErrorLog1)); + if (data->dma1ErrorStatus) + pr_info("InBErr: %016llx %016llx %016llx %016llx\n", + be64_to_cpu(data->dma1ErrorStatus), + be64_to_cpu(data->dma1FirstErrorStatus), + be64_to_cpu(data->dma1ErrorLog0), + be64_to_cpu(data->dma1ErrorLog1)); + + for (i = 0; i < OPAL_PHB3_NUM_PEST_REGS; i++) { + if ((be64_to_cpu(data->pestA[i]) >> 63) == 0 && + (be64_to_cpu(data->pestB[i]) >> 63) == 0) + continue; + + pr_info("PE[%3d] A/B: %016llx %016llx\n", + i, be64_to_cpu(data->pestA[i]), + be64_to_cpu(data->pestB[i])); + } +} + +void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, + unsigned char *log_buff) +{ + struct OpalIoPhbErrorCommon *common; + + if (!hose || !log_buff) + return; + + common = (struct OpalIoPhbErrorCommon *)log_buff; + switch (be32_to_cpu(common->ioType)) { + case OPAL_PHB_ERROR_DATA_TYPE_P7IOC: + pnv_pci_dump_p7ioc_diag_data(hose, common); + break; + case OPAL_PHB_ERROR_DATA_TYPE_PHB3: + pnv_pci_dump_phb3_diag_data(hose, common); + break; + default: + pr_warn("%s: Unrecognized ioType %d\n", + __func__, be32_to_cpu(common->ioType)); + } +} + +static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no) +{ + unsigned long flags, rc; + int has_diag, ret = 0; + + spin_lock_irqsave(&phb->lock, flags); + + /* Fetch PHB diag-data */ + rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob, + PNV_PCI_DIAG_BUF_SIZE); + has_diag = (rc == OPAL_SUCCESS); + + /* If PHB supports compound PE, to handle it */ + if (phb->unfreeze_pe) { + ret = phb->unfreeze_pe(phb, + pe_no, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + } else { + rc = opal_pci_eeh_freeze_clear(phb->opal_id, + pe_no, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + if (rc) { + pr_warn("%s: Failure %ld clearing frozen " + "PHB#%x-PE#%x\n", + __func__, rc, phb->hose->global_number, + pe_no); + ret = -EIO; + } + } + + /* + * For now, let's only display the diag buffer when we fail to clear + * the EEH status. We'll do more sensible things later when we have + * proper EEH support. We need to make sure we don't pollute ourselves + * with the normal errors generated when probing empty slots + */ + if (has_diag && ret) + pnv_pci_dump_phb_diag_data(phb->hose, phb->diag.blob); + + spin_unlock_irqrestore(&phb->lock, flags); +} + +static void pnv_pci_config_check_eeh(struct pci_dn *pdn) +{ + struct pnv_phb *phb = pdn->phb->private_data; + u8 fstate; + __be16 pcierr; + int pe_no; + s64 rc; + + /* + * Get the PE#. During the PCI probe stage, we might not + * setup that yet. So all ER errors should be mapped to + * reserved PE. + */ + pe_no = pdn->pe_number; + if (pe_no == IODA_INVALID_PE) { + if (phb->type == PNV_PHB_P5IOC2) + pe_no = 0; + else + pe_no = phb->ioda.reserved_pe; + } + + /* + * Fetch frozen state. If the PHB support compound PE, + * we need handle that case. + */ + if (phb->get_pe_state) { + fstate = phb->get_pe_state(phb, pe_no); + } else { + rc = opal_pci_eeh_freeze_status(phb->opal_id, + pe_no, + &fstate, + &pcierr, + NULL); + if (rc) { + pr_warn("%s: Failure %lld getting PHB#%x-PE#%x state\n", + __func__, rc, phb->hose->global_number, pe_no); + return; + } + } + + cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n", + (pdn->busno << 8) | (pdn->devfn), pe_no, fstate); + + /* Clear the frozen state if applicable */ + if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE || + fstate == OPAL_EEH_STOPPED_DMA_FREEZE || + fstate == OPAL_EEH_STOPPED_MMIO_DMA_FREEZE) { + /* + * If PHB supports compound PE, freeze it for + * consistency. + */ + if (phb->freeze_pe) + phb->freeze_pe(phb, pe_no); + + pnv_pci_handle_eeh_config(phb, pe_no); + } +} + +int pnv_pci_cfg_read(struct pci_dn *pdn, + int where, int size, u32 *val) +{ + struct pnv_phb *phb = pdn->phb->private_data; + u32 bdfn = (pdn->busno << 8) | pdn->devfn; + s64 rc; + + switch (size) { + case 1: { + u8 v8; + rc = opal_pci_config_read_byte(phb->opal_id, bdfn, where, &v8); + *val = (rc == OPAL_SUCCESS) ? v8 : 0xff; + break; + } + case 2: { + __be16 v16; + rc = opal_pci_config_read_half_word(phb->opal_id, bdfn, where, + &v16); + *val = (rc == OPAL_SUCCESS) ? be16_to_cpu(v16) : 0xffff; + break; + } + case 4: { + __be32 v32; + rc = opal_pci_config_read_word(phb->opal_id, bdfn, where, &v32); + *val = (rc == OPAL_SUCCESS) ? be32_to_cpu(v32) : 0xffffffff; + break; + } + default: + return PCIBIOS_FUNC_NOT_SUPPORTED; + } + + cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + __func__, pdn->busno, pdn->devfn, where, size, *val); + return PCIBIOS_SUCCESSFUL; +} + +int pnv_pci_cfg_write(struct pci_dn *pdn, + int where, int size, u32 val) +{ + struct pnv_phb *phb = pdn->phb->private_data; + u32 bdfn = (pdn->busno << 8) | pdn->devfn; + + cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + pdn->busno, pdn->devfn, where, size, val); + switch (size) { + case 1: + opal_pci_config_write_byte(phb->opal_id, bdfn, where, val); + break; + case 2: + opal_pci_config_write_half_word(phb->opal_id, bdfn, where, val); + break; + case 4: + opal_pci_config_write_word(phb->opal_id, bdfn, where, val); + break; + default: + return PCIBIOS_FUNC_NOT_SUPPORTED; + } + + return PCIBIOS_SUCCESSFUL; +} + +#if CONFIG_EEH +static bool pnv_pci_cfg_check(struct pci_dn *pdn) +{ + struct eeh_dev *edev = NULL; + struct pnv_phb *phb = pdn->phb->private_data; + + /* EEH not enabled ? */ + if (!(phb->flags & PNV_PHB_FLAG_EEH)) + return true; + + /* PE reset or device removed ? */ + edev = pdn->edev; + if (edev) { + if (edev->pe && + (edev->pe->state & EEH_PE_CFG_BLOCKED)) + return false; + + if (edev->mode & EEH_DEV_REMOVED) + return false; + } + + return true; +} +#else +static inline pnv_pci_cfg_check(struct pci_dn *pdn) +{ + return true; +} +#endif /* CONFIG_EEH */ + +static int pnv_pci_read_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 *val) +{ + struct pci_dn *pdn; + struct pnv_phb *phb; + int ret; + + *val = 0xFFFFFFFF; + pdn = pci_get_pdn_by_devfn(bus, devfn); + if (!pdn) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (!pnv_pci_cfg_check(pdn)) + return PCIBIOS_DEVICE_NOT_FOUND; + + ret = pnv_pci_cfg_read(pdn, where, size, val); + phb = pdn->phb->private_data; + if (phb->flags & PNV_PHB_FLAG_EEH && pdn->edev) { + if (*val == EEH_IO_ERROR_VALUE(size) && + eeh_dev_check_failure(pdn->edev)) + return PCIBIOS_DEVICE_NOT_FOUND; + } else { + pnv_pci_config_check_eeh(pdn); + } + + return ret; +} + +static int pnv_pci_write_config(struct pci_bus *bus, + unsigned int devfn, + int where, int size, u32 val) +{ + struct pci_dn *pdn; + struct pnv_phb *phb; + int ret; + + pdn = pci_get_pdn_by_devfn(bus, devfn); + if (!pdn) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (!pnv_pci_cfg_check(pdn)) + return PCIBIOS_DEVICE_NOT_FOUND; + + ret = pnv_pci_cfg_write(pdn, where, size, val); + phb = pdn->phb->private_data; + if (!(phb->flags & PNV_PHB_FLAG_EEH)) + pnv_pci_config_check_eeh(pdn); + + return ret; +} + +struct pci_ops pnv_pci_ops = { + .read = pnv_pci_read_config, + .write = pnv_pci_write_config, +}; + +static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + struct dma_attrs *attrs, bool rm) +{ + u64 proto_tce; + __be64 *tcep, *tces; + u64 rpn; + + proto_tce = TCE_PCI_READ; // Read allowed + + if (direction != DMA_TO_DEVICE) + proto_tce |= TCE_PCI_WRITE; + + tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; + rpn = __pa(uaddr) >> tbl->it_page_shift; + + while (npages--) + *(tcep++) = cpu_to_be64(proto_tce | + (rpn++ << tbl->it_page_shift)); + + /* Some implementations won't cache invalid TCEs and thus may not + * need that flush. We'll probably turn it_type into a bit mask + * of flags if that becomes the case + */ + if (tbl->it_type & TCE_PCI_SWINV_CREATE) + pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm); + + return 0; +} + +static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, + false); +} + +static void pnv_tce_free(struct iommu_table *tbl, long index, long npages, + bool rm) +{ + __be64 *tcep, *tces; + + tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; + + while (npages--) + *(tcep++) = cpu_to_be64(0); + + if (tbl->it_type & TCE_PCI_SWINV_FREE) + pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm); +} + +static void pnv_tce_free_vm(struct iommu_table *tbl, long index, long npages) +{ + pnv_tce_free(tbl, index, npages, false); +} + +static unsigned long pnv_tce_get(struct iommu_table *tbl, long index) +{ + return ((u64 *)tbl->it_base)[index - tbl->it_offset]; +} + +static int pnv_tce_build_rm(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, true); +} + +static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages) +{ + pnv_tce_free(tbl, index, npages, true); +} + +void pnv_pci_setup_iommu_table(struct iommu_table *tbl, + void *tce_mem, u64 tce_size, + u64 dma_offset, unsigned page_shift) +{ + tbl->it_blocksize = 16; + tbl->it_base = (unsigned long)tce_mem; + tbl->it_page_shift = page_shift; + tbl->it_offset = dma_offset >> tbl->it_page_shift; + tbl->it_index = 0; + tbl->it_size = tce_size >> 3; + tbl->it_busno = 0; + tbl->it_type = TCE_PCI; +} + +static void pnv_pci_dma_dev_setup(struct pci_dev *pdev) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; +#ifdef CONFIG_PCI_IOV + struct pnv_ioda_pe *pe; + struct pci_dn *pdn; + + /* Fix the VF pdn PE number */ + if (pdev->is_virtfn) { + pdn = pci_get_pdn(pdev); + WARN_ON(pdn->pe_number != IODA_INVALID_PE); + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + if (pe->rid == ((pdev->bus->number << 8) | + (pdev->devfn & 0xff))) { + pdn->pe_number = pe->pe_number; + pe->pdev = pdev; + break; + } + } + } +#endif /* CONFIG_PCI_IOV */ + + if (phb && phb->dma_dev_setup) + phb->dma_dev_setup(phb, pdev); +} + +int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + + if (phb && phb->dma_set_mask) + return phb->dma_set_mask(phb, pdev, dma_mask); + return __dma_set_mask(&pdev->dev, dma_mask); +} + +u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + + if (phb && phb->dma_get_required_mask) + return phb->dma_get_required_mask(phb, pdev); + + return __dma_get_required_mask(&pdev->dev); +} + +void pnv_pci_shutdown(void) +{ + struct pci_controller *hose; + + list_for_each_entry(hose, &hose_list, list_node) { + struct pnv_phb *phb = hose->private_data; + + if (phb && phb->shutdown) + phb->shutdown(phb); + } +} + +/* Fixup wrong class code in p7ioc and p8 root complex */ +static void pnv_p7ioc_rc_quirk(struct pci_dev *dev) +{ + dev->class = PCI_CLASS_BRIDGE_PCI << 8; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk); + +void __init pnv_pci_init(void) +{ + struct device_node *np; + bool found_ioda = false; + + pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN); + + /* If we don't have OPAL, eg. in sim, just skip PCI probe */ + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return; + + /* Look for IODA IO-Hubs. We don't support mixing IODA + * and p5ioc2 due to the need to change some global + * probing flags + */ + for_each_compatible_node(np, NULL, "ibm,ioda-hub") { + pnv_pci_init_ioda_hub(np); + found_ioda = true; + } + + /* Look for p5ioc2 IO-Hubs */ + if (!found_ioda) + for_each_compatible_node(np, NULL, "ibm,p5ioc2") + pnv_pci_init_p5ioc2_hub(np); + + /* Look for ioda2 built-in PHB3's */ + for_each_compatible_node(np, NULL, "ibm,ioda2-phb") + pnv_pci_init_ioda2_phb(np); + + /* Setup the linkage between OF nodes and PHBs */ + pci_devs_phb_init(); + + /* Configure IOMMU DMA hooks */ + ppc_md.tce_build = pnv_tce_build_vm; + ppc_md.tce_free = pnv_tce_free_vm; + ppc_md.tce_build_rm = pnv_tce_build_rm; + ppc_md.tce_free_rm = pnv_tce_free_rm; + ppc_md.tce_get = pnv_tce_get; + set_pci_dma_ops(&dma_iommu_ops); + + /* Configure MSIs */ +#ifdef CONFIG_PCI_MSI + ppc_md.setup_msi_irqs = pnv_setup_msi_irqs; + ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs; +#endif +} + +machine_subsys_initcall_sync(powernv, tce_iommu_bus_notifier_init); + +struct pci_controller_ops pnv_pci_controller_ops = { + .dma_dev_setup = pnv_pci_dma_dev_setup, +}; diff --git a/kernel/arch/powerpc/platforms/powernv/pci.h b/kernel/arch/powerpc/platforms/powernv/pci.h new file mode 100644 index 000000000..070ee888f --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/pci.h @@ -0,0 +1,221 @@ +#ifndef __POWERNV_PCI_H +#define __POWERNV_PCI_H + +struct pci_dn; + +enum pnv_phb_type { + PNV_PHB_P5IOC2 = 0, + PNV_PHB_IODA1 = 1, + PNV_PHB_IODA2 = 2, +}; + +/* Precise PHB model for error management */ +enum pnv_phb_model { + PNV_PHB_MODEL_UNKNOWN, + PNV_PHB_MODEL_P5IOC2, + PNV_PHB_MODEL_P7IOC, + PNV_PHB_MODEL_PHB3, +}; + +#define PNV_PCI_DIAG_BUF_SIZE 8192 +#define PNV_IODA_PE_DEV (1 << 0) /* PE has single PCI device */ +#define PNV_IODA_PE_BUS (1 << 1) /* PE has primary PCI bus */ +#define PNV_IODA_PE_BUS_ALL (1 << 2) /* PE has subordinate buses */ +#define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */ +#define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ +#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ + +/* Data associated with a PE, including IOMMU tracking etc.. */ +struct pnv_phb; +struct pnv_ioda_pe { + unsigned long flags; + struct pnv_phb *phb; + + /* A PE can be associated with a single device or an + * entire bus (& children). In the former case, pdev + * is populated, in the later case, pbus is. + */ +#ifdef CONFIG_PCI_IOV + struct pci_dev *parent_dev; +#endif + struct pci_dev *pdev; + struct pci_bus *pbus; + + /* Effective RID (device RID for a device PE and base bus + * RID with devfn 0 for a bus PE) + */ + unsigned int rid; + + /* PE number */ + unsigned int pe_number; + + /* "Weight" assigned to the PE for the sake of DMA resource + * allocations + */ + unsigned int dma_weight; + + /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ + int tce32_seg; + int tce32_segcount; + struct iommu_table *tce32_table; + phys_addr_t tce_inval_reg_phys; + + /* 64-bit TCE bypass region */ + bool tce_bypass_enabled; + uint64_t tce_bypass_base; + + /* MSIs. MVE index is identical for for 32 and 64 bit MSI + * and -1 if not supported. (It's actually identical to the + * PE number) + */ + int mve_number; + + /* PEs in compound case */ + struct pnv_ioda_pe *master; + struct list_head slaves; + + /* Link in list of PE#s */ + struct list_head dma_link; + struct list_head list; +}; + +#define PNV_PHB_FLAG_EEH (1 << 0) + +struct pnv_phb { + struct pci_controller *hose; + enum pnv_phb_type type; + enum pnv_phb_model model; + u64 hub_id; + u64 opal_id; + int flags; + void __iomem *regs; + int initialized; + spinlock_t lock; + +#ifdef CONFIG_DEBUG_FS + int has_dbgfs; + struct dentry *dbgfs; +#endif + +#ifdef CONFIG_PCI_MSI + unsigned int msi_base; + unsigned int msi32_support; + struct msi_bitmap msi_bmp; +#endif + int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int hwirq, unsigned int virq, + unsigned int is_64, struct msi_msg *msg); + void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); + int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev, + u64 dma_mask); + u64 (*dma_get_required_mask)(struct pnv_phb *phb, + struct pci_dev *pdev); + void (*fixup_phb)(struct pci_controller *hose); + u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); + void (*shutdown)(struct pnv_phb *phb); + int (*init_m64)(struct pnv_phb *phb); + void (*reserve_m64_pe)(struct pnv_phb *phb); + int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all); + int (*get_pe_state)(struct pnv_phb *phb, int pe_no); + void (*freeze_pe)(struct pnv_phb *phb, int pe_no); + int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt); + + union { + struct { + struct iommu_table iommu_table; + } p5ioc2; + + struct { + /* Global bridge info */ + unsigned int total_pe; + unsigned int reserved_pe; + + /* 32-bit MMIO window */ + unsigned int m32_size; + unsigned int m32_segsize; + unsigned int m32_pci_base; + + /* 64-bit MMIO window */ + unsigned int m64_bar_idx; + unsigned long m64_size; + unsigned long m64_segsize; + unsigned long m64_base; + unsigned long m64_bar_alloc; + + /* IO ports */ + unsigned int io_size; + unsigned int io_segsize; + unsigned int io_pci_base; + + /* PE allocation bitmap */ + unsigned long *pe_alloc; + /* PE allocation mutex */ + struct mutex pe_alloc_mutex; + + /* M32 & IO segment maps */ + unsigned int *m32_segmap; + unsigned int *io_segmap; + struct pnv_ioda_pe *pe_array; + + /* IRQ chip */ + int irq_chip_init; + struct irq_chip irq_chip; + + /* Sorted list of used PE's based + * on the sequence of creation + */ + struct list_head pe_list; + struct mutex pe_list_mutex; + + /* Reverse map of PEs, will have to extend if + * we are to support more than 256 PEs, indexed + * bus { bus, devfn } + */ + unsigned char pe_rmap[0x10000]; + + /* 32-bit TCE tables allocation */ + unsigned long tce32_count; + + /* Total "weight" for the sake of DMA resources + * allocation + */ + unsigned int dma_weight; + unsigned int dma_pe_count; + + /* Sorted list of used PE's, sorted at + * boot for resource allocation purposes + */ + struct list_head pe_dma_list; + } ioda; + }; + + /* PHB and hub status structure */ + union { + unsigned char blob[PNV_PCI_DIAG_BUF_SIZE]; + struct OpalIoP7IOCPhbErrorData p7ioc; + struct OpalIoPhb3ErrorData phb3; + struct OpalIoP7IOCErrorData hub_diag; + } diag; + +}; + +extern struct pci_ops pnv_pci_ops; + +void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, + unsigned char *log_buff); +int pnv_pci_cfg_read(struct pci_dn *pdn, + int where, int size, u32 *val); +int pnv_pci_cfg_write(struct pci_dn *pdn, + int where, int size, u32 val); +extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, + void *tce_mem, u64 tce_size, + u64 dma_offset, unsigned page_shift); +extern void pnv_pci_init_p5ioc2_hub(struct device_node *np); +extern void pnv_pci_init_ioda_hub(struct device_node *np); +extern void pnv_pci_init_ioda2_phb(struct device_node *np); +extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, + __be64 *startp, __be64 *endp, bool rm); +extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); +extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); + +#endif /* __POWERNV_PCI_H */ diff --git a/kernel/arch/powerpc/platforms/powernv/powernv.h b/kernel/arch/powerpc/platforms/powernv/powernv.h new file mode 100644 index 000000000..826d2c9be --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/powernv.h @@ -0,0 +1,40 @@ +#ifndef _POWERNV_H +#define _POWERNV_H + +#ifdef CONFIG_SMP +extern void pnv_smp_init(void); +#else +static inline void pnv_smp_init(void) { } +#endif + +struct pci_dev; + +#ifdef CONFIG_PCI +extern void pnv_pci_init(void); +extern void pnv_pci_shutdown(void); +extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask); +extern u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev); +#else +static inline void pnv_pci_init(void) { } +static inline void pnv_pci_shutdown(void) { } + +static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +{ + return -ENODEV; +} + +static inline u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev) +{ + return 0; +} +#endif + +extern struct pci_controller_ops pnv_pci_controller_ops; + +extern u32 pnv_get_supported_cpuidle_states(void); + +extern void pnv_lpc_init(void); + +bool cpu_core_split_required(void); + +#endif /* _POWERNV_H */ diff --git a/kernel/arch/powerpc/platforms/powernv/rng.c b/kernel/arch/powerpc/platforms/powernv/rng.c new file mode 100644 index 000000000..6eb808ff6 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/rng.c @@ -0,0 +1,155 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "powernv-rng: " fmt + +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_platform.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <asm/archrandom.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/smp.h> + + +struct powernv_rng { + void __iomem *regs; + void __iomem *regs_real; + unsigned long mask; +}; + +static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng); + + +int powernv_hwrng_present(void) +{ + struct powernv_rng *rng; + + rng = get_cpu_var(powernv_rng); + put_cpu_var(rng); + return rng != NULL; +} + +static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) +{ + unsigned long parity; + + /* Calculate the parity of the value */ + asm ("popcntd %0,%1" : "=r" (parity) : "r" (val)); + + /* xor our value with the previous mask */ + val ^= rng->mask; + + /* update the mask based on the parity of this value */ + rng->mask = (rng->mask << 1) | (parity & 1); + + return val; +} + +int powernv_get_random_real_mode(unsigned long *v) +{ + struct powernv_rng *rng; + + rng = raw_cpu_read(powernv_rng); + + *v = rng_whiten(rng, in_rm64(rng->regs_real)); + + return 1; +} + +int powernv_get_random_long(unsigned long *v) +{ + struct powernv_rng *rng; + + rng = get_cpu_var(powernv_rng); + + *v = rng_whiten(rng, in_be64(rng->regs)); + + put_cpu_var(rng); + + return 1; +} +EXPORT_SYMBOL_GPL(powernv_get_random_long); + +static __init void rng_init_per_cpu(struct powernv_rng *rng, + struct device_node *dn) +{ + int chip_id, cpu; + + chip_id = of_get_ibm_chip_id(dn); + if (chip_id == -1) + pr_warn("No ibm,chip-id found for %s.\n", dn->full_name); + + for_each_possible_cpu(cpu) { + if (per_cpu(powernv_rng, cpu) == NULL || + cpu_to_chip_id(cpu) == chip_id) { + per_cpu(powernv_rng, cpu) = rng; + } + } +} + +static __init int rng_create(struct device_node *dn) +{ + struct powernv_rng *rng; + struct resource res; + unsigned long val; + + rng = kzalloc(sizeof(*rng), GFP_KERNEL); + if (!rng) + return -ENOMEM; + + if (of_address_to_resource(dn, 0, &res)) { + kfree(rng); + return -ENXIO; + } + + rng->regs_real = (void __iomem *)res.start; + + rng->regs = of_iomap(dn, 0); + if (!rng->regs) { + kfree(rng); + return -ENXIO; + } + + val = in_be64(rng->regs); + rng->mask = val; + + rng_init_per_cpu(rng, dn); + + pr_info_once("Registering arch random hook.\n"); + + ppc_md.get_random_long = powernv_get_random_long; + + return 0; +} + +static __init int rng_init(void) +{ + struct device_node *dn; + int rc; + + for_each_compatible_node(dn, NULL, "ibm,power-rng") { + rc = rng_create(dn); + if (rc) { + pr_err("Failed creating rng for %s (%d).\n", + dn->full_name, rc); + continue; + } + + /* Create devices for hwrng driver */ + of_platform_device_create(dn, NULL, NULL); + } + + return 0; +} +machine_subsys_initcall(powernv, rng_init); diff --git a/kernel/arch/powerpc/platforms/powernv/setup.c b/kernel/arch/powerpc/platforms/powernv/setup.c new file mode 100644 index 000000000..16fdcb23f --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/setup.c @@ -0,0 +1,503 @@ +/* + * PowerNV setup code. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG + +#include <linux/cpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/tty.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/console.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/seq_file.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/interrupt.h> +#include <linux/bug.h> +#include <linux/pci.h> +#include <linux/cpufreq.h> + +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/xics.h> +#include <asm/opal.h> +#include <asm/kexec.h> +#include <asm/smp.h> +#include <asm/cputhreads.h> +#include <asm/cpuidle.h> +#include <asm/code-patching.h> + +#include "powernv.h" +#include "subcore.h" + +static void __init pnv_setup_arch(void) +{ + set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); + + /* Initialize SMP */ + pnv_smp_init(); + + /* Setup PCI */ + pnv_pci_init(); + + /* Setup RTC and NVRAM callbacks */ + if (firmware_has_feature(FW_FEATURE_OPAL)) + opal_nvram_init(); + + /* Enable NAP mode */ + powersave_nap = 1; + + /* XXX PMCS */ +} + +static void __init pnv_init_early(void) +{ + /* + * Initialize the LPC bus now so that legacy serial + * ports can be found on it + */ + opal_lpc_init(); + +#ifdef CONFIG_HVC_OPAL + if (firmware_has_feature(FW_FEATURE_OPAL)) + hvc_opal_init_early(); + else +#endif + add_preferred_console("hvc", 0, NULL); +} + +static void __init pnv_init_IRQ(void) +{ + xics_init(); + + WARN_ON(!ppc_md.get_irq); +} + +static void pnv_show_cpuinfo(struct seq_file *m) +{ + struct device_node *root; + const char *model = ""; + + root = of_find_node_by_path("/"); + if (root) + model = of_get_property(root, "model", NULL); + seq_printf(m, "machine\t\t: PowerNV %s\n", model); + if (firmware_has_feature(FW_FEATURE_OPALv3)) + seq_printf(m, "firmware\t: OPAL v3\n"); + else if (firmware_has_feature(FW_FEATURE_OPALv2)) + seq_printf(m, "firmware\t: OPAL v2\n"); + else if (firmware_has_feature(FW_FEATURE_OPAL)) + seq_printf(m, "firmware\t: OPAL v1\n"); + else + seq_printf(m, "firmware\t: BML\n"); + of_node_put(root); +} + +static void pnv_prepare_going_down(void) +{ + /* + * Disable all notifiers from OPAL, we can't + * service interrupts anymore anyway + */ + opal_notifier_disable(); + + /* Soft disable interrupts */ + local_irq_disable(); + + /* + * Return secondary CPUs to firwmare if a flash update + * is pending otherwise we will get all sort of error + * messages about CPU being stuck etc.. This will also + * have the side effect of hard disabling interrupts so + * past this point, the kernel is effectively dead. + */ + opal_flash_term_callback(); +} + +static void __noreturn pnv_restart(char *cmd) +{ + long rc = OPAL_BUSY; + + pnv_prepare_going_down(); + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_cec_reboot(); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + else + mdelay(10); + } + for (;;) + opal_poll_events(NULL); +} + +static void __noreturn pnv_power_off(void) +{ + long rc = OPAL_BUSY; + + pnv_prepare_going_down(); + + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_cec_power_down(0); + if (rc == OPAL_BUSY_EVENT) + opal_poll_events(NULL); + else + mdelay(10); + } + for (;;) + opal_poll_events(NULL); +} + +static void __noreturn pnv_halt(void) +{ + pnv_power_off(); +} + +static void pnv_progress(char *s, unsigned short hex) +{ +} + +static int pnv_dma_set_mask(struct device *dev, u64 dma_mask) +{ + if (dev_is_pci(dev)) + return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask); + return __dma_set_mask(dev, dma_mask); +} + +static u64 pnv_dma_get_required_mask(struct device *dev) +{ + if (dev_is_pci(dev)) + return pnv_pci_dma_get_required_mask(to_pci_dev(dev)); + + return __dma_get_required_mask(dev); +} + +static void pnv_shutdown(void) +{ + /* Let the PCI code clear up IODA tables */ + pnv_pci_shutdown(); + + /* + * Stop OPAL activity: Unregister all OPAL interrupts so they + * don't fire up while we kexec and make sure all potentially + * DMA'ing ops are complete (such as dump retrieval). + */ + opal_shutdown(); +} + +#ifdef CONFIG_KEXEC +static void pnv_kexec_wait_secondaries_down(void) +{ + int my_cpu, i, notified = -1; + + my_cpu = get_cpu(); + + for_each_online_cpu(i) { + uint8_t status; + int64_t rc; + + if (i == my_cpu) + continue; + + for (;;) { + rc = opal_query_cpu_status(get_hard_smp_processor_id(i), + &status); + if (rc != OPAL_SUCCESS || status != OPAL_THREAD_STARTED) + break; + barrier(); + if (i != notified) { + printk(KERN_INFO "kexec: waiting for cpu %d " + "(physical %d) to enter OPAL\n", + i, paca[i].hw_cpu_id); + notified = i; + } + } + } +} + +static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) +{ + xics_kexec_teardown_cpu(secondary); + + /* On OPAL v3, we return all CPUs to firmware */ + + if (!firmware_has_feature(FW_FEATURE_OPALv3)) + return; + + if (secondary) { + /* Return secondary CPUs to firmware on OPAL v3 */ + mb(); + get_paca()->kexec_state = KEXEC_STATE_REAL_MODE; + mb(); + + /* Return the CPU to OPAL */ + opal_return_cpu(); + } else if (crash_shutdown) { + /* + * On crash, we don't wait for secondaries to go + * down as they might be unreachable or hung, so + * instead we just wait a bit and move on. + */ + mdelay(1); + } else { + /* Primary waits for the secondaries to have reached OPAL */ + pnv_kexec_wait_secondaries_down(); + } +} +#endif /* CONFIG_KEXEC */ + +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +static unsigned long pnv_memory_block_size(void) +{ + return 256UL * 1024 * 1024; +} +#endif + +static void __init pnv_setup_machdep_opal(void) +{ + ppc_md.get_boot_time = opal_get_boot_time; + ppc_md.restart = pnv_restart; + pm_power_off = pnv_power_off; + ppc_md.halt = pnv_halt; + ppc_md.machine_check_exception = opal_machine_check; + ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; + ppc_md.hmi_exception_early = opal_hmi_exception_early; + ppc_md.handle_hmi_exception = opal_handle_hmi_exception; +} + +static u32 supported_cpuidle_states; + +int pnv_save_sprs_for_winkle(void) +{ + int cpu; + int rc; + + /* + * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric accross + * all cpus at boot. Get these reg values of current cpu and use the + * same accross all cpus. + */ + uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + uint64_t hid0_val = mfspr(SPRN_HID0); + uint64_t hid1_val = mfspr(SPRN_HID1); + uint64_t hid4_val = mfspr(SPRN_HID4); + uint64_t hid5_val = mfspr(SPRN_HID5); + uint64_t hmeer_val = mfspr(SPRN_HMEER); + + for_each_possible_cpu(cpu) { + uint64_t pir = get_hard_smp_processor_id(cpu); + uint64_t hsprg0_val = (uint64_t)&paca[cpu]; + + /* + * HSPRG0 is used to store the cpu's pointer to paca. Hence last + * 3 bits are guaranteed to be 0. Program slw to restore HSPRG0 + * with 63rd bit set, so that when a thread wakes up at 0x100 we + * can use this bit to distinguish between fastsleep and + * deep winkle. + */ + hsprg0_val |= 1; + + rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); + if (rc != 0) + return rc; + + /* HIDs are per core registers */ + if (cpu_thread_in_core(cpu) == 0) { + + rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val); + if (rc != 0) + return rc; + } + } + + return 0; +} + +static void pnv_alloc_idle_core_states(void) +{ + int i, j; + int nr_cores = cpu_nr_cores(); + u32 *core_idle_state; + + /* + * core_idle_state - First 8 bits track the idle state of each thread + * of the core. The 8th bit is the lock bit. Initially all thread bits + * are set. They are cleared when the thread enters deep idle state + * like sleep and winkle. Initially the lock bit is cleared. + * The lock bit has 2 purposes + * a. While the first thread is restoring core state, it prevents + * other threads in the core from switching to process context. + * b. While the last thread in the core is saving the core state, it + * prevents a different thread from waking up. + */ + for (i = 0; i < nr_cores; i++) { + int first_cpu = i * threads_per_core; + int node = cpu_to_node(first_cpu); + + core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); + *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; + + for (j = 0; j < threads_per_core; j++) { + int cpu = first_cpu + j; + + paca[cpu].core_idle_state_ptr = core_idle_state; + paca[cpu].thread_idle_state = PNV_THREAD_RUNNING; + paca[cpu].thread_mask = 1 << j; + } + } + + update_subcore_sibling_mask(); + + if (supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) + pnv_save_sprs_for_winkle(); +} + +u32 pnv_get_supported_cpuidle_states(void) +{ + return supported_cpuidle_states; +} +EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); + +static int __init pnv_init_idle_states(void) +{ + struct device_node *power_mgt; + int dt_idle_states; + u32 *flags; + int i; + + supported_cpuidle_states = 0; + + if (cpuidle_disable != IDLE_NO_OVERRIDE) + goto out; + + if (!firmware_has_feature(FW_FEATURE_OPALv3)) + goto out; + + power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); + if (!power_mgt) { + pr_warn("opal: PowerMgmt Node not found\n"); + goto out; + } + dt_idle_states = of_property_count_u32_elems(power_mgt, + "ibm,cpu-idle-state-flags"); + if (dt_idle_states < 0) { + pr_warn("cpuidle-powernv: no idle states found in the DT\n"); + goto out; + } + + flags = kzalloc(sizeof(*flags) * dt_idle_states, GFP_KERNEL); + if (of_property_read_u32_array(power_mgt, + "ibm,cpu-idle-state-flags", flags, dt_idle_states)) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n"); + goto out_free; + } + + for (i = 0; i < dt_idle_states; i++) + supported_cpuidle_states |= flags[i]; + + if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { + patch_instruction( + (unsigned int *)pnv_fastsleep_workaround_at_entry, + PPC_INST_NOP); + patch_instruction( + (unsigned int *)pnv_fastsleep_workaround_at_exit, + PPC_INST_NOP); + } + pnv_alloc_idle_core_states(); +out_free: + kfree(flags); +out: + return 0; +} + +subsys_initcall(pnv_init_idle_states); + +static int __init pnv_probe(void) +{ + unsigned long root = of_get_flat_dt_root(); + + if (!of_flat_dt_is_compatible(root, "ibm,powernv")) + return 0; + + hpte_init_native(); + + if (firmware_has_feature(FW_FEATURE_OPAL)) + pnv_setup_machdep_opal(); + + pr_debug("PowerNV detected !\n"); + + return 1; +} + +/* + * Returns the cpu frequency for 'cpu' in Hz. This is used by + * /proc/cpuinfo + */ +static unsigned long pnv_get_proc_freq(unsigned int cpu) +{ + unsigned long ret_freq; + + ret_freq = cpufreq_quick_get(cpu) * 1000ul; + + /* + * If the backend cpufreq driver does not exist, + * then fallback to old way of reporting the clockrate. + */ + if (!ret_freq) + ret_freq = ppc_proc_freq; + return ret_freq; +} + +define_machine(powernv) { + .name = "PowerNV", + .probe = pnv_probe, + .init_early = pnv_init_early, + .setup_arch = pnv_setup_arch, + .init_IRQ = pnv_init_IRQ, + .show_cpuinfo = pnv_show_cpuinfo, + .get_proc_freq = pnv_get_proc_freq, + .progress = pnv_progress, + .machine_shutdown = pnv_shutdown, + .power_save = power7_idle, + .calibrate_decr = generic_calibrate_decr, + .dma_set_mask = pnv_dma_set_mask, + .dma_get_required_mask = pnv_dma_get_required_mask, +#ifdef CONFIG_KEXEC + .kexec_cpu_down = pnv_kexec_cpu_down, +#endif +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE + .memory_block_size = pnv_memory_block_size, +#endif +}; diff --git a/kernel/arch/powerpc/platforms/powernv/smp.c b/kernel/arch/powerpc/platforms/powernv/smp.c new file mode 100644 index 000000000..8f70ba681 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/smp.c @@ -0,0 +1,256 @@ +/* + * SMP support for PowerNV machines. + * + * Copyright 2011 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/cpu.h> + +#include <asm/irq.h> +#include <asm/smp.h> +#include <asm/paca.h> +#include <asm/machdep.h> +#include <asm/cputable.h> +#include <asm/firmware.h> +#include <asm/vdso_datapage.h> +#include <asm/cputhreads.h> +#include <asm/xics.h> +#include <asm/opal.h> +#include <asm/runlatch.h> +#include <asm/code-patching.h> +#include <asm/dbell.h> +#include <asm/kvm_ppc.h> +#include <asm/ppc-opcode.h> + +#include "powernv.h" + +#ifdef DEBUG +#include <asm/udbg.h> +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +static void pnv_smp_setup_cpu(int cpu) +{ + if (cpu != boot_cpuid) + xics_setup_cpu(); + +#ifdef CONFIG_PPC_DOORBELL + if (cpu_has_feature(CPU_FTR_DBELL)) + doorbell_setup_this_cpu(); +#endif +} + +static int pnv_smp_kick_cpu(int nr) +{ + unsigned int pcpu = get_hard_smp_processor_id(nr); + unsigned long start_here = + __pa(ppc_function_entry(generic_secondary_smp_init)); + long rc; + + BUG_ON(nr < 0 || nr >= NR_CPUS); + + /* + * If we already started or OPALv2 is not supported, we just + * kick the CPU via the PACA + */ + if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPALv2)) + goto kick; + + /* + * At this point, the CPU can either be spinning on the way in + * from kexec or be inside OPAL waiting to be started for the + * first time. OPAL v3 allows us to query OPAL to know if it + * has the CPUs, so we do that + */ + if (firmware_has_feature(FW_FEATURE_OPALv3)) { + uint8_t status; + + rc = opal_query_cpu_status(pcpu, &status); + if (rc != OPAL_SUCCESS) { + pr_warn("OPAL Error %ld querying CPU %d state\n", + rc, nr); + return -ENODEV; + } + + /* + * Already started, just kick it, probably coming from + * kexec and spinning + */ + if (status == OPAL_THREAD_STARTED) + goto kick; + + /* + * Available/inactive, let's kick it + */ + if (status == OPAL_THREAD_INACTIVE) { + pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", + nr, pcpu); + rc = opal_start_cpu(pcpu, start_here); + if (rc != OPAL_SUCCESS) { + pr_warn("OPAL Error %ld starting CPU %d\n", + rc, nr); + return -ENODEV; + } + } else { + /* + * An unavailable CPU (or any other unknown status) + * shouldn't be started. It should also + * not be in the possible map but currently it can + * happen + */ + pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable" + " (status %d)...\n", nr, pcpu, status); + return -ENODEV; + } + } else { + /* + * On OPAL v2, we just kick it and hope for the best, + * we must not test the error from opal_start_cpu() or + * we would fail to get CPUs from kexec. + */ + opal_start_cpu(pcpu, start_here); + } + kick: + return smp_generic_kick_cpu(nr); +} + +#ifdef CONFIG_HOTPLUG_CPU + +static int pnv_smp_cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* This is identical to pSeries... might consolidate by + * moving migrate_irqs_away to a ppc_md with default to + * the generic fixup_irqs. --BenH. + */ + set_cpu_online(cpu, false); + vdso_data->processorCount--; + if (cpu == boot_cpuid) + boot_cpuid = cpumask_any(cpu_online_mask); + xics_migrate_irqs_away(); + return 0; +} + +static void pnv_smp_cpu_kill_self(void) +{ + unsigned int cpu; + unsigned long srr1, wmask; + u32 idle_states; + + /* Standard hot unplug procedure */ + local_irq_disable(); + idle_task_exit(); + current->active_mm = NULL; /* for sanity */ + cpu = smp_processor_id(); + DBG("CPU%d offline\n", cpu); + generic_set_cpu_dead(cpu); + smp_wmb(); + + wmask = SRR1_WAKEMASK; + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + wmask = SRR1_WAKEMASK_P8; + + idle_states = pnv_get_supported_cpuidle_states(); + /* We don't want to take decrementer interrupts while we are offline, + * so clear LPCR:PECE1. We keep PECE2 enabled. + */ + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); + while (!generic_check_cpu_restart(cpu)) { + + ppc64_runlatch_off(); + + if (idle_states & OPAL_PM_WINKLE_ENABLED) + srr1 = power7_winkle(); + else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || + (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) + srr1 = power7_sleep(); + else + srr1 = power7_nap(1); + + ppc64_runlatch_on(); + + /* + * If the SRR1 value indicates that we woke up due to + * an external interrupt, then clear the interrupt. + * We clear the interrupt before checking for the + * reason, so as to avoid a race where we wake up for + * some other reason, find nothing and clear the interrupt + * just as some other cpu is sending us an interrupt. + * If we returned from power7_nap as a result of + * having finished executing in a KVM guest, then srr1 + * contains 0. + */ + if ((srr1 & wmask) == SRR1_WAKEEE) { + icp_native_flush_interrupt(); + local_paca->irq_happened &= PACA_IRQ_HARD_DIS; + smp_mb(); + } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) { + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + asm volatile(PPC_MSGCLR(%0) : : "r" (msg)); + kvmppc_set_host_ipi(cpu, 0); + } + + if (cpu_core_split_required()) + continue; + + if (!generic_check_cpu_restart(cpu)) + DBG("CPU%d Unexpected exit while offline !\n", cpu); + } + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1); + DBG("CPU%d coming online...\n", cpu); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +static int pnv_cpu_bootable(unsigned int nr) +{ + /* + * Starting with POWER8, the subcore logic relies on all threads of a + * core being booted so that they can participate in split mode + * switches. So on those machines we ignore the smt_enabled_at_boot + * setting (smt-enabled on the kernel command line). + */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + return 1; + + return smp_generic_cpu_bootable(nr); +} + +static struct smp_ops_t pnv_smp_ops = { + .message_pass = smp_muxed_ipi_message_pass, + .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ + .probe = xics_smp_probe, + .kick_cpu = pnv_smp_kick_cpu, + .setup_cpu = pnv_smp_setup_cpu, + .cpu_bootable = pnv_cpu_bootable, +#ifdef CONFIG_HOTPLUG_CPU + .cpu_disable = pnv_smp_cpu_disable, + .cpu_die = generic_cpu_die, +#endif /* CONFIG_HOTPLUG_CPU */ +}; + +/* This is called very early during platform setup_arch */ +void __init pnv_smp_init(void) +{ + smp_ops = &pnv_smp_ops; + +#ifdef CONFIG_HOTPLUG_CPU + ppc_md.cpu_die = pnv_smp_cpu_kill_self; +#endif +} diff --git a/kernel/arch/powerpc/platforms/powernv/subcore-asm.S b/kernel/arch/powerpc/platforms/powernv/subcore-asm.S new file mode 100644 index 000000000..39bb24aa8 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/subcore-asm.S @@ -0,0 +1,95 @@ +/* + * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/asm-offsets.h> +#include <asm/ppc_asm.h> +#include <asm/reg.h> + +#include "subcore.h" + + +_GLOBAL(split_core_secondary_loop) + /* + * r3 = u8 *state, used throughout the routine + * r4 = temp + * r5 = temp + * .. + * r12 = MSR + */ + mfmsr r12 + + /* Disable interrupts so SRR0/1 don't get trashed */ + li r4,0 + ori r4,r4,MSR_EE|MSR_SE|MSR_BE|MSR_RI + andc r4,r12,r4 + sync + mtmsrd r4 + + /* Switch to real mode and leave interrupts off */ + li r5, MSR_IR|MSR_DR + andc r5, r4, r5 + + LOAD_REG_ADDR(r4, real_mode) + + mtspr SPRN_SRR0,r4 + mtspr SPRN_SRR1,r5 + rfid + b . /* prevent speculative execution */ + +real_mode: + /* Grab values from unsplit SPRs */ + mfspr r6, SPRN_LDBAR + mfspr r7, SPRN_PMMAR + mfspr r8, SPRN_PMCR + mfspr r9, SPRN_RPR + mfspr r10, SPRN_SDR1 + + /* Order reading the SPRs vs telling the primary we are ready to split */ + sync + + /* Tell thread 0 we are in real mode */ + li r4, SYNC_STEP_REAL_MODE + stb r4, 0(r3) + + li r5, (HID0_POWER8_4LPARMODE | HID0_POWER8_2LPARMODE)@highest + sldi r5, r5, 48 + + /* Loop until we see the split happen in HID0 */ +1: mfspr r4, SPRN_HID0 + and. r4, r4, r5 + beq 1b + + /* + * We only need to initialise the below regs once for each subcore, + * but it's simpler and harmless to do it on each thread. + */ + + /* Make sure various SPRS have sane values */ + li r4, 0 + mtspr SPRN_LPID, r4 + mtspr SPRN_PCR, r4 + mtspr SPRN_HDEC, r4 + + /* Restore SPR values now we are split */ + mtspr SPRN_LDBAR, r6 + mtspr SPRN_PMMAR, r7 + mtspr SPRN_PMCR, r8 + mtspr SPRN_RPR, r9 + mtspr SPRN_SDR1, r10 + + LOAD_REG_ADDR(r5, virtual_mode) + + /* Get out of real mode */ + mtspr SPRN_SRR0,r5 + mtspr SPRN_SRR1,r12 + rfid + b . /* prevent speculative execution */ + +virtual_mode: + blr diff --git a/kernel/arch/powerpc/platforms/powernv/subcore.c b/kernel/arch/powerpc/platforms/powernv/subcore.c new file mode 100644 index 000000000..f60f80ada --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/subcore.c @@ -0,0 +1,427 @@ +/* + * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "powernv: " fmt + +#include <linux/kernel.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/device.h> +#include <linux/gfp.h> +#include <linux/smp.h> +#include <linux/stop_machine.h> + +#include <asm/cputhreads.h> +#include <asm/kvm_ppc.h> +#include <asm/machdep.h> +#include <asm/opal.h> +#include <asm/smp.h> + +#include "subcore.h" +#include "powernv.h" + + +/* + * Split/unsplit procedure: + * + * A core can be in one of three states, unsplit, 2-way split, and 4-way split. + * + * The mapping to subcores_per_core is simple: + * + * State | subcores_per_core + * ------------|------------------ + * Unsplit | 1 + * 2-way split | 2 + * 4-way split | 4 + * + * The core is split along thread boundaries, the mapping between subcores and + * threads is as follows: + * + * Unsplit: + * ---------------------------- + * Subcore | 0 | + * ---------------------------- + * Thread | 0 1 2 3 4 5 6 7 | + * ---------------------------- + * + * 2-way split: + * ------------------------------------- + * Subcore | 0 | 1 | + * ------------------------------------- + * Thread | 0 1 2 3 | 4 5 6 7 | + * ------------------------------------- + * + * 4-way split: + * ----------------------------------------- + * Subcore | 0 | 1 | 2 | 3 | + * ----------------------------------------- + * Thread | 0 1 | 2 3 | 4 5 | 6 7 | + * ----------------------------------------- + * + * + * Transitions + * ----------- + * + * It is not possible to transition between either of the split states, the + * core must first be unsplit. The legal transitions are: + * + * ----------- --------------- + * | | <----> | 2-way split | + * | | --------------- + * | Unsplit | + * | | --------------- + * | | <----> | 4-way split | + * ----------- --------------- + * + * Unsplitting + * ----------- + * + * Unsplitting is the simpler procedure. It requires thread 0 to request the + * unsplit while all other threads NAP. + * + * Thread 0 clears HID0_POWER8_DYNLPARDIS (Dynamic LPAR Disable). This tells + * the hardware that if all threads except 0 are napping, the hardware should + * unsplit the core. + * + * Non-zero threads are sent to a NAP loop, they don't exit the loop until they + * see the core unsplit. + * + * Core 0 spins waiting for the hardware to see all the other threads napping + * and perform the unsplit. + * + * Once thread 0 sees the unsplit, it IPIs the secondary threads to wake them + * out of NAP. They will then see the core unsplit and exit the NAP loop. + * + * Splitting + * --------- + * + * The basic splitting procedure is fairly straight forward. However it is + * complicated by the fact that after the split occurs, the newly created + * subcores are not in a fully initialised state. + * + * Most notably the subcores do not have the correct value for SDR1, which + * means they must not be running in virtual mode when the split occurs. The + * subcores have separate timebases SPRs but these are pre-synchronised by + * opal. + * + * To begin with secondary threads are sent to an assembly routine. There they + * switch to real mode, so they are immune to the uninitialised SDR1 value. + * Once in real mode they indicate that they are in real mode, and spin waiting + * to see the core split. + * + * Thread 0 waits to see that all secondaries are in real mode, and then begins + * the splitting procedure. It firstly sets HID0_POWER8_DYNLPARDIS, which + * prevents the hardware from unsplitting. Then it sets the appropriate HID bit + * to request the split, and spins waiting to see that the split has happened. + * + * Concurrently the secondaries will notice the split. When they do they set up + * their SPRs, notably SDR1, and then they can return to virtual mode and exit + * the procedure. + */ + +/* Initialised at boot by subcore_init() */ +static int subcores_per_core; + +/* + * Used to communicate to offline cpus that we want them to pop out of the + * offline loop and do a split or unsplit. + * + * 0 - no split happening + * 1 - unsplit in progress + * 2 - split to 2 in progress + * 4 - split to 4 in progress + */ +static int new_split_mode; + +static cpumask_var_t cpu_offline_mask; + +struct split_state { + u8 step; + u8 master; +}; + +static DEFINE_PER_CPU(struct split_state, split_state); + +static void wait_for_sync_step(int step) +{ + int i, cpu = smp_processor_id(); + + for (i = cpu + 1; i < cpu + threads_per_core; i++) + while(per_cpu(split_state, i).step < step) + barrier(); + + /* Order the wait loop vs any subsequent loads/stores. */ + mb(); +} + +static void update_hid_in_slw(u64 hid0) +{ + u64 idle_states = pnv_get_supported_cpuidle_states(); + + if (idle_states & OPAL_PM_WINKLE_ENABLED) { + /* OPAL call to patch slw with the new HID0 value */ + u64 cpu_pir = hard_smp_processor_id(); + + opal_slw_set_reg(cpu_pir, SPRN_HID0, hid0); + } +} + +static void unsplit_core(void) +{ + u64 hid0, mask; + int i, cpu; + + mask = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE; + + cpu = smp_processor_id(); + if (cpu_thread_in_core(cpu) != 0) { + while (mfspr(SPRN_HID0) & mask) + power7_nap(0); + + per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT; + return; + } + + hid0 = mfspr(SPRN_HID0); + hid0 &= ~HID0_POWER8_DYNLPARDIS; + mtspr(SPRN_HID0, hid0); + update_hid_in_slw(hid0); + + while (mfspr(SPRN_HID0) & mask) + cpu_relax(); + + /* Wake secondaries out of NAP */ + for (i = cpu + 1; i < cpu + threads_per_core; i++) + smp_send_reschedule(i); + + wait_for_sync_step(SYNC_STEP_UNSPLIT); +} + +static void split_core(int new_mode) +{ + struct { u64 value; u64 mask; } split_parms[2] = { + { HID0_POWER8_1TO2LPAR, HID0_POWER8_2LPARMODE }, + { HID0_POWER8_1TO4LPAR, HID0_POWER8_4LPARMODE } + }; + int i, cpu; + u64 hid0; + + /* Convert new_mode (2 or 4) into an index into our parms array */ + i = (new_mode >> 1) - 1; + BUG_ON(i < 0 || i > 1); + + cpu = smp_processor_id(); + if (cpu_thread_in_core(cpu) != 0) { + split_core_secondary_loop(&per_cpu(split_state, cpu).step); + return; + } + + wait_for_sync_step(SYNC_STEP_REAL_MODE); + + /* Write new mode */ + hid0 = mfspr(SPRN_HID0); + hid0 |= HID0_POWER8_DYNLPARDIS | split_parms[i].value; + mtspr(SPRN_HID0, hid0); + update_hid_in_slw(hid0); + + /* Wait for it to happen */ + while (!(mfspr(SPRN_HID0) & split_parms[i].mask)) + cpu_relax(); +} + +static void cpu_do_split(int new_mode) +{ + /* + * At boot subcores_per_core will be 0, so we will always unsplit at + * boot. In the usual case where the core is already unsplit it's a + * nop, and this just ensures the kernel's notion of the mode is + * consistent with the hardware. + */ + if (subcores_per_core != 1) + unsplit_core(); + + if (new_mode != 1) + split_core(new_mode); + + mb(); + per_cpu(split_state, smp_processor_id()).step = SYNC_STEP_FINISHED; +} + +bool cpu_core_split_required(void) +{ + smp_rmb(); + + if (!new_split_mode) + return false; + + cpu_do_split(new_split_mode); + + return true; +} + +void update_subcore_sibling_mask(void) +{ + int cpu; + /* + * sibling mask for the first cpu. Left shift this by required bits + * to get sibling mask for the rest of the cpus. + */ + int sibling_mask_first_cpu = (1 << threads_per_subcore) - 1; + + for_each_possible_cpu(cpu) { + int tid = cpu_thread_in_core(cpu); + int offset = (tid / threads_per_subcore) * threads_per_subcore; + int mask = sibling_mask_first_cpu << offset; + + paca[cpu].subcore_sibling_mask = mask; + + } +} + +static int cpu_update_split_mode(void *data) +{ + int cpu, new_mode = *(int *)data; + + if (this_cpu_ptr(&split_state)->master) { + new_split_mode = new_mode; + smp_wmb(); + + cpumask_andnot(cpu_offline_mask, cpu_present_mask, + cpu_online_mask); + + /* This should work even though the cpu is offline */ + for_each_cpu(cpu, cpu_offline_mask) + smp_send_reschedule(cpu); + } + + cpu_do_split(new_mode); + + if (this_cpu_ptr(&split_state)->master) { + /* Wait for all cpus to finish before we touch subcores_per_core */ + for_each_present_cpu(cpu) { + if (cpu >= setup_max_cpus) + break; + + while(per_cpu(split_state, cpu).step < SYNC_STEP_FINISHED) + barrier(); + } + + new_split_mode = 0; + + /* Make the new mode public */ + subcores_per_core = new_mode; + threads_per_subcore = threads_per_core / subcores_per_core; + update_subcore_sibling_mask(); + + /* Make sure the new mode is written before we exit */ + mb(); + } + + return 0; +} + +static int set_subcores_per_core(int new_mode) +{ + struct split_state *state; + int cpu; + + if (kvm_hv_mode_active()) { + pr_err("Unable to change split core mode while KVM active.\n"); + return -EBUSY; + } + + /* + * We are only called at boot, or from the sysfs write. If that ever + * changes we'll need a lock here. + */ + BUG_ON(new_mode < 1 || new_mode > 4 || new_mode == 3); + + for_each_present_cpu(cpu) { + state = &per_cpu(split_state, cpu); + state->step = SYNC_STEP_INITIAL; + state->master = 0; + } + + get_online_cpus(); + + /* This cpu will update the globals before exiting stop machine */ + this_cpu_ptr(&split_state)->master = 1; + + /* Ensure state is consistent before we call the other cpus */ + mb(); + + stop_machine(cpu_update_split_mode, &new_mode, cpu_online_mask); + + put_online_cpus(); + + return 0; +} + +static ssize_t __used store_subcores_per_core(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + unsigned long val; + int rc; + + /* We are serialised by the attribute lock */ + + rc = sscanf(buf, "%lx", &val); + if (rc != 1) + return -EINVAL; + + switch (val) { + case 1: + case 2: + case 4: + if (subcores_per_core == val) + /* Nothing to do */ + goto out; + break; + default: + return -EINVAL; + } + + rc = set_subcores_per_core(val); + if (rc) + return rc; + +out: + return count; +} + +static ssize_t show_subcores_per_core(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%x\n", subcores_per_core); +} + +static DEVICE_ATTR(subcores_per_core, 0644, + show_subcores_per_core, store_subcores_per_core); + +static int subcore_init(void) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return 0; + + /* + * We need all threads in a core to be present to split/unsplit so + * continue only if max_cpus are aligned to threads_per_core. + */ + if (setup_max_cpus % threads_per_core) + return 0; + + BUG_ON(!alloc_cpumask_var(&cpu_offline_mask, GFP_KERNEL)); + + set_subcores_per_core(1); + + return device_create_file(cpu_subsys.dev_root, + &dev_attr_subcores_per_core); +} +machine_device_initcall(powernv, subcore_init); diff --git a/kernel/arch/powerpc/platforms/powernv/subcore.h b/kernel/arch/powerpc/platforms/powernv/subcore.h new file mode 100644 index 000000000..84e02ae52 --- /dev/null +++ b/kernel/arch/powerpc/platforms/powernv/subcore.h @@ -0,0 +1,25 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* These are ordered and tested with <= */ +#define SYNC_STEP_INITIAL 0 +#define SYNC_STEP_UNSPLIT 1 /* Set by secondary when it sees unsplit */ +#define SYNC_STEP_REAL_MODE 2 /* Set by secondary when in real mode */ +#define SYNC_STEP_FINISHED 3 /* Set by secondary when split/unsplit is done */ + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_SMP +void split_core_secondary_loop(u8 *state); +extern void update_subcore_sibling_mask(void); +#else +static inline void update_subcore_sibling_mask(void) { }; +#endif /* CONFIG_SMP */ + +#endif /* __ASSEMBLY__ */ |