summaryrefslogtreecommitdiffstats
path: root/kernel/arch/x86/kernel/cpu
diff options
context:
space:
mode:
authorJosé Pekkarinen <jose.pekkarinen@nokia.com>2016-04-11 10:41:07 +0300
committerJosé Pekkarinen <jose.pekkarinen@nokia.com>2016-04-13 08:17:18 +0300
commite09b41010ba33a20a87472ee821fa407a5b8da36 (patch)
treed10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/arch/x86/kernel/cpu
parentf93b97fd65072de626c074dbe099a1fff05ce060 (diff)
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and rt patch from the rt wiki download page. During the rebasing, the following patch collided: Force tick interrupt and get rid of softirq magic(I70131fb85). Collisions have been removed because its logic was found on the source already. Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769 Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/arch/x86/kernel/cpu')
-rw-r--r--kernel/arch/x86/kernel/cpu/Makefile3
-rw-r--r--kernel/arch/x86/kernel/cpu/amd.c64
-rw-r--r--kernel/arch/x86/kernel/cpu/bugs.c55
-rw-r--r--kernel/arch/x86/kernel/cpu/common.c118
-rw-r--r--kernel/arch/x86/kernel/cpu/cpu.h1
-rw-r--r--kernel/arch/x86/kernel/cpu/intel.c48
-rw-r--r--kernel/arch/x86/kernel/cpu/intel_cacheinfo.c16
-rw-r--r--kernel/arch/x86/kernel/cpu/intel_pt.h39
-rw-r--r--kernel/arch/x86/kernel/cpu/mcheck/Makefile2
-rw-r--r--kernel/arch/x86/kernel/cpu/mcheck/mce-apei.c1
-rw-r--r--kernel/arch/x86/kernel/cpu/mcheck/mce-genpool.c99
-rw-r--r--kernel/arch/x86/kernel/cpu/mcheck/mce-internal.h14
-rw-r--r--kernel/arch/x86/kernel/cpu/mcheck/mce.c348
-rw-r--r--kernel/arch/x86/kernel/cpu/mcheck/mce_amd.c141
-rw-r--r--kernel/arch/x86/kernel/cpu/mcheck/mce_intel.c62
-rw-r--r--kernel/arch/x86/kernel/cpu/mcheck/p5.c5
-rw-r--r--kernel/arch/x86/kernel/cpu/mcheck/therm_throt.c8
-rw-r--r--kernel/arch/x86/kernel/cpu/mcheck/winchip.c4
-rw-r--r--kernel/arch/x86/kernel/cpu/microcode/Makefile3
-rw-r--r--kernel/arch/x86/kernel/cpu/microcode/amd.c509
-rw-r--r--kernel/arch/x86/kernel/cpu/microcode/amd_early.c422
-rw-r--r--kernel/arch/x86/kernel/cpu/microcode/core.c312
-rw-r--r--kernel/arch/x86/kernel/cpu/microcode/core_early.c148
-rw-r--r--kernel/arch/x86/kernel/cpu/microcode/intel.c866
-rw-r--r--kernel/arch/x86/kernel/cpu/microcode/intel_early.c786
-rw-r--r--kernel/arch/x86/kernel/cpu/microcode/intel_lib.c46
-rw-r--r--kernel/arch/x86/kernel/cpu/mshyperv.c58
-rw-r--r--kernel/arch/x86/kernel/cpu/mtrr/cleanup.c3
-rw-r--r--kernel/arch/x86/kernel/cpu/mtrr/generic.c209
-rw-r--r--kernel/arch/x86/kernel/cpu/mtrr/main.c50
-rw-r--r--kernel/arch/x86/kernel/cpu/mtrr/mtrr.h2
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event.c110
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event.h54
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel.c562
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel_bts.c20
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel_cqm.c118
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel_cstate.c694
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel_ds.c415
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel_lbr.c83
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel_pt.c133
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel_rapl.c31
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.c80
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.h15
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c59
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c198
-rw-r--r--kernel/arch/x86/kernel/cpu/perf_event_msr.c241
-rw-r--r--kernel/arch/x86/kernel/cpu/proc.c3
-rw-r--r--kernel/arch/x86/kernel/cpu/scattered.c2
48 files changed, 4756 insertions, 2504 deletions
diff --git a/kernel/arch/x86/kernel/cpu/Makefile b/kernel/arch/x86/kernel/cpu/Makefile
index 9bff68798..58031303e 100644
--- a/kernel/arch/x86/kernel/cpu/Makefile
+++ b/kernel/arch/x86/kernel/cpu/Makefile
@@ -41,11 +41,14 @@ obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o
obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_snb.o \
perf_event_intel_uncore_snbep.o \
perf_event_intel_uncore_nhmex.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o
+obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o
endif
diff --git a/kernel/arch/x86/kernel/cpu/amd.c b/kernel/arch/x86/kernel/cpu/amd.c
index e4cf63301..a8816b325 100644
--- a/kernel/arch/x86/kernel/cpu/amd.c
+++ b/kernel/arch/x86/kernel/cpu/amd.c
@@ -11,6 +11,7 @@
#include <asm/cpu.h>
#include <asm/smp.h>
#include <asm/pci-direct.h>
+#include <asm/delay.h>
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
@@ -19,6 +20,13 @@
#include "cpu.h"
+/*
+ * nodes_per_socket: Stores the number of nodes per socket.
+ * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
+ * Node Identifiers[10:8]
+ */
+static u32 nodes_per_socket = 1;
+
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -107,7 +115,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
const int K6_BUG_LOOP = 1000000;
int n;
void (*f_vide)(void);
- unsigned long d, d2;
+ u64 d, d2;
printk(KERN_INFO "AMD K6 stepping B detected - ");
@@ -118,10 +126,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
n = K6_BUG_LOOP;
f_vide = vide;
- rdtscl(d);
+ d = rdtsc();
while (n--)
f_vide();
- rdtscl(d2);
+ d2 = rdtsc();
d = d2-d;
if (d > 20*K6_BUG_LOOP)
@@ -288,10 +296,10 @@ static int nearby_node(int apicid)
* Assumption: Number of cores in each internal node is the same.
* (2) AMD processors supporting compute units
*/
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
static void amd_get_topology(struct cpuinfo_x86 *c)
{
- u32 nodes, cores_per_cu = 1;
+ u32 cores_per_cu = 1;
u8 node_id;
int cpu = smp_processor_id();
@@ -300,7 +308,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
u32 eax, ebx, ecx, edx;
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- nodes = ((ecx >> 8) & 7) + 1;
+ nodes_per_socket = ((ecx >> 8) & 7) + 1;
node_id = ecx & 7;
/* get compute unit information */
@@ -311,18 +319,18 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- nodes = ((value >> 3) & 7) + 1;
+ nodes_per_socket = ((value >> 3) & 7) + 1;
node_id = value & 7;
} else
return;
/* fixup multi-node processor information */
- if (nodes > 1) {
+ if (nodes_per_socket > 1) {
u32 cores_per_node;
u32 cus_per_node;
set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- cores_per_node = c->x86_max_cores / nodes;
+ cores_per_node = c->x86_max_cores / nodes_per_socket;
cus_per_node = cores_per_node / cores_per_cu;
/* store NodeID, use llc_shared_map to store sibling info */
@@ -341,9 +349,10 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
*/
static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
unsigned bits;
int cpu = smp_processor_id();
+ unsigned int socket_id, core_complex_id;
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
@@ -353,6 +362,18 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
amd_get_topology(c);
+
+ /*
+ * Fix percpu cpu_llc_id here as LLC topology is different
+ * for Fam17h systems.
+ */
+ if (c->x86 != 0x17 || !cpuid_edx(0x80000006))
+ return;
+
+ socket_id = (c->apicid >> bits) - 1;
+ core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3;
+
+ per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id;
#endif
}
@@ -366,6 +387,12 @@ u16 amd_get_nb_id(int cpu)
}
EXPORT_SYMBOL_GPL(amd_get_nb_id);
+u32 amd_get_nodes_per_socket(void)
+{
+ return nodes_per_socket;
+}
+EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket);
+
static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
@@ -420,7 +447,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
static void early_init_amd_mc(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
unsigned bits, ecx;
/* Multi core CPU? */
@@ -493,6 +520,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
/* A random value per boot for bit slice [12:upper_bit) */
va_align.bits = get_random_int() & va_align.mask;
}
+
+ if (cpu_has(c, X86_FEATURE_MWAITX))
+ use_mwaitx_delay();
}
static void early_init_amd(struct cpuinfo_x86 *c)
@@ -520,8 +550,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_K6_MTRR);
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
- /* check CPU config space for extended APIC ID */
- if (cpu_has_apic && c->x86 >= 0xf) {
+ /*
+ * ApicID can always be treated as an 8-bit value for AMD APIC versions
+ * >= 0x10, but even old K8s came out of reset with version 0x10. So, we
+ * can safely set X86_FEATURE_EXTD_APICID unconditionally for families
+ * after 16h.
+ */
+ if (cpu_has_apic && c->x86 > 0x16) {
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ } else if (cpu_has_apic && c->x86 >= 0xf) {
+ /* check CPU config space for extended APIC ID */
unsigned int val;
val = read_pci_config(0, 24, 0, 0x68);
if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
diff --git a/kernel/arch/x86/kernel/cpu/bugs.c b/kernel/arch/x86/kernel/cpu/bugs.c
index 03445346e..bd17db15a 100644
--- a/kernel/arch/x86/kernel/cpu/bugs.c
+++ b/kernel/arch/x86/kernel/cpu/bugs.c
@@ -12,57 +12,11 @@
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/processor-flags.h>
-#include <asm/i387.h>
+#include <asm/fpu/internal.h>
#include <asm/msr.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
-static double __initdata x = 4195835.0;
-static double __initdata y = 3145727.0;
-
-/*
- * This used to check for exceptions..
- * However, it turns out that to support that,
- * the XMM trap handlers basically had to
- * be buggy. So let's have a correct XMM trap
- * handler, and forget about printing out
- * some status at boot.
- *
- * We should really only care about bugs here
- * anyway. Not features.
- */
-static void __init check_fpu(void)
-{
- s32 fdiv_bug;
-
- kernel_fpu_begin();
-
- /*
- * trap_init() enabled FXSR and company _before_ testing for FP
- * problems here.
- *
- * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
- */
- __asm__("fninit\n\t"
- "fldl %1\n\t"
- "fdivl %2\n\t"
- "fmull %2\n\t"
- "fldl %1\n\t"
- "fsubp %%st,%%st(1)\n\t"
- "fistpl %0\n\t"
- "fwait\n\t"
- "fninit"
- : "=m" (*&fdiv_bug)
- : "m" (*&x), "m" (*&y));
-
- kernel_fpu_end();
-
- if (fdiv_bug) {
- set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
- pr_warn("Hmm, FPU with FDIV bug\n");
- }
-}
-
void __init check_bugs(void)
{
identify_boot_cpu();
@@ -85,10 +39,5 @@ void __init check_bugs(void)
'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
alternative_instructions();
- /*
- * kernel_fpu_begin/end() in check_fpu() relies on the patched
- * alternative instructions.
- */
- if (cpu_has_fpu)
- check_fpu();
+ fpu__init_check_bugs();
}
diff --git a/kernel/arch/x86/kernel/cpu/common.c b/kernel/arch/x86/kernel/cpu/common.c
index 205e0f3df..c2b7522cb 100644
--- a/kernel/arch/x86/kernel/cpu/common.c
+++ b/kernel/arch/x86/kernel/cpu/common.c
@@ -5,6 +5,7 @@
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/string.h>
+#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/init.h>
@@ -12,6 +13,7 @@
#include <linux/kgdb.h>
#include <linux/smp.h>
#include <linux/io.h>
+#include <linux/syscore_ops.h>
#include <asm/stackprotector.h>
#include <asm/perf_event.h>
@@ -31,8 +33,7 @@
#include <asm/setup.h>
#include <asm/apic.h>
#include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
#include <asm/mtrr.h>
#include <linux/numa.h>
#include <asm/asm.h>
@@ -145,32 +146,21 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
-static int __init x86_xsave_setup(char *s)
+static int __init x86_mpx_setup(char *s)
{
+ /* require an exact match without trailing characters */
if (strlen(s))
return 0;
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
- setup_clear_cpu_cap(X86_FEATURE_AVX);
- setup_clear_cpu_cap(X86_FEATURE_AVX2);
- return 1;
-}
-__setup("noxsave", x86_xsave_setup);
-static int __init x86_xsaveopt_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- return 1;
-}
-__setup("noxsaveopt", x86_xsaveopt_setup);
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_MPX))
+ return 1;
-static int __init x86_xsaves_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+ setup_clear_cpu_cap(X86_FEATURE_MPX);
+ pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n");
return 1;
}
-__setup("noxsaves", x86_xsaves_setup);
+__setup("nompx", x86_mpx_setup);
#ifdef CONFIG_X86_32
static int cachesize_override = -1;
@@ -183,14 +173,6 @@ static int __init cachesize_setup(char *str)
}
__setup("cachesize=", cachesize_setup);
-static int __init x86_fxsr_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
- return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
-
static int __init x86_sep_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_SEP);
@@ -291,10 +273,9 @@ __setup("nosmap", setup_disable_smap);
static __always_inline void setup_smap(struct cpuinfo_x86 *c)
{
- unsigned long eflags;
+ unsigned long eflags = native_save_fl();
/* This should have been cleared long ago */
- raw_local_save_flags(eflags);
BUG_ON(eflags & X86_EFLAGS_AC);
if (cpu_has(c, X86_FEATURE_SMAP)) {
@@ -419,7 +400,7 @@ static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
static void get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
- char *p, *q;
+ char *p, *q, *s;
if (c->extended_cpuid_level < 0x80000004)
return;
@@ -430,19 +411,21 @@ static void get_model_name(struct cpuinfo_x86 *c)
cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
c->x86_model_id[48] = 0;
- /*
- * Intel chips right-justify this string for some dumb reason;
- * undo that brain damage:
- */
- p = q = &c->x86_model_id[0];
+ /* Trim whitespace */
+ p = q = s = &c->x86_model_id[0];
+
while (*p == ' ')
p++;
- if (p != q) {
- while (*p)
- *q++ = *p++;
- while (q <= &c->x86_model_id[48])
- *q++ = '\0'; /* Zero-pad the rest */
+
+ while (*p) {
+ /* Note the last non-whitespace index */
+ if (!isspace(*p))
+ s = q;
+
+ *q++ = *p++;
}
+
+ *(s + 1) = '\0';
}
void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
@@ -508,7 +491,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
void detect_ht(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
int index_msb, core_bits;
static bool printed;
@@ -686,6 +669,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
+ c->x86_capability[13] = cpuid_ebx(0x80000008);
}
#ifdef CONFIG_X86_32
else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
@@ -759,7 +743,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
cpu_detect(c);
get_cpu_vendor(c);
get_cpu_cap(c);
- fpu_detect(c);
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
@@ -771,6 +754,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_bsp_init(c);
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+ fpu__init_system(c);
}
void __init early_cpu_init(void)
@@ -844,7 +828,7 @@ static void generic_identify(struct cpuinfo_x86 *c)
if (c->cpuid_level >= 0x00000001) {
c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
#ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
+# ifdef CONFIG_SMP
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
# else
c->apicid = c->initial_apicid;
@@ -1026,7 +1010,7 @@ void enable_sep_cpu(void)
(unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
0);
- wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
out:
put_cpu();
@@ -1122,14 +1106,14 @@ void print_cpu_info(struct cpuinfo_x86 *c)
printk(KERN_CONT "%s ", vendor);
if (c->x86_model_id[0])
- printk(KERN_CONT "%s", strim(c->x86_model_id));
+ printk(KERN_CONT "%s", c->x86_model_id);
else
printk(KERN_CONT "%d86", c->x86);
- printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model);
+ printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask);
+ printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask);
else
printk(KERN_CONT ")\n");
@@ -1155,10 +1139,6 @@ static __init int setup_disablecpuid(char *arg)
}
__setup("clearcpuid=", setup_disablecpuid);
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
-
#ifdef CONFIG_X86_64
struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
@@ -1183,8 +1163,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-
/*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
@@ -1208,10 +1186,10 @@ void syscall_init(void)
* set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
- wrmsrl(MSR_CSTAR, ia32_cstar_target);
+ wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1220,9 +1198,9 @@ void syscall_init(void)
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
- wrmsrl(MSR_CSTAR, ignore_sysret);
+ wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
@@ -1275,7 +1253,6 @@ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
/*
* On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
@@ -1439,7 +1416,7 @@ void cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
- fpu_init();
+ fpu__init_cpu();
if (is_uv_system())
uv_cpu_init();
@@ -1495,7 +1472,7 @@ void cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
- fpu_init();
+ fpu__init_cpu();
}
#endif
@@ -1512,3 +1489,20 @@ inline bool __static_cpu_has_safe(u16 bit)
return boot_cpu_has(bit);
}
EXPORT_SYMBOL_GPL(__static_cpu_has_safe);
+
+static void bsp_resume(void)
+{
+ if (this_cpu->c_bsp_resume)
+ this_cpu->c_bsp_resume(&boot_cpu_data);
+}
+
+static struct syscore_ops cpu_syscore_ops = {
+ .resume = bsp_resume,
+};
+
+static int __init init_cpu_syscore(void)
+{
+ register_syscore_ops(&cpu_syscore_ops);
+ return 0;
+}
+core_initcall(init_cpu_syscore);
diff --git a/kernel/arch/x86/kernel/cpu/cpu.h b/kernel/arch/x86/kernel/cpu/cpu.h
index c37dc37e8..2584265d4 100644
--- a/kernel/arch/x86/kernel/cpu/cpu.h
+++ b/kernel/arch/x86/kernel/cpu/cpu.h
@@ -13,6 +13,7 @@ struct cpu_dev {
void (*c_init)(struct cpuinfo_x86 *);
void (*c_identify)(struct cpuinfo_x86 *);
void (*c_detect_tlb)(struct cpuinfo_x86 *);
+ void (*c_bsp_resume)(struct cpuinfo_x86 *);
int c_x86_vendor;
#ifdef CONFIG_X86_32
/* Optional vendor specific routine to obtain the cache size. */
diff --git a/kernel/arch/x86/kernel/cpu/intel.c b/kernel/arch/x86/kernel/cpu/intel.c
index 50163fa90..209ac1e7d 100644
--- a/kernel/arch/x86/kernel/cpu/intel.c
+++ b/kernel/arch/x86/kernel/cpu/intel.c
@@ -97,6 +97,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
switch (c->x86_model) {
case 0x27: /* Penwell */
case 0x35: /* Cloverview */
+ case 0x4a: /* Merrifield */
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
break;
default:
@@ -371,6 +372,36 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
}
}
+static void init_intel_energy_perf(struct cpuinfo_x86 *c)
+{
+ u64 epb;
+
+ /*
+ * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized.
+ * (x86_energy_perf_policy(8) is available to change it at run-time.)
+ */
+ if (!cpu_has(c, X86_FEATURE_EPB))
+ return;
+
+ rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE)
+ return;
+
+ pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+ pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
+ epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+ wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+}
+
+static void intel_bsp_resume(struct cpuinfo_x86 *c)
+{
+ /*
+ * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume,
+ * so reinitialize it properly like during bootup:
+ */
+ init_intel_energy_perf(c);
+}
+
static void init_intel(struct cpuinfo_x86 *c)
{
unsigned int l2 = 0;
@@ -478,21 +509,7 @@ static void init_intel(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_VMX))
detect_vmx_virtcap(c);
- /*
- * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
- * x86_energy_perf_policy(8) is available to change it at run-time
- */
- if (cpu_has(c, X86_FEATURE_EPB)) {
- u64 epb;
-
- rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
- if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
- pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
- pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
- epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
- wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
- }
- }
+ init_intel_energy_perf(c);
}
#ifdef CONFIG_X86_32
@@ -747,6 +764,7 @@ static const struct cpu_dev intel_cpu_dev = {
.c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
.c_init = init_intel,
+ .c_bsp_resume = intel_bsp_resume,
.c_x86_vendor = X86_VENDOR_INTEL,
};
diff --git a/kernel/arch/x86/kernel/cpu/intel_cacheinfo.c b/kernel/arch/x86/kernel/cpu/intel_cacheinfo.c
index edcb0e28c..e38d338a6 100644
--- a/kernel/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/kernel/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -157,7 +157,7 @@ struct _cpuid4_info_regs {
struct amd_northbridge *nb;
};
-unsigned short num_cache_leaves;
+static unsigned short num_cache_leaves;
/* AMD doesn't have CPUID4. Emulate it here to report the same
information to the user. This makes some assumptions about the machine:
@@ -326,7 +326,7 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb)
*
* @returns: the disabled index if used or negative value if slot free.
*/
-int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
+static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
{
unsigned int reg = 0;
@@ -403,8 +403,8 @@ static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
*
* @return: 0 on success, error status on failure
*/
-int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
- unsigned long index)
+static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu,
+ unsigned slot, unsigned long index)
{
int ret = 0;
@@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
unsigned int cpu = c->cpu_index;
#endif
@@ -773,19 +773,19 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
if (new_l2) {
l2 = new_l2;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
per_cpu(cpu_llc_id, cpu) = l2_id;
#endif
}
if (new_l3) {
l3 = new_l3;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
per_cpu(cpu_llc_id, cpu) = l3_id;
#endif
}
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
/*
* If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
* turns means that the only possibility is SMT (as indicated in
diff --git a/kernel/arch/x86/kernel/cpu/intel_pt.h b/kernel/arch/x86/kernel/cpu/intel_pt.h
index 1c338b0eb..336878a5d 100644
--- a/kernel/arch/x86/kernel/cpu/intel_pt.h
+++ b/kernel/arch/x86/kernel/cpu/intel_pt.h
@@ -25,32 +25,11 @@
*/
#define TOPA_PMI_MARGIN 512
-/*
- * Table of Physical Addresses bits
- */
-enum topa_sz {
- TOPA_4K = 0,
- TOPA_8K,
- TOPA_16K,
- TOPA_32K,
- TOPA_64K,
- TOPA_128K,
- TOPA_256K,
- TOPA_512K,
- TOPA_1MB,
- TOPA_2MB,
- TOPA_4MB,
- TOPA_8MB,
- TOPA_16MB,
- TOPA_32MB,
- TOPA_64MB,
- TOPA_128MB,
- TOPA_SZ_END,
-};
+#define TOPA_SHIFT 12
-static inline unsigned int sizes(enum topa_sz tsz)
+static inline unsigned int sizes(unsigned int tsz)
{
- return 1 << (tsz + 12);
+ return 1 << (tsz + TOPA_SHIFT);
};
struct topa_entry {
@@ -66,20 +45,26 @@ struct topa_entry {
u64 rsvd4 : 16;
};
-#define TOPA_SHIFT 12
-#define PT_CPUID_LEAVES 2
+#define PT_CPUID_LEAVES 2
+#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */
enum pt_capabilities {
PT_CAP_max_subleaf = 0,
PT_CAP_cr3_filtering,
+ PT_CAP_psb_cyc,
+ PT_CAP_mtc,
PT_CAP_topa_output,
PT_CAP_topa_multiple_entries,
+ PT_CAP_single_range_output,
PT_CAP_payloads_lip,
+ PT_CAP_mtc_periods,
+ PT_CAP_cycle_thresholds,
+ PT_CAP_psb_periods,
};
struct pt_pmu {
struct pmu pmu;
- u32 caps[4 * PT_CPUID_LEAVES];
+ u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
};
/**
diff --git a/kernel/arch/x86/kernel/cpu/mcheck/Makefile b/kernel/arch/x86/kernel/cpu/mcheck/Makefile
index bb34b03af..a3311c886 100644
--- a/kernel/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/kernel/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,4 +1,4 @@
-obj-y = mce.o mce-severity.o
+obj-y = mce.o mce-severity.o mce-genpool.o
obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce-apei.c b/kernel/arch/x86/kernel/cpu/mcheck/mce-apei.c
index a1aef9533..34c89a3e8 100644
--- a/kernel/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/kernel/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -57,7 +57,6 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
m.addr = mem_err->physical_addr;
mce_log(&m);
- mce_notify_irq();
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/kernel/arch/x86/kernel/cpu/mcheck/mce-genpool.c
new file mode 100644
index 000000000..0a850100c
--- /dev/null
+++ b/kernel/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -0,0 +1,99 @@
+/*
+ * MCE event pool management in MCE context
+ *
+ * Copyright (C) 2015 Intel Corp.
+ * Author: Chen, Gong <gong.chen@linux.intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/genalloc.h>
+#include <linux/llist.h>
+#include "mce-internal.h"
+
+/*
+ * printk() is not safe in MCE context. This is a lock-less memory allocator
+ * used to save error information organized in a lock-less list.
+ *
+ * This memory pool is only to be used to save MCE records in MCE context.
+ * MCE events are rare, so a fixed size memory pool should be enough. Use
+ * 2 pages to save MCE events for now (~80 MCE records at most).
+ */
+#define MCE_POOLSZ (2 * PAGE_SIZE)
+
+static struct gen_pool *mce_evt_pool;
+static LLIST_HEAD(mce_event_llist);
+static char gen_pool_buf[MCE_POOLSZ];
+
+void mce_gen_pool_process(void)
+{
+ struct llist_node *head;
+ struct mce_evt_llist *node;
+ struct mce *mce;
+
+ head = llist_del_all(&mce_event_llist);
+ if (!head)
+ return;
+
+ head = llist_reverse_order(head);
+ llist_for_each_entry(node, head, llnode) {
+ mce = &node->mce;
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+ gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
+ }
+}
+
+bool mce_gen_pool_empty(void)
+{
+ return llist_empty(&mce_event_llist);
+}
+
+int mce_gen_pool_add(struct mce *mce)
+{
+ struct mce_evt_llist *node;
+
+ if (!mce_evt_pool)
+ return -EINVAL;
+
+ node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
+ if (!node) {
+ pr_warn_ratelimited("MCE records pool full!\n");
+ return -ENOMEM;
+ }
+
+ memcpy(&node->mce, mce, sizeof(*mce));
+ llist_add(&node->llnode, &mce_event_llist);
+
+ return 0;
+}
+
+static int mce_gen_pool_create(void)
+{
+ struct gen_pool *tmpp;
+ int ret = -ENOMEM;
+
+ tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
+ if (!tmpp)
+ goto out;
+
+ ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
+ if (ret) {
+ gen_pool_destroy(tmpp);
+ goto out;
+ }
+
+ mce_evt_pool = tmpp;
+
+out:
+ return ret;
+}
+
+int mce_gen_pool_init(void)
+{
+ /* Just init mce_gen_pool once. */
+ if (mce_evt_pool)
+ return 0;
+
+ return mce_gen_pool_create();
+}
diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce-internal.h b/kernel/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fe32074b8..547720efd 100644
--- a/kernel/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/kernel/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -13,6 +13,8 @@ enum severity_level {
MCE_PANIC_SEVERITY,
};
+extern struct atomic_notifier_head x86_mce_decoder_chain;
+
#define ATTR_LEN 16
#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
@@ -24,6 +26,16 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */
};
+struct mce_evt_llist {
+ struct llist_node llnode;
+ struct mce mce;
+};
+
+void mce_gen_pool_process(void);
+bool mce_gen_pool_empty(void);
+int mce_gen_pool_add(struct mce *mce);
+int mce_gen_pool_init(void);
+
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
@@ -67,3 +79,5 @@ static inline int apei_clear_mce(u64 record_id)
return -EINVAL;
}
#endif
+
+void mce_inject_log(struct mce *m);
diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce.c b/kernel/arch/x86/kernel/cpu/mcheck/mce.c
index 9d46f9a13..a080b4939 100644
--- a/kernel/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/kernel/arch/x86/kernel/cpu/mcheck/mce.c
@@ -54,10 +54,13 @@
static DEFINE_MUTEX(mce_chrdev_read_mutex);
-#define rcu_dereference_check_mce(p) \
- rcu_dereference_index_check((p), \
- rcu_read_lock_sched_held() || \
- lockdep_is_held(&mce_chrdev_read_mutex))
+#define mce_log_get_idx_check(p) \
+({ \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&mce_chrdev_read_mutex), \
+ "suspicious mce_log_get_idx_check() usage"); \
+ smp_load_acquire(&(p)); \
+})
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
@@ -109,22 +112,24 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
*/
mce_banks_t mce_banks_ce_disabled;
-static DEFINE_PER_CPU(struct work_struct, mce_work);
+static struct work_struct mce_work;
+static struct irq_work mce_irq_work;
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+static int mce_usable_address(struct mce *m);
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
*/
-static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
- rdtscll(m->tsc);
+ m->tsc = rdtsc();
/* We hope get_seconds stays lockless */
m->time = get_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -156,12 +161,13 @@ void mce_log(struct mce *mce)
/* Emit the trace record: */
trace_mce_record(mce);
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+ if (!mce_gen_pool_add(mce))
+ irq_work_queue(&mce_irq_work);
mce->finished = 0;
wmb();
for (;;) {
- entry = rcu_dereference_check_mce(mcelog.next);
+ entry = mce_log_get_idx_check(mcelog.next);
for (;;) {
/*
@@ -195,48 +201,23 @@ void mce_log(struct mce *mce)
set_bit(0, &mce_need_notify);
}
-static void drain_mcelog_buffer(void)
+void mce_inject_log(struct mce *m)
{
- unsigned int next, i, prev = 0;
-
- next = ACCESS_ONCE(mcelog.next);
-
- do {
- struct mce *m;
-
- /* drain what was logged during boot */
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
- unsigned retries = 1;
-
- m = &mcelog.entry[i];
-
- while (!m->finished) {
- if (time_after_eq(jiffies, start + 2*retries))
- retries++;
-
- cpu_relax();
-
- if (!m->finished && retries >= 4) {
- pr_err("skipping error being logged currently!\n");
- break;
- }
- }
- smp_rmb();
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
- }
-
- memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
+ mutex_lock(&mce_chrdev_read_mutex);
+ mce_log(m);
+ mutex_unlock(&mce_chrdev_read_mutex);
}
+EXPORT_SYMBOL_GPL(mce_inject_log);
+static struct notifier_block mce_srao_nb;
void mce_register_decode_chain(struct notifier_block *nb)
{
+ /* Ensure SRAO notifier has the highest priority in the decode chain. */
+ if (nb != &mce_srao_nb && nb->priority == INT_MAX)
+ nb->priority -= 1;
+
atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
- drain_mcelog_buffer();
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
@@ -460,61 +441,6 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
}
}
-/*
- * Simple lockless ring to communicate PFNs from the exception handler with the
- * process context work function. This is vastly simplified because there's
- * only a single reader and a single writer.
- */
-#define MCE_RING_SIZE 16 /* we use one entry less */
-
-struct mce_ring {
- unsigned short start;
- unsigned short end;
- unsigned long ring[MCE_RING_SIZE];
-};
-static DEFINE_PER_CPU(struct mce_ring, mce_ring);
-
-/* Runs with CPU affinity in workqueue */
-static int mce_ring_empty(void)
-{
- struct mce_ring *r = this_cpu_ptr(&mce_ring);
-
- return r->start == r->end;
-}
-
-static int mce_ring_get(unsigned long *pfn)
-{
- struct mce_ring *r;
- int ret = 0;
-
- *pfn = 0;
- get_cpu();
- r = this_cpu_ptr(&mce_ring);
- if (r->start == r->end)
- goto out;
- *pfn = r->ring[r->start];
- r->start = (r->start + 1) % MCE_RING_SIZE;
- ret = 1;
-out:
- put_cpu();
- return ret;
-}
-
-/* Always runs in MCE context with preempt off */
-static int mce_ring_add(unsigned long pfn)
-{
- struct mce_ring *r = this_cpu_ptr(&mce_ring);
- unsigned next;
-
- next = (r->end + 1) % MCE_RING_SIZE;
- if (next == r->start)
- return -1;
- r->ring[r->end] = pfn;
- wmb();
- r->end = next;
- return 0;
-}
-
int mce_available(struct cpuinfo_x86 *c)
{
if (mca_cfg.disabled)
@@ -524,12 +450,10 @@ int mce_available(struct cpuinfo_x86 *c)
static void mce_schedule_work(void)
{
- if (!mce_ring_empty())
- schedule_work(this_cpu_ptr(&mce_work));
+ if (!mce_gen_pool_empty() && keventd_up())
+ schedule_work(&mce_work);
}
-static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
-
static void mce_irq_work_cb(struct irq_work *entry)
{
mce_notify_irq();
@@ -550,9 +474,30 @@ static void mce_report_event(struct pt_regs *regs)
return;
}
- irq_work_queue(this_cpu_ptr(&mce_irq_work));
+ irq_work_queue(&mce_irq_work);
}
+static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned long pfn;
+
+ if (!mce)
+ return NOTIFY_DONE;
+
+ if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) {
+ pfn = mce->addr >> PAGE_SHIFT;
+ memory_failure(pfn, MCE_VECTOR, 0);
+ }
+
+ return NOTIFY_OK;
+}
+static struct notifier_block mce_srao_nb = {
+ .notifier_call = srao_decode_notifier,
+ .priority = INT_MAX,
+};
+
/*
* Read ADDR and MISC registers.
*/
@@ -671,8 +616,11 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
if (m.status & MCI_STATUS_ADDRV) {
- mce_ring_add(m.addr >> PAGE_SHIFT);
- mce_schedule_work();
+ m.severity = severity;
+ m.usable_addr = mce_usable_address(&m);
+
+ if (!mce_gen_pool_add(&m))
+ mce_schedule_work();
}
}
@@ -1028,7 +976,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
{
struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
- enum ctx_state prev_state;
int i;
int worst = 0;
int severity;
@@ -1052,8 +999,20 @@ void do_machine_check(struct pt_regs *regs, long error_code)
char *msg = "Unknown";
u64 recover_paddr = ~0ull;
int flags = MF_ACTION_REQUIRED;
+ int lmce = 0;
- prev_state = ist_enter(regs);
+ /* If this CPU is offline, just bail out. */
+ if (cpu_is_offline(smp_processor_id())) {
+ u64 mcgstatus;
+
+ mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ return;
+ }
+ }
+
+ ist_enter(regs);
this_cpu_inc(mce_exception_count);
@@ -1079,11 +1038,20 @@ void do_machine_check(struct pt_regs *regs, long error_code)
kill_it = 1;
/*
- * Go through all the banks in exclusion of the other CPUs.
- * This way we don't report duplicated events on shared banks
- * because the first one to see it will clear it.
+ * Check if this MCE is signaled to only this logical processor
*/
- order = mce_start(&no_way_out);
+ if (m.mcgstatus & MCG_STATUS_LMCES)
+ lmce = 1;
+ else {
+ /*
+ * Go through all the banks in exclusion of the other CPUs.
+ * This way we don't report duplicated events on shared banks
+ * because the first one to see it will clear it.
+ * If this is a Local MCE, then no need to perform rendezvous.
+ */
+ order = mce_start(&no_way_out);
+ }
+
for (i = 0; i < cfg->banks; i++) {
__clear_bit(i, toclear);
if (!test_bit(i, valid_banks))
@@ -1132,15 +1100,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_read_aux(&m, i);
- /*
- * Action optional error. Queue address for later processing.
- * When the ring overflows we just ignore the AO error.
- * RED-PEN add some logging mechanism when
- * usable_address or mce_add_ring fails.
- * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
- */
- if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
- mce_ring_add(m.addr >> PAGE_SHIFT);
+ /* assuming valid severity level != 0 */
+ m.severity = severity;
+ m.usable_addr = mce_usable_address(&m);
mce_log(&m);
@@ -1160,8 +1122,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* Do most of the synchronization with other CPUs.
* When there's any problem use only local no_way_out state.
*/
- if (mce_end(order) < 0)
- no_way_out = worst >= MCE_PANIC_SEVERITY;
+ if (!lmce) {
+ if (mce_end(order) < 0)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+ } else {
+ /*
+ * Local MCE skipped calling mce_reign()
+ * If we found a fatal error, we need to panic here.
+ */
+ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
+ mce_panic("Machine check from unknown source",
+ NULL, NULL);
+ }
/*
* At insane "tolerant" levels we take no action. Otherwise
@@ -1206,7 +1178,7 @@ out:
local_irq_disable();
ist_end_non_atomic();
done:
- ist_exit(regs, prev_state);
+ ist_exit(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1226,14 +1198,11 @@ int memory_failure(unsigned long pfn, int vector, int flags)
/*
* Action optional processing happens here (picking up
* from the list of faulting pages that do_machine_check()
- * placed into the "ring").
+ * placed into the genpool).
*/
static void mce_process_work(struct work_struct *dummy)
{
- unsigned long pfn;
-
- while (mce_ring_get(&pfn))
- memory_failure(pfn, MCE_VECTOR, 0);
+ mce_gen_pool_process();
}
#ifdef CONFIG_X86_MCE_INTEL
@@ -1655,6 +1624,8 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
winchip_mcheck_init(c);
return 1;
break;
+ default:
+ return 0;
}
return 0;
@@ -1667,9 +1638,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_intel_feature_init(c);
mce_adjust_timer = cmci_intel_adjust_timer;
break;
- case X86_VENDOR_AMD:
+
+ case X86_VENDOR_AMD: {
+ u32 ebx = cpuid_ebx(0x80000007);
+
mce_amd_feature_init(c);
- mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
+ mce_flags.overflow_recov = !!(ebx & BIT(0));
+ mce_flags.succor = !!(ebx & BIT(1));
+ mce_flags.smca = !!(ebx & BIT(3));
+
+ break;
+ }
+
+ default:
+ break;
+ }
+}
+
+static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_clear(c);
break;
default:
break;
@@ -1730,13 +1720,36 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
return;
}
+ if (mce_gen_pool_init()) {
+ mca_cfg.disabled = true;
+ pr_emerg("Couldn't allocate MCE records pool!\n");
+ return;
+ }
+
machine_check_vector = do_machine_check;
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_timer();
- INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
- init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
+}
+
+/*
+ * Called for each booted CPU to clear some machine checks opt-ins
+ */
+void mcheck_cpu_clear(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return;
+
+ if (!mce_available(c))
+ return;
+
+ /*
+ * Possibly to clear general settings generic to x86
+ * __mcheck_cpu_clear_generic(c);
+ */
+ __mcheck_cpu_clear_vendor(c);
+
}
/*
@@ -1783,7 +1796,7 @@ static void collect_tscs(void *data)
{
unsigned long *cpu_tsc = (unsigned long *)data;
- rdtscll(cpu_tsc[smp_processor_id()]);
+ cpu_tsc[smp_processor_id()] = rdtsc();
}
static int mce_apei_read_done;
@@ -1849,7 +1862,7 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
goto out;
}
- next = rcu_dereference_check_mce(mcelog.next);
+ next = mce_log_get_idx_check(mcelog.next);
/* Only supports full reads right now */
err = -EINVAL;
@@ -1915,7 +1928,7 @@ out:
static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_chrdev_wait, wait);
- if (rcu_access_index(mcelog.next))
+ if (READ_ONCE(mcelog.next))
return POLLIN | POLLRDNORM;
if (!mce_apei_read_done && apei_check_mce())
return POLLIN | POLLRDNORM;
@@ -1960,8 +1973,8 @@ void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);
-ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
{
if (mce_write)
return mce_write(filp, ubuf, usize, off);
@@ -2007,6 +2020,7 @@ void mce_disable_bank(int bank)
/*
* mce=off Disables machine check
* mce=no_cmci Disables CMCI
+ * mce=no_lmce Disables LMCE
* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
@@ -2030,6 +2044,8 @@ static int __init mcheck_enable(char *str)
cfg->disabled = true;
else if (!strcmp(str, "no_cmci"))
cfg->cmci_disabled = true;
+ else if (!strcmp(str, "no_lmce"))
+ cfg->lmce_disabled = true;
else if (!strcmp(str, "dont_log_ce"))
cfg->dont_log_ce = true;
else if (!strcmp(str, "ignore_ce"))
@@ -2039,11 +2055,8 @@ static int __init mcheck_enable(char *str)
else if (!strcmp(str, "bios_cmci_threshold"))
cfg->bios_cmci_threshold = true;
else if (isdigit(str[0])) {
- get_option(&str, &(cfg->tolerant));
- if (*str == ',') {
- ++str;
+ if (get_option(&str, &cfg->tolerant) == 2)
get_option(&str, &(cfg->monarch_timeout));
- }
} else {
pr_info("mce argument %s ignored. Please use /sys\n", str);
return 0;
@@ -2055,8 +2068,12 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
+ mce_register_decode_chain(&mce_srao_nb);
mcheck_vendor_init_severity();
+ INIT_WORK(&mce_work, mce_process_work);
+ init_irq_work(&mce_irq_work, mce_irq_work_cb);
+
return 0;
}
@@ -2068,7 +2085,7 @@ int __init mcheck_init(void)
* Disable machine checks on suspend and shutdown. We can't really handle
* them later.
*/
-static int mce_disable_error_reporting(void)
+static void mce_disable_error_reporting(void)
{
int i;
@@ -2078,17 +2095,32 @@ static int mce_disable_error_reporting(void)
if (b->init)
wrmsrl(MSR_IA32_MCx_CTL(i), 0);
}
- return 0;
+ return;
+}
+
+static void vendor_disable_error_reporting(void)
+{
+ /*
+ * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
+ * Disabling them for just a single offlined CPU is bad, since it will
+ * inhibit reporting for all shared resources on the socket like the
+ * last level cache (LLC), the integrated memory controller (iMC), etc.
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return;
+
+ mce_disable_error_reporting();
}
static int mce_syscore_suspend(void)
{
- return mce_disable_error_reporting();
+ vendor_disable_error_reporting();
+ return 0;
}
static void mce_syscore_shutdown(void)
{
- mce_disable_error_reporting();
+ vendor_disable_error_reporting();
}
/*
@@ -2368,7 +2400,6 @@ static void mce_device_remove(unsigned int cpu)
static void mce_disable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
- int i;
if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
@@ -2377,12 +2408,8 @@ static void mce_disable_cpu(void *h)
if (!(action & CPU_TASKS_FROZEN))
cmci_clear();
- for (i = 0; i < mca_cfg.banks; i++) {
- struct mce_bank *b = &mce_banks[i];
- if (b->init)
- wrmsrl(MSR_IA32_MCx_CTL(i), 0);
- }
+ vendor_disable_error_reporting();
}
static void mce_reenable_cpu(void *h)
@@ -2594,5 +2621,20 @@ static int __init mcheck_debugfs_init(void)
return 0;
}
-late_initcall(mcheck_debugfs_init);
+#else
+static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif
+
+static int __init mcheck_late_init(void)
+{
+ mcheck_debugfs_init();
+
+ /*
+ * Flush out everything that has been logged during early boot, now that
+ * everything has been initialized (workqueues, decoders, ...).
+ */
+ mce_schedule_work();
+
+ return 0;
+}
+late_initcall(mcheck_late_init);
diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce_amd.c b/kernel/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 55ad9b37c..e99b15077 100644
--- a/kernel/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/kernel/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,19 +1,13 @@
/*
- * (c) 2005-2012 Advanced Micro Devices, Inc.
+ * (c) 2005-2015 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
*
* Written by Jacob Shin - AMD, Inc.
- *
* Maintained by: Borislav Petkov <bp@alien8.de>
*
- * April 2006
- * - added support for AMD Family 0x10 processors
- * May 2012
- * - major scrubbing
- *
- * All MC4_MISCi registers are shared between multi-cores
+ * All MC4_MISCi registers are shared between cores on a node.
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
@@ -32,6 +26,7 @@
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
#define NR_BLOCKS 9
#define THRESHOLD_MAX 0xFFF
@@ -47,6 +42,13 @@
#define MASK_BLKPTR_LO 0xFF000000
#define MCG_XBLK_ADDR 0xC0000400
+/* Deferred error settings */
+#define MSR_CU_DEF_ERR 0xC0000410
+#define MASK_DEF_LVTOFF 0x000000F0
+#define MASK_DEF_INT_TYPE 0x00000006
+#define DEF_LVT_OFF 0x2
+#define DEF_INT_TYPE_APIC 0x2
+
static const char * const th_names[] = {
"load_store",
"insn_fetch",
@@ -60,6 +62,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
static void amd_threshold_interrupt(void);
+static void amd_deferred_error_interrupt(void);
+
+static void default_deferred_error_interrupt(void)
+{
+ pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
+}
+void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
/*
* CPU Initialization
@@ -196,7 +205,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset)
threshold_restart_bank(&tr);
};
-static int setup_APIC_mce(int reserved, int new)
+static int setup_APIC_mce_threshold(int reserved, int new)
{
if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
APIC_EILVT_MSG_FIX, 0))
@@ -205,6 +214,39 @@ static int setup_APIC_mce(int reserved, int new)
return reserved;
}
+static int setup_APIC_deferred_error(int reserved, int new)
+{
+ if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
+ APIC_EILVT_MSG_FIX, 0))
+ return new;
+
+ return reserved;
+}
+
+static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
+{
+ u32 low = 0, high = 0;
+ int def_offset = -1, def_new;
+
+ if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
+ return;
+
+ def_new = (low & MASK_DEF_LVTOFF) >> 4;
+ if (!(low & MASK_DEF_LVTOFF)) {
+ pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
+ def_new = DEF_LVT_OFF;
+ low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
+ }
+
+ def_offset = setup_APIC_deferred_error(def_offset, def_new);
+ if ((def_offset == def_new) &&
+ (deferred_error_int_vector != amd_deferred_error_interrupt))
+ deferred_error_int_vector = amd_deferred_error_interrupt;
+
+ low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
+ wrmsr(MSR_CU_DEF_ERR, low, high);
+}
+
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@@ -252,7 +294,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
b.interrupt_enable = 1;
new = (high & MASK_LVTOFF_HI) >> 20;
- offset = setup_APIC_mce(offset, new);
+ offset = setup_APIC_mce_threshold(offset, new);
if ((offset == new) &&
(mce_threshold_vector != amd_threshold_interrupt))
@@ -262,6 +304,73 @@ init:
mce_threshold_block_init(&b, offset);
}
}
+
+ if (mce_flags.succor)
+ deferred_error_interrupt_enable(c);
+}
+
+static void __log_error(unsigned int bank, bool threshold_err, u64 misc)
+{
+ struct mce m;
+ u64 status;
+
+ rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+ if (!(status & MCI_STATUS_VAL))
+ return;
+
+ mce_setup(&m);
+
+ m.status = status;
+ m.bank = bank;
+
+ if (threshold_err)
+ m.misc = misc;
+
+ if (m.status & MCI_STATUS_ADDRV)
+ rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr);
+
+ mce_log(&m);
+ wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
+}
+
+static inline void __smp_deferred_error_interrupt(void)
+{
+ inc_irq_stat(irq_deferred_error_count);
+ deferred_error_int_vector();
+}
+
+asmlinkage __visible void smp_deferred_error_interrupt(void)
+{
+ entering_irq();
+ __smp_deferred_error_interrupt();
+ exiting_ack_irq();
+}
+
+asmlinkage __visible void smp_trace_deferred_error_interrupt(void)
+{
+ entering_irq();
+ trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
+ __smp_deferred_error_interrupt();
+ trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
+ exiting_ack_irq();
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+ u64 status;
+ unsigned int bank;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+
+ if (!(status & MCI_STATUS_VAL) ||
+ !(status & MCI_STATUS_DEFERRED))
+ continue;
+
+ __log_error(bank, false, 0);
+ break;
+ }
}
/*
@@ -273,12 +382,12 @@ init:
* the interrupt goes off when error_count reaches threshold_limit.
* the handler will simply log mcelog w/ software defined bank number.
*/
+
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
int cpu = smp_processor_id();
unsigned int bank, block;
- struct mce m;
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
@@ -321,15 +430,7 @@ static void amd_threshold_interrupt(void)
return;
log:
- mce_setup(&m);
- rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
- if (!(m.status & MCI_STATUS_VAL))
- return;
- m.misc = ((u64)high << 32) | low;
- m.bank = bank;
- mce_log(&m);
-
- wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
+ __log_error(bank, true, ((u64)high << 32) | low);
}
/*
diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce_intel.c b/kernel/arch/x86/kernel/cpu/mcheck/mce_intel.c
index e166d833c..1e8bb6c94 100644
--- a/kernel/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/kernel/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -91,6 +91,36 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
+static bool lmce_supported(void)
+{
+ u64 tmp;
+
+ if (mca_cfg.lmce_disabled)
+ return false;
+
+ rdmsrl(MSR_IA32_MCG_CAP, tmp);
+
+ /*
+ * LMCE depends on recovery support in the processor. Hence both
+ * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP.
+ */
+ if ((tmp & (MCG_SER_P | MCG_LMCE_P)) !=
+ (MCG_SER_P | MCG_LMCE_P))
+ return false;
+
+ /*
+ * BIOS should indicate support for LMCE by setting bit 20 in
+ * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will
+ * generate a #GP fault.
+ */
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
+ if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
+ (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
+ return true;
+
+ return false;
+}
+
bool mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
@@ -221,7 +251,6 @@ static void intel_threshold_interrupt(void)
return;
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
- mce_notify_irq();
}
/*
@@ -410,8 +439,39 @@ static void intel_init_cmci(void)
cmci_recheck();
}
+static void intel_init_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+
+ if (!(val & MCG_EXT_CTL_LMCE_EN))
+ wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+}
+
+static void intel_clear_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+ val &= ~MCG_EXT_CTL_LMCE_EN;
+ wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
+ intel_init_lmce();
+}
+
+void mce_intel_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
}
diff --git a/kernel/arch/x86/kernel/cpu/mcheck/p5.c b/kernel/arch/x86/kernel/cpu/mcheck/p5.c
index 737b0ad4e..12402e10a 100644
--- a/kernel/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/kernel/arch/x86/kernel/cpu/mcheck/p5.c
@@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code)
{
- enum ctx_state prev_state;
u32 loaddr, hi, lotype;
- prev_state = ist_enter(regs);
+ ist_enter(regs);
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
- ist_exit(regs, prev_state);
+ ist_exit(regs);
}
/* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/kernel/arch/x86/kernel/cpu/mcheck/therm_throt.c b/kernel/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1af51b158..2c5aaf8c2 100644
--- a/kernel/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/kernel/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -503,14 +503,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
- /* Check whether a vector already exists */
- if (h & APIC_VECTOR_MASK) {
- printk(KERN_DEBUG
- "CPU%d: Thermal LVT vector (%#x) already installed\n",
- cpu, (h & APIC_VECTOR_MASK));
- return;
- }
-
/* early Pentium M models use different method for enabling TM2 */
if (cpu_has(c, X86_FEATURE_TM2)) {
if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
diff --git a/kernel/arch/x86/kernel/cpu/mcheck/winchip.c b/kernel/arch/x86/kernel/cpu/mcheck/winchip.c
index 44f138296..01dd87028 100644
--- a/kernel/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/kernel/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,12 +15,12 @@
/* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
- enum ctx_state prev_state = ist_enter(regs);
+ ist_enter(regs);
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
- ist_exit(regs, prev_state);
+ ist_exit(regs);
}
/* Set up machine check reporting on the Winchip C6 series */
diff --git a/kernel/arch/x86/kernel/cpu/microcode/Makefile b/kernel/arch/x86/kernel/cpu/microcode/Makefile
index 285c85427..220b1a508 100644
--- a/kernel/arch/x86/kernel/cpu/microcode/Makefile
+++ b/kernel/arch/x86/kernel/cpu/microcode/Makefile
@@ -2,6 +2,3 @@ microcode-y := core.o
obj-$(CONFIG_MICROCODE) += microcode.o
microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
microcode-$(CONFIG_MICROCODE_AMD) += amd.o
-obj-$(CONFIG_MICROCODE_EARLY) += core_early.o
-obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o
-obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o
diff --git a/kernel/arch/x86/kernel/cpu/microcode/amd.c b/kernel/arch/x86/kernel/cpu/microcode/amd.c
index 12829c3ce..2233f8a76 100644
--- a/kernel/arch/x86/kernel/cpu/microcode/amd.c
+++ b/kernel/arch/x86/kernel/cpu/microcode/amd.c
@@ -1,5 +1,9 @@
/*
* AMD CPU Microcode Update Driver for Linux
+ *
+ * This driver allows to upgrade microcode on F10h AMD
+ * CPUs and later.
+ *
* Copyright (C) 2008-2011 Advanced Micro Devices Inc.
*
* Author: Peter Oruba <peter.oruba@amd.com>
@@ -7,34 +11,31 @@
* Based on work by:
* Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
*
- * Maintainers:
- * Andreas Herrmann <herrmann.der.user@googlemail.com>
- * Borislav Petkov <bp@alien8.de>
+ * early loader:
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
*
- * This driver allows to upgrade microcode on F10h AMD
- * CPUs and later.
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
*
* Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
*/
+#define pr_fmt(fmt) "microcode: " fmt
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
+#include <linux/earlycpio.h>
#include <linux/firmware.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
+#include <linux/initrd.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/pci.h>
+#include <asm/microcode_amd.h>
#include <asm/microcode.h>
#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/cpu.h>
#include <asm/msr.h>
-#include <asm/microcode_amd.h>
-
-MODULE_DESCRIPTION("AMD Microcode Update Driver");
-MODULE_AUTHOR("Peter Oruba");
-MODULE_LICENSE("GPL v2");
static struct equiv_cpu_entry *equiv_cpu_table;
@@ -47,6 +48,432 @@ struct ucode_patch {
static LIST_HEAD(pcache);
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd before jettisoning its contents.
+ */
+static u8 *container;
+static size_t container_size;
+
+static u32 ucode_new_rev;
+u8 amd_ucode_patch[PATCH_MAX_SIZE];
+static u16 this_equiv_id;
+
+static struct cpio_data ucode_cpio;
+
+/*
+ * Microcode patch container file is prepended to the initrd in cpio format.
+ * See Documentation/x86/early-microcode.txt
+ */
+static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
+
+static struct cpio_data __init find_ucode_in_initrd(void)
+{
+ long offset = 0;
+ char *path;
+ void *start;
+ size_t size;
+
+#ifdef CONFIG_X86_32
+ struct boot_params *p;
+
+ /*
+ * On 32-bit, early load occurs before paging is turned on so we need
+ * to use physical addresses.
+ */
+ p = (struct boot_params *)__pa_nodebug(&boot_params);
+ path = (char *)__pa_nodebug(ucode_path);
+ start = (void *)p->hdr.ramdisk_image;
+ size = p->hdr.ramdisk_size;
+#else
+ path = ucode_path;
+ start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
+ size = boot_params.hdr.ramdisk_size;
+#endif
+
+ return find_cpio_data(path, start, size, &offset);
+}
+
+static size_t compute_container_size(u8 *data, u32 total_size)
+{
+ size_t size = 0;
+ u32 *header = (u32 *)data;
+
+ if (header[0] != UCODE_MAGIC ||
+ header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+ header[2] == 0) /* size */
+ return size;
+
+ size = header[2] + CONTAINER_HDR_SZ;
+ total_size -= size;
+ data += size;
+
+ while (total_size) {
+ u16 patch_size;
+
+ header = (u32 *)data;
+
+ if (header[0] != UCODE_UCODE_TYPE)
+ break;
+
+ /*
+ * Sanity-check patch size.
+ */
+ patch_size = header[1];
+ if (patch_size > PATCH_MAX_SIZE)
+ break;
+
+ size += patch_size + SECTION_HDR_SIZE;
+ data += patch_size + SECTION_HDR_SIZE;
+ total_size -= patch_size + SECTION_HDR_SIZE;
+ }
+
+ return size;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ */
+static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
+{
+ struct equiv_cpu_entry *eq;
+ size_t *cont_sz;
+ u32 *header;
+ u8 *data, **cont;
+ u8 (*patch)[PATCH_MAX_SIZE];
+ u16 eq_id = 0;
+ int offset, left;
+ u32 rev, eax, ebx, ecx, edx;
+ u32 *new_rev;
+
+#ifdef CONFIG_X86_32
+ new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+ cont_sz = (size_t *)__pa_nodebug(&container_size);
+ cont = (u8 **)__pa_nodebug(&container);
+ patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
+#else
+ new_rev = &ucode_new_rev;
+ cont_sz = &container_size;
+ cont = &container;
+ patch = &amd_ucode_patch;
+#endif
+
+ data = ucode;
+ left = size;
+ header = (u32 *)data;
+
+ /* find equiv cpu table */
+ if (header[0] != UCODE_MAGIC ||
+ header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+ header[2] == 0) /* size */
+ return;
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ while (left > 0) {
+ eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
+
+ *cont = data;
+
+ /* Advance past the container header */
+ offset = header[2] + CONTAINER_HDR_SZ;
+ data += offset;
+ left -= offset;
+
+ eq_id = find_equiv_id(eq, eax);
+ if (eq_id) {
+ this_equiv_id = eq_id;
+ *cont_sz = compute_container_size(*cont, left + offset);
+
+ /*
+ * truncate how much we need to iterate over in the
+ * ucode update loop below
+ */
+ left = *cont_sz - offset;
+ break;
+ }
+
+ /*
+ * support multiple container files appended together. if this
+ * one does not have a matching equivalent cpu entry, we fast
+ * forward to the next container file.
+ */
+ while (left > 0) {
+ header = (u32 *)data;
+ if (header[0] == UCODE_MAGIC &&
+ header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
+ break;
+
+ offset = header[1] + SECTION_HDR_SIZE;
+ data += offset;
+ left -= offset;
+ }
+
+ /* mark where the next microcode container file starts */
+ offset = data - (u8 *)ucode;
+ ucode = data;
+ }
+
+ if (!eq_id) {
+ *cont = NULL;
+ *cont_sz = 0;
+ return;
+ }
+
+ if (check_current_patch_level(&rev, true))
+ return;
+
+ while (left > 0) {
+ struct microcode_amd *mc;
+
+ header = (u32 *)data;
+ if (header[0] != UCODE_UCODE_TYPE || /* type */
+ header[1] == 0) /* size */
+ break;
+
+ mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
+
+ if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
+
+ if (!__apply_microcode_amd(mc)) {
+ rev = mc->hdr.patch_id;
+ *new_rev = rev;
+
+ if (save_patch)
+ memcpy(patch, mc,
+ min_t(u32, header[1], PATCH_MAX_SIZE));
+ }
+ }
+
+ offset = header[1] + SECTION_HDR_SIZE;
+ data += offset;
+ left -= offset;
+ }
+}
+
+static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
+ unsigned int family)
+{
+#ifdef CONFIG_X86_64
+ char fw_name[36] = "amd-ucode/microcode_amd.bin";
+
+ if (family >= 0x15)
+ snprintf(fw_name, sizeof(fw_name),
+ "amd-ucode/microcode_amd_fam%.2xh.bin", family);
+
+ return get_builtin_firmware(cp, fw_name);
+#else
+ return false;
+#endif
+}
+
+void __init load_ucode_amd_bsp(unsigned int family)
+{
+ struct cpio_data cp;
+ void **data;
+ size_t *size;
+
+#ifdef CONFIG_X86_32
+ data = (void **)__pa_nodebug(&ucode_cpio.data);
+ size = (size_t *)__pa_nodebug(&ucode_cpio.size);
+#else
+ data = &ucode_cpio.data;
+ size = &ucode_cpio.size;
+#endif
+
+ cp = find_ucode_in_initrd();
+ if (!cp.data) {
+ if (!load_builtin_amd_microcode(&cp, family))
+ return;
+ }
+
+ *data = cp.data;
+ *size = cp.size;
+
+ apply_ucode_in_initrd(cp.data, cp.size, true);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * On 32-bit, since AP's early load occurs before paging is turned on, we
+ * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
+ * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
+ * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
+ * which is used upon resume from suspend.
+ */
+void load_ucode_amd_ap(void)
+{
+ struct microcode_amd *mc;
+ size_t *usize;
+ void **ucode;
+
+ mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
+ if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
+ __apply_microcode_amd(mc);
+ return;
+ }
+
+ ucode = (void *)__pa_nodebug(&container);
+ usize = (size_t *)__pa_nodebug(&container_size);
+
+ if (!*ucode || !*usize)
+ return;
+
+ apply_ucode_in_initrd(*ucode, *usize, false);
+}
+
+static void __init collect_cpu_sig_on_bsp(void *arg)
+{
+ unsigned int cpu = smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ uci->cpu_sig.sig = cpuid_eax(0x00000001);
+}
+
+static void __init get_bsp_sig(void)
+{
+ unsigned int bsp = boot_cpu_data.cpu_index;
+ struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
+
+ if (!uci->cpu_sig.sig)
+ smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+}
+#else
+void load_ucode_amd_ap(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct equiv_cpu_entry *eq;
+ struct microcode_amd *mc;
+ u32 rev, eax;
+ u16 eq_id;
+
+ /* Exit if called on the BSP. */
+ if (!cpu)
+ return;
+
+ if (!container)
+ return;
+
+ /*
+ * 64-bit runs with paging enabled, thus early==false.
+ */
+ if (check_current_patch_level(&rev, false))
+ return;
+
+ eax = cpuid_eax(0x00000001);
+ eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
+
+ eq_id = find_equiv_id(eq, eax);
+ if (!eq_id)
+ return;
+
+ if (eq_id == this_equiv_id) {
+ mc = (struct microcode_amd *)amd_ucode_patch;
+
+ if (mc && rev < mc->hdr.patch_id) {
+ if (!__apply_microcode_amd(mc))
+ ucode_new_rev = mc->hdr.patch_id;
+ }
+
+ } else {
+ if (!ucode_cpio.data)
+ return;
+
+ /*
+ * AP has a different equivalence ID than BSP, looks like
+ * mixed-steppings silicon so go through the ucode blob anew.
+ */
+ apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
+ }
+}
+#endif
+
+int __init save_microcode_in_initrd_amd(void)
+{
+ unsigned long cont;
+ int retval = 0;
+ enum ucode_state ret;
+ u8 *cont_va;
+ u32 eax;
+
+ if (!container)
+ return -EINVAL;
+
+#ifdef CONFIG_X86_32
+ get_bsp_sig();
+ cont = (unsigned long)container;
+ cont_va = __va(container);
+#else
+ /*
+ * We need the physical address of the container for both bitness since
+ * boot_params.hdr.ramdisk_image is a physical address.
+ */
+ cont = __pa(container);
+ cont_va = container;
+#endif
+
+ /*
+ * Take into account the fact that the ramdisk might get relocated and
+ * therefore we need to recompute the container's position in virtual
+ * memory space.
+ */
+ if (relocated_ramdisk)
+ container = (u8 *)(__va(relocated_ramdisk) +
+ (cont - boot_params.hdr.ramdisk_image));
+ else
+ container = cont_va;
+
+ if (ucode_new_rev)
+ pr_info("microcode: updated early to new patch_level=0x%08x\n",
+ ucode_new_rev);
+
+ eax = cpuid_eax(0x00000001);
+ eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+
+ ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
+ if (ret != UCODE_OK)
+ retval = -EINVAL;
+
+ /*
+ * This will be freed any msec now, stash patches for the current
+ * family and switch to patch cache for cpu hotplug, etc later.
+ */
+ container = NULL;
+ container_size = 0;
+
+ return retval;
+}
+
+void reload_ucode_amd(void)
+{
+ struct microcode_amd *mc;
+ u32 rev;
+
+ /*
+ * early==false because this is a syscore ->resume path and by
+ * that time paging is long enabled.
+ */
+ if (check_current_patch_level(&rev, false))
+ return;
+
+ mc = (struct microcode_amd *)amd_ucode_patch;
+
+ if (mc && rev < mc->hdr.patch_id) {
+ if (!__apply_microcode_amd(mc)) {
+ ucode_new_rev = mc->hdr.patch_id;
+ pr_info("microcode: reload patch_level=0x%08x\n",
+ ucode_new_rev);
+ }
+ }
+}
static u16 __find_equiv_id(unsigned int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -177,6 +604,53 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size,
return patch_size;
}
+/*
+ * Those patch levels cannot be updated to newer ones and thus should be final.
+ */
+static u32 final_levels[] = {
+ 0x01000098,
+ 0x0100009f,
+ 0x010000af,
+ 0, /* T-101 terminator */
+};
+
+/*
+ * Check the current patch level on this CPU.
+ *
+ * @rev: Use it to return the patch level. It is set to 0 in the case of
+ * error.
+ *
+ * Returns:
+ * - true: if update should stop
+ * - false: otherwise
+ */
+bool check_current_patch_level(u32 *rev, bool early)
+{
+ u32 lvl, dummy, i;
+ bool ret = false;
+ u32 *levels;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
+
+ if (IS_ENABLED(CONFIG_X86_32) && early)
+ levels = (u32 *)__pa_nodebug(&final_levels);
+ else
+ levels = final_levels;
+
+ for (i = 0; levels[i]; i++) {
+ if (lvl == levels[i]) {
+ lvl = 0;
+ ret = true;
+ break;
+ }
+ }
+
+ if (rev)
+ *rev = lvl;
+
+ return ret;
+}
+
int __apply_microcode_amd(struct microcode_amd *mc_amd)
{
u32 rev, dummy;
@@ -197,7 +671,7 @@ int apply_microcode_amd(int cpu)
struct microcode_amd *mc_amd;
struct ucode_cpu_info *uci;
struct ucode_patch *p;
- u32 rev, dummy;
+ u32 rev;
BUG_ON(raw_smp_processor_id() != cpu);
@@ -210,7 +684,8 @@ int apply_microcode_amd(int cpu)
mc_amd = p->data;
uci->mc = p->data;
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ if (check_current_patch_level(&rev, false))
+ return -1;
/* need to apply patch? */
if (rev >= mc_amd->hdr.patch_id) {
@@ -387,7 +862,7 @@ enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t s
if (ret != UCODE_OK)
cleanup();
-#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32)
+#ifdef CONFIG_X86_32
/* save BSP's matching patch for early load */
if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
struct ucode_patch *p = find_patch(cpu);
@@ -475,7 +950,7 @@ static struct microcode_ops microcode_amd_ops = {
struct microcode_ops * __init init_amd_microcode(void)
{
- struct cpuinfo_x86 *c = &cpu_data(0);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
diff --git a/kernel/arch/x86/kernel/cpu/microcode/amd_early.c b/kernel/arch/x86/kernel/cpu/microcode/amd_early.c
deleted file mode 100644
index 737737edb..000000000
--- a/kernel/arch/x86/kernel/cpu/microcode/amd_early.c
+++ /dev/null
@@ -1,422 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Jacob Shin <jacob.shin@amd.com>
- * Fixes: Borislav Petkov <bp@suse.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/earlycpio.h>
-#include <linux/initrd.h>
-
-#include <asm/cpu.h>
-#include <asm/setup.h>
-#include <asm/microcode_amd.h>
-
-/*
- * This points to the current valid container of microcode patches which we will
- * save from the initrd before jettisoning its contents.
- */
-static u8 *container;
-static size_t container_size;
-
-static u32 ucode_new_rev;
-u8 amd_ucode_patch[PATCH_MAX_SIZE];
-static u16 this_equiv_id;
-
-static struct cpio_data ucode_cpio;
-
-/*
- * Microcode patch container file is prepended to the initrd in cpio format.
- * See Documentation/x86/early-microcode.txt
- */
-static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
-
-static struct cpio_data __init find_ucode_in_initrd(void)
-{
- long offset = 0;
- char *path;
- void *start;
- size_t size;
-
-#ifdef CONFIG_X86_32
- struct boot_params *p;
-
- /*
- * On 32-bit, early load occurs before paging is turned on so we need
- * to use physical addresses.
- */
- p = (struct boot_params *)__pa_nodebug(&boot_params);
- path = (char *)__pa_nodebug(ucode_path);
- start = (void *)p->hdr.ramdisk_image;
- size = p->hdr.ramdisk_size;
-#else
- path = ucode_path;
- start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
- size = boot_params.hdr.ramdisk_size;
-#endif
-
- return find_cpio_data(path, start, size, &offset);
-}
-
-static size_t compute_container_size(u8 *data, u32 total_size)
-{
- size_t size = 0;
- u32 *header = (u32 *)data;
-
- if (header[0] != UCODE_MAGIC ||
- header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
- header[2] == 0) /* size */
- return size;
-
- size = header[2] + CONTAINER_HDR_SZ;
- total_size -= size;
- data += size;
-
- while (total_size) {
- u16 patch_size;
-
- header = (u32 *)data;
-
- if (header[0] != UCODE_UCODE_TYPE)
- break;
-
- /*
- * Sanity-check patch size.
- */
- patch_size = header[1];
- if (patch_size > PATCH_MAX_SIZE)
- break;
-
- size += patch_size + SECTION_HDR_SIZE;
- data += patch_size + SECTION_HDR_SIZE;
- total_size -= patch_size + SECTION_HDR_SIZE;
- }
-
- return size;
-}
-
-/*
- * Early load occurs before we can vmalloc(). So we look for the microcode
- * patch container file in initrd, traverse equivalent cpu table, look for a
- * matching microcode patch, and update, all in initrd memory in place.
- * When vmalloc() is available for use later -- on 64-bit during first AP load,
- * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
- * load_microcode_amd() to save equivalent cpu table and microcode patches in
- * kernel heap memory.
- */
-static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
-{
- struct equiv_cpu_entry *eq;
- size_t *cont_sz;
- u32 *header;
- u8 *data, **cont;
- u8 (*patch)[PATCH_MAX_SIZE];
- u16 eq_id = 0;
- int offset, left;
- u32 rev, eax, ebx, ecx, edx;
- u32 *new_rev;
-
-#ifdef CONFIG_X86_32
- new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
- cont_sz = (size_t *)__pa_nodebug(&container_size);
- cont = (u8 **)__pa_nodebug(&container);
- patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
-#else
- new_rev = &ucode_new_rev;
- cont_sz = &container_size;
- cont = &container;
- patch = &amd_ucode_patch;
-#endif
-
- data = ucode;
- left = size;
- header = (u32 *)data;
-
- /* find equiv cpu table */
- if (header[0] != UCODE_MAGIC ||
- header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
- header[2] == 0) /* size */
- return;
-
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- while (left > 0) {
- eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
-
- *cont = data;
-
- /* Advance past the container header */
- offset = header[2] + CONTAINER_HDR_SZ;
- data += offset;
- left -= offset;
-
- eq_id = find_equiv_id(eq, eax);
- if (eq_id) {
- this_equiv_id = eq_id;
- *cont_sz = compute_container_size(*cont, left + offset);
-
- /*
- * truncate how much we need to iterate over in the
- * ucode update loop below
- */
- left = *cont_sz - offset;
- break;
- }
-
- /*
- * support multiple container files appended together. if this
- * one does not have a matching equivalent cpu entry, we fast
- * forward to the next container file.
- */
- while (left > 0) {
- header = (u32 *)data;
- if (header[0] == UCODE_MAGIC &&
- header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
- break;
-
- offset = header[1] + SECTION_HDR_SIZE;
- data += offset;
- left -= offset;
- }
-
- /* mark where the next microcode container file starts */
- offset = data - (u8 *)ucode;
- ucode = data;
- }
-
- if (!eq_id) {
- *cont = NULL;
- *cont_sz = 0;
- return;
- }
-
- /* find ucode and update if needed */
-
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
-
- while (left > 0) {
- struct microcode_amd *mc;
-
- header = (u32 *)data;
- if (header[0] != UCODE_UCODE_TYPE || /* type */
- header[1] == 0) /* size */
- break;
-
- mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
-
- if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
-
- if (!__apply_microcode_amd(mc)) {
- rev = mc->hdr.patch_id;
- *new_rev = rev;
-
- if (save_patch)
- memcpy(patch, mc,
- min_t(u32, header[1], PATCH_MAX_SIZE));
- }
- }
-
- offset = header[1] + SECTION_HDR_SIZE;
- data += offset;
- left -= offset;
- }
-}
-
-void __init load_ucode_amd_bsp(void)
-{
- struct cpio_data cp;
- void **data;
- size_t *size;
-
-#ifdef CONFIG_X86_32
- data = (void **)__pa_nodebug(&ucode_cpio.data);
- size = (size_t *)__pa_nodebug(&ucode_cpio.size);
-#else
- data = &ucode_cpio.data;
- size = &ucode_cpio.size;
-#endif
-
- cp = find_ucode_in_initrd();
- if (!cp.data)
- return;
-
- *data = cp.data;
- *size = cp.size;
-
- apply_ucode_in_initrd(cp.data, cp.size, true);
-}
-
-#ifdef CONFIG_X86_32
-/*
- * On 32-bit, since AP's early load occurs before paging is turned on, we
- * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
- * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
- * which is used upon resume from suspend.
- */
-void load_ucode_amd_ap(void)
-{
- struct microcode_amd *mc;
- size_t *usize;
- void **ucode;
-
- mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
- if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
- __apply_microcode_amd(mc);
- return;
- }
-
- ucode = (void *)__pa_nodebug(&container);
- usize = (size_t *)__pa_nodebug(&container_size);
-
- if (!*ucode || !*usize)
- return;
-
- apply_ucode_in_initrd(*ucode, *usize, false);
-}
-
-static void __init collect_cpu_sig_on_bsp(void *arg)
-{
- unsigned int cpu = smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- uci->cpu_sig.sig = cpuid_eax(0x00000001);
-}
-
-static void __init get_bsp_sig(void)
-{
- unsigned int bsp = boot_cpu_data.cpu_index;
- struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
-
- if (!uci->cpu_sig.sig)
- smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
-}
-#else
-void load_ucode_amd_ap(void)
-{
- unsigned int cpu = smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- struct equiv_cpu_entry *eq;
- struct microcode_amd *mc;
- u32 rev, eax;
- u16 eq_id;
-
- /* Exit if called on the BSP. */
- if (!cpu)
- return;
-
- if (!container)
- return;
-
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
-
- uci->cpu_sig.rev = rev;
- uci->cpu_sig.sig = eax;
-
- eax = cpuid_eax(0x00000001);
- eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
-
- eq_id = find_equiv_id(eq, eax);
- if (!eq_id)
- return;
-
- if (eq_id == this_equiv_id) {
- mc = (struct microcode_amd *)amd_ucode_patch;
-
- if (mc && rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc))
- ucode_new_rev = mc->hdr.patch_id;
- }
-
- } else {
- if (!ucode_cpio.data)
- return;
-
- /*
- * AP has a different equivalence ID than BSP, looks like
- * mixed-steppings silicon so go through the ucode blob anew.
- */
- apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
- }
-}
-#endif
-
-int __init save_microcode_in_initrd_amd(void)
-{
- unsigned long cont;
- int retval = 0;
- enum ucode_state ret;
- u8 *cont_va;
- u32 eax;
-
- if (!container)
- return -EINVAL;
-
-#ifdef CONFIG_X86_32
- get_bsp_sig();
- cont = (unsigned long)container;
- cont_va = __va(container);
-#else
- /*
- * We need the physical address of the container for both bitness since
- * boot_params.hdr.ramdisk_image is a physical address.
- */
- cont = __pa(container);
- cont_va = container;
-#endif
-
- /*
- * Take into account the fact that the ramdisk might get relocated and
- * therefore we need to recompute the container's position in virtual
- * memory space.
- */
- if (relocated_ramdisk)
- container = (u8 *)(__va(relocated_ramdisk) +
- (cont - boot_params.hdr.ramdisk_image));
- else
- container = cont_va;
-
- if (ucode_new_rev)
- pr_info("microcode: updated early to new patch_level=0x%08x\n",
- ucode_new_rev);
-
- eax = cpuid_eax(0x00000001);
- eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
-
- ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
- if (ret != UCODE_OK)
- retval = -EINVAL;
-
- /*
- * This will be freed any msec now, stash patches for the current
- * family and switch to patch cache for cpu hotplug, etc later.
- */
- container = NULL;
- container_size = 0;
-
- return retval;
-}
-
-void reload_ucode_amd(void)
-{
- struct microcode_amd *mc;
- u32 rev, eax;
-
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
-
- mc = (struct microcode_amd *)amd_ucode_patch;
-
- if (mc && rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc)) {
- ucode_new_rev = mc->hdr.patch_id;
- pr_info("microcode: reload patch_level=0x%08x\n",
- ucode_new_rev);
- }
- }
-}
diff --git a/kernel/arch/x86/kernel/cpu/microcode/core.c b/kernel/arch/x86/kernel/cpu/microcode/core.c
index 36a83617e..b3e94ef46 100644
--- a/kernel/arch/x86/kernel/cpu/microcode/core.c
+++ b/kernel/arch/x86/kernel/cpu/microcode/core.c
@@ -1,104 +1,57 @@
/*
- * Intel CPU Microcode Update Driver for Linux
+ * CPU Microcode Update Driver for Linux
*
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- * 2006 Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ * 2013-2015 Borislav Petkov <bp@alien8.de>
*
- * This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
+ * X86 CPU microcode early update for Linux:
*
- * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- * Software Developer's Manual
- * Order Number 253668 or free download from:
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ * (C) 2015 Borislav Petkov <bp@alien8.de>
*
- * http://developer.intel.com/Assets/PDF/manual/253668.pdf
+ * This driver allows to upgrade microcode on x86 processors.
*
- * For more information, go to http://www.urbanmyth.org/microcode
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Initial release.
- * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added read() support + cleanups.
- * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added 'device trimming' support. open(O_WRONLY) zeroes
- * and frees the saved copy of applied microcode.
- * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Made to use devfs (/dev/cpu/microcode) + cleanups.
- * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Added misc device support (now uses both devfs and misc).
- * Added MICROCODE_IOCFREE ioctl to clear memory.
- * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Messages for error cases (non Intel & no suitable microcode).
- * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- * Removed ->release(). Removed exclusive open and status bitmap.
- * Added microcode_rwsem to serialize read()/write()/ioctl().
- * Removed global kernel lock usage.
- * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- * Write 0 to 0x8B msr and then cpuid before reading revision,
- * so that it works even if there were no update done by the
- * BIOS. Otherwise, reading from 0x8B gives junk (which happened
- * to be 0 on my machine which is why it worked even when I
- * disabled update by the BIOS)
- * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>
- * Intel Pentium 4 processor support and bugfixes.
- * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- * Bugfix for HT (Hyper-Threading) enabled processors
- * whereby processor resources are shared by all logical processors
- * in a single CPU package.
- * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>,
- * Serialize updates as required on HT processors due to
- * speculative nature of implementation.
- * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- * Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- * Jun Nakajima <jun.nakajima@intel.com>
- * Support for the microcode updates in the new format.
- * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
- * in kernel memory.
- * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- * Fix sigmatch() macro to handle old CPUs with pf == 0.
- * Thanks to Stuart Swales for pointing out this bug.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define pr_fmt(fmt) "microcode: " fmt
#include <linux/platform_device.h>
+#include <linux/syscore_ops.h>
#include <linux/miscdevice.h>
#include <linux/capability.h>
+#include <linux/firmware.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/cpu.h>
#include <linux/fs.h>
#include <linux/mm.h>
-#include <linux/syscore_ops.h>
-#include <asm/microcode.h>
-#include <asm/processor.h>
+#include <asm/microcode_intel.h>
#include <asm/cpu_device_id.h>
+#include <asm/microcode_amd.h>
#include <asm/perf_event.h>
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/cmdline.h>
-MODULE_DESCRIPTION("Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-#define MICROCODE_VERSION "2.00"
+#define MICROCODE_VERSION "2.01"
static struct microcode_ops *microcode_ops;
-bool dis_ucode_ldr;
-module_param(dis_ucode_ldr, bool, 0);
+static bool dis_ucode_ldr;
+
+static int __init disable_loader(char *str)
+{
+ dis_ucode_ldr = true;
+ return 1;
+}
+__setup("dis_ucode_ldr", disable_loader);
/*
* Synchronization.
@@ -126,6 +79,150 @@ struct cpu_info_ctx {
int err;
};
+static bool __init check_loader_disabled_bsp(void)
+{
+#ifdef CONFIG_X86_32
+ const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
+ const char *opt = "dis_ucode_ldr";
+ const char *option = (const char *)__pa_nodebug(opt);
+ bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
+
+#else /* CONFIG_X86_64 */
+ const char *cmdline = boot_command_line;
+ const char *option = "dis_ucode_ldr";
+ bool *res = &dis_ucode_ldr;
+#endif
+
+ if (cmdline_find_option_bool(cmdline, option))
+ *res = true;
+
+ return *res;
+}
+
+extern struct builtin_fw __start_builtin_fw[];
+extern struct builtin_fw __end_builtin_fw[];
+
+bool get_builtin_firmware(struct cpio_data *cd, const char *name)
+{
+#ifdef CONFIG_FW_LOADER
+ struct builtin_fw *b_fw;
+
+ for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
+ if (!strcmp(name, b_fw->name)) {
+ cd->size = b_fw->size;
+ cd->data = b_fw->data;
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
+void __init load_ucode_bsp(void)
+{
+ int vendor;
+ unsigned int family;
+
+ if (check_loader_disabled_bsp())
+ return;
+
+ if (!have_cpuid_p())
+ return;
+
+ vendor = x86_vendor();
+ family = x86_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (family >= 6)
+ load_ucode_intel_bsp();
+ break;
+ case X86_VENDOR_AMD:
+ if (family >= 0x10)
+ load_ucode_amd_bsp(family);
+ break;
+ default:
+ break;
+ }
+}
+
+static bool check_loader_disabled_ap(void)
+{
+#ifdef CONFIG_X86_32
+ return *((bool *)__pa_nodebug(&dis_ucode_ldr));
+#else
+ return dis_ucode_ldr;
+#endif
+}
+
+void load_ucode_ap(void)
+{
+ int vendor, family;
+
+ if (check_loader_disabled_ap())
+ return;
+
+ if (!have_cpuid_p())
+ return;
+
+ vendor = x86_vendor();
+ family = x86_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (family >= 6)
+ load_ucode_intel_ap();
+ break;
+ case X86_VENDOR_AMD:
+ if (family >= 0x10)
+ load_ucode_amd_ap();
+ break;
+ default:
+ break;
+ }
+}
+
+int __init save_microcode_in_initrd(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (c->x86 >= 6)
+ save_microcode_in_initrd_intel();
+ break;
+ case X86_VENDOR_AMD:
+ if (c->x86 >= 0x10)
+ save_microcode_in_initrd_amd();
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+void reload_early_microcode(void)
+{
+ int vendor, family;
+
+ vendor = x86_vendor();
+ family = x86_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (family >= 6)
+ reload_ucode_intel();
+ break;
+ case X86_VENDOR_AMD:
+ if (family >= 0x10)
+ reload_ucode_amd();
+ break;
+ default:
+ break;
+ }
+}
+
static void collect_cpu_info_local(void *arg)
{
struct cpu_info_ctx *ctx = arg;
@@ -268,9 +365,6 @@ static void __exit microcode_dev_exit(void)
{
misc_deregister(&microcode_dev);
}
-
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
-MODULE_ALIAS("devname:cpu/microcode");
#else
#define microcode_dev_init() 0
#define microcode_dev_exit() do { } while (0)
@@ -435,17 +529,16 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
return err;
}
-static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
+static void mc_device_remove(struct device *dev, struct subsys_interface *sif)
{
int cpu = dev->id;
if (!cpu_online(cpu))
- return 0;
+ return;
pr_debug("CPU%d removed\n", cpu);
microcode_fini_cpu(cpu);
sysfs_remove_group(&dev->kobj, &mc_attr_group);
- return 0;
}
static struct subsys_interface mc_cpu_interface = {
@@ -518,24 +611,10 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
-static struct notifier_block __refdata mc_cpu_notifier = {
+static struct notifier_block mc_cpu_notifier = {
.notifier_call = mc_cpu_callback,
};
-#ifdef MODULE
-/* Autoload on Intel and AMD systems */
-static const struct x86_cpu_id __initconst microcode_id[] = {
-#ifdef CONFIG_MICROCODE_INTEL
- { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
-#endif
-#ifdef CONFIG_MICROCODE_AMD
- { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
-#endif
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, microcode_id);
-#endif
-
static struct attribute *cpu_root_microcode_attrs[] = {
&dev_attr_reload.attr,
NULL
@@ -546,9 +625,9 @@ static struct attribute_group cpu_root_microcode_group = {
.attrs = cpu_root_microcode_attrs,
};
-static int __init microcode_init(void)
+int __init microcode_init(void)
{
- struct cpuinfo_x86 *c = &cpu_data(0);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
if (paravirt_enabled() || dis_ucode_ldr)
@@ -619,35 +698,4 @@ static int __init microcode_init(void)
return error;
}
-module_init(microcode_init);
-
-static void __exit microcode_exit(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- microcode_dev_exit();
-
- unregister_hotcpu_notifier(&mc_cpu_notifier);
- unregister_syscore_ops(&mc_syscore_ops);
-
- sysfs_remove_group(&cpu_subsys.dev_root->kobj,
- &cpu_root_microcode_group);
-
- get_online_cpus();
- mutex_lock(&microcode_mutex);
-
- subsys_interface_unregister(&mc_cpu_interface);
-
- mutex_unlock(&microcode_mutex);
- put_online_cpus();
-
- platform_device_unregister(microcode_pdev);
-
- microcode_ops = NULL;
-
- if (c->x86_vendor == X86_VENDOR_AMD)
- exit_amd_microcode();
-
- pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
-}
-module_exit(microcode_exit);
+late_initcall(microcode_init);
diff --git a/kernel/arch/x86/kernel/cpu/microcode/core_early.c b/kernel/arch/x86/kernel/cpu/microcode/core_early.c
deleted file mode 100644
index a413a69cb..000000000
--- a/kernel/arch/x86/kernel/cpu/microcode/core_early.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * X86 CPU microcode early update for Linux
- *
- * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- * H Peter Anvin" <hpa@zytor.com>
- *
- * This driver allows to early upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
- * Software Developer's Manual.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/module.h>
-#include <asm/microcode.h>
-#include <asm/microcode_intel.h>
-#include <asm/microcode_amd.h>
-#include <asm/processor.h>
-#include <asm/cmdline.h>
-
-static bool __init check_loader_disabled_bsp(void)
-{
-#ifdef CONFIG_X86_32
- const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
- const char *opt = "dis_ucode_ldr";
- const char *option = (const char *)__pa_nodebug(opt);
- bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
-
-#else /* CONFIG_X86_64 */
- const char *cmdline = boot_command_line;
- const char *option = "dis_ucode_ldr";
- bool *res = &dis_ucode_ldr;
-#endif
-
- if (cmdline_find_option_bool(cmdline, option))
- *res = true;
-
- return *res;
-}
-
-void __init load_ucode_bsp(void)
-{
- int vendor, family;
-
- if (check_loader_disabled_bsp())
- return;
-
- if (!have_cpuid_p())
- return;
-
- vendor = x86_vendor();
- family = x86_family();
-
- switch (vendor) {
- case X86_VENDOR_INTEL:
- if (family >= 6)
- load_ucode_intel_bsp();
- break;
- case X86_VENDOR_AMD:
- if (family >= 0x10)
- load_ucode_amd_bsp();
- break;
- default:
- break;
- }
-}
-
-static bool check_loader_disabled_ap(void)
-{
-#ifdef CONFIG_X86_32
- return *((bool *)__pa_nodebug(&dis_ucode_ldr));
-#else
- return dis_ucode_ldr;
-#endif
-}
-
-void load_ucode_ap(void)
-{
- int vendor, family;
-
- if (check_loader_disabled_ap())
- return;
-
- if (!have_cpuid_p())
- return;
-
- vendor = x86_vendor();
- family = x86_family();
-
- switch (vendor) {
- case X86_VENDOR_INTEL:
- if (family >= 6)
- load_ucode_intel_ap();
- break;
- case X86_VENDOR_AMD:
- if (family >= 0x10)
- load_ucode_amd_ap();
- break;
- default:
- break;
- }
-}
-
-int __init save_microcode_in_initrd(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- if (c->x86 >= 6)
- save_microcode_in_initrd_intel();
- break;
- case X86_VENDOR_AMD:
- if (c->x86 >= 0x10)
- save_microcode_in_initrd_amd();
- break;
- default:
- break;
- }
-
- return 0;
-}
-
-void reload_early_microcode(void)
-{
- int vendor, family;
-
- vendor = x86_vendor();
- family = x86_family();
-
- switch (vendor) {
- case X86_VENDOR_INTEL:
- if (family >= 6)
- reload_ucode_intel();
- break;
- case X86_VENDOR_AMD:
- if (family >= 0x10)
- reload_ucode_amd();
- break;
- default:
- break;
- }
-}
diff --git a/kernel/arch/x86/kernel/cpu/microcode/intel.c b/kernel/arch/x86/kernel/cpu/microcode/intel.c
index a41beadb3..ce47402eb 100644
--- a/kernel/arch/x86/kernel/cpu/microcode/intel.c
+++ b/kernel/arch/x86/kernel/cpu/microcode/intel.c
@@ -1,91 +1,807 @@
/*
- * Intel CPU Microcode Update Driver for Linux
+ * Intel CPU Microcode Update Driver for Linux
*
- * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- * 2006 Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
*
- * This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
+ * Intel CPU microcode early update for Linux
*
- * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- * Software Developer's Manual
- * Order Number 253668 or free download from:
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
*
- * http://developer.intel.com/Assets/PDF/manual/253668.pdf
- *
- * For more information, go to http://www.urbanmyth.org/microcode
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Initial release.
- * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added read() support + cleanups.
- * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Added 'device trimming' support. open(O_WRONLY) zeroes
- * and frees the saved copy of applied microcode.
- * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- * Made to use devfs (/dev/cpu/microcode) + cleanups.
- * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Added misc device support (now uses both devfs and misc).
- * Added MICROCODE_IOCFREE ioctl to clear memory.
- * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
- * Messages for error cases (non Intel & no suitable microcode).
- * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- * Removed ->release(). Removed exclusive open and status bitmap.
- * Added microcode_rwsem to serialize read()/write()/ioctl().
- * Removed global kernel lock usage.
- * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- * Write 0 to 0x8B msr and then cpuid before reading revision,
- * so that it works even if there were no update done by the
- * BIOS. Otherwise, reading from 0x8B gives junk (which happened
- * to be 0 on my machine which is why it worked even when I
- * disabled update by the BIOS)
- * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>
- * Intel Pentium 4 processor support and bugfixes.
- * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- * Bugfix for HT (Hyper-Threading) enabled processors
- * whereby processor resources are shared by all logical processors
- * in a single CPU package.
- * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- * Tigran Aivazian <tigran@veritas.com>,
- * Serialize updates as required on HT processors due to
- * speculative nature of implementation.
- * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- * Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- * Jun Nakajima <jun.nakajima@intel.com>
- * Support for the microcode updates in the new format.
- * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
- * in kernel memory.
- * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- * Fix sigmatch() macro to handle old CPUs with pf == 0.
- * Thanks to Stuart Swales for pointing out this bug.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+/*
+ * This needs to be before all headers so that pr_debug in printk.h doesn't turn
+ * printk calls into no_printk().
+ *
+ *#define DEBUG
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+#include <linux/earlycpio.h>
#include <linux/firmware.h>
#include <linux/uaccess.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/vmalloc.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
#include <asm/microcode_intel.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
#include <asm/msr.h>
-MODULE_DESCRIPTION("Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
+static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+static struct mc_saved_data {
+ unsigned int mc_saved_count;
+ struct microcode_intel **mc_saved;
+} mc_saved_data;
+
+static enum ucode_state
+load_microcode_early(struct microcode_intel **saved,
+ unsigned int num_saved, struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *ucode_ptr, *new_mc = NULL;
+ struct microcode_header_intel *mc_hdr;
+ int new_rev, ret, i;
+
+ new_rev = uci->cpu_sig.rev;
+
+ for (i = 0; i < num_saved; i++) {
+ ucode_ptr = saved[i];
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+
+ ret = has_newer_microcode(ucode_ptr,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf,
+ new_rev);
+ if (!ret)
+ continue;
+
+ new_rev = mc_hdr->rev;
+ new_mc = ucode_ptr;
+ }
+
+ if (!new_mc)
+ return UCODE_NFOUND;
+
+ uci->mc = (struct microcode_intel *)new_mc;
+ return UCODE_OK;
+}
+
+static inline void
+copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
+ unsigned long off, int num_saved)
+{
+ int i;
+
+ for (i = 0; i < num_saved; i++)
+ mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
+}
+
+#ifdef CONFIG_X86_32
+static void
+microcode_phys(struct microcode_intel **mc_saved_tmp,
+ struct mc_saved_data *mc_saved_data)
+{
+ int i;
+ struct microcode_intel ***mc_saved;
+
+ mc_saved = (struct microcode_intel ***)
+ __pa_nodebug(&mc_saved_data->mc_saved);
+ for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
+ struct microcode_intel *p;
+
+ p = *(struct microcode_intel **)
+ __pa_nodebug(mc_saved_data->mc_saved + i);
+ mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
+ }
+}
+#endif
+
+static enum ucode_state
+load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long initrd_start, struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int count = mc_saved_data->mc_saved_count;
+
+ if (!mc_saved_data->mc_saved) {
+ copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
+
+ return load_microcode_early(mc_saved_tmp, count, uci);
+ } else {
+#ifdef CONFIG_X86_32
+ microcode_phys(mc_saved_tmp, mc_saved_data);
+ return load_microcode_early(mc_saved_tmp, count, uci);
+#else
+ return load_microcode_early(mc_saved_data->mc_saved,
+ count, uci);
+#endif
+ }
+}
+
+/*
+ * Given CPU signature and a microcode patch, this function finds if the
+ * microcode patch has matching family and model with the CPU.
+ */
+static enum ucode_state
+matching_model_microcode(struct microcode_header_intel *mc_header,
+ unsigned long sig)
+{
+ unsigned int fam, model;
+ unsigned int fam_ucode, model_ucode;
+ struct extended_sigtable *ext_header;
+ unsigned long total_size = get_totalsize(mc_header);
+ unsigned long data_size = get_datasize(mc_header);
+ int ext_sigcount, i;
+ struct extended_signature *ext_sig;
+
+ fam = __x86_family(sig);
+ model = x86_model(sig);
+
+ fam_ucode = __x86_family(mc_header->sig);
+ model_ucode = x86_model(mc_header->sig);
+
+ if (fam == fam_ucode && model == model_ucode)
+ return UCODE_OK;
+
+ /* Look for ext. headers: */
+ if (total_size <= data_size + MC_HEADER_SIZE)
+ return UCODE_NFOUND;
+
+ ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+
+ for (i = 0; i < ext_sigcount; i++) {
+ fam_ucode = __x86_family(ext_sig->sig);
+ model_ucode = x86_model(ext_sig->sig);
+
+ if (fam == fam_ucode && model == model_ucode)
+ return UCODE_OK;
+
+ ext_sig++;
+ }
+ return UCODE_NFOUND;
+}
+
+static int
+save_microcode(struct mc_saved_data *mc_saved_data,
+ struct microcode_intel **mc_saved_src,
+ unsigned int mc_saved_count)
+{
+ int i, j;
+ struct microcode_intel **saved_ptr;
+ int ret;
+
+ if (!mc_saved_count)
+ return -EINVAL;
+
+ /*
+ * Copy new microcode data.
+ */
+ saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
+ if (!saved_ptr)
+ return -ENOMEM;
+
+ for (i = 0; i < mc_saved_count; i++) {
+ struct microcode_header_intel *mc_hdr;
+ struct microcode_intel *mc;
+ unsigned long size;
+
+ if (!mc_saved_src[i]) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ mc = mc_saved_src[i];
+ mc_hdr = &mc->hdr;
+ size = get_totalsize(mc_hdr);
+
+ saved_ptr[i] = kmalloc(size, GFP_KERNEL);
+ if (!saved_ptr[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ memcpy(saved_ptr[i], mc, size);
+ }
+
+ /*
+ * Point to newly saved microcode.
+ */
+ mc_saved_data->mc_saved = saved_ptr;
+ mc_saved_data->mc_saved_count = mc_saved_count;
+
+ return 0;
+
+err:
+ for (j = 0; j <= i; j++)
+ kfree(saved_ptr[j]);
+ kfree(saved_ptr);
+
+ return ret;
+}
+
+/*
+ * A microcode patch in ucode_ptr is saved into mc_saved
+ * - if it has matching signature and newer revision compared to an existing
+ * patch mc_saved.
+ * - or if it is a newly discovered microcode patch.
+ *
+ * The microcode patch should have matching model with CPU.
+ *
+ * Returns: The updated number @num_saved of saved microcode patches.
+ */
+static unsigned int _save_mc(struct microcode_intel **mc_saved,
+ u8 *ucode_ptr, unsigned int num_saved)
+{
+ struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+ unsigned int sig, pf;
+ int found = 0, i;
+
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+
+ for (i = 0; i < num_saved; i++) {
+ mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
+ sig = mc_saved_hdr->sig;
+ pf = mc_saved_hdr->pf;
+
+ if (!find_matching_signature(ucode_ptr, sig, pf))
+ continue;
+
+ found = 1;
+
+ if (mc_hdr->rev <= mc_saved_hdr->rev)
+ continue;
+
+ /*
+ * Found an older ucode saved earlier. Replace it with
+ * this newer one.
+ */
+ mc_saved[i] = (struct microcode_intel *)ucode_ptr;
+ break;
+ }
+
+ /* Newly detected microcode, save it to memory. */
+ if (i >= num_saved && !found)
+ mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
+
+ return num_saved;
+}
+
+/*
+ * Get microcode matching with BSP's model. Only CPUs with the same model as
+ * BSP can stay in the platform.
+ */
+static enum ucode_state __init
+get_matching_model_microcode(int cpu, unsigned long start,
+ void *data, size_t size,
+ struct mc_saved_data *mc_saved_data,
+ unsigned long *mc_saved_in_initrd,
+ struct ucode_cpu_info *uci)
+{
+ u8 *ucode_ptr = data;
+ unsigned int leftover = size;
+ enum ucode_state state = UCODE_OK;
+ unsigned int mc_size;
+ struct microcode_header_intel *mc_header;
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
+ int i;
+
+ while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) {
+
+ if (leftover < sizeof(mc_header))
+ break;
+
+ mc_header = (struct microcode_header_intel *)ucode_ptr;
+
+ mc_size = get_totalsize(mc_header);
+ if (!mc_size || mc_size > leftover ||
+ microcode_sanity_check(ucode_ptr, 0) < 0)
+ break;
+
+ leftover -= mc_size;
+
+ /*
+ * Since APs with same family and model as the BSP may boot in
+ * the platform, we need to find and save microcode patches
+ * with the same family and model as the BSP.
+ */
+ if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
+ UCODE_OK) {
+ ucode_ptr += mc_size;
+ continue;
+ }
+
+ mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
+
+ ucode_ptr += mc_size;
+ }
+
+ if (leftover) {
+ state = UCODE_ERROR;
+ goto out;
+ }
+
+ if (mc_saved_count == 0) {
+ state = UCODE_NFOUND;
+ goto out;
+ }
+
+ for (i = 0; i < mc_saved_count; i++)
+ mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
+
+ mc_saved_data->mc_saved_count = mc_saved_count;
+out:
+ return state;
+}
+
+static int collect_cpu_info_early(struct ucode_cpu_info *uci)
+{
+ unsigned int val[2];
+ unsigned int family, model;
+ struct cpu_signature csig;
+ unsigned int eax, ebx, ecx, edx;
+
+ csig.sig = 0;
+ csig.pf = 0;
+ csig.rev = 0;
+
+ memset(uci, 0, sizeof(*uci));
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ csig.sig = eax;
+
+ family = __x86_family(csig.sig);
+ model = x86_model(csig.sig);
+
+ if ((model >= 5) || (family > 6)) {
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig.pf = 1 << ((val[1] >> 18) & 7);
+ }
+ native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+ /* As documented in the SDM: Do a CPUID 1 here */
+ sync_core();
+
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+ csig.rev = val[1];
+
+ uci->cpu_sig = csig;
+ uci->valid = 1;
+
+ return 0;
+}
+
+static void show_saved_mc(void)
+{
+#ifdef DEBUG
+ int i, j;
+ unsigned int sig, pf, rev, total_size, data_size, date;
+ struct ucode_cpu_info uci;
+
+ if (mc_saved_data.mc_saved_count == 0) {
+ pr_debug("no microcode data saved.\n");
+ return;
+ }
+ pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
+
+ collect_cpu_info_early(&uci);
+
+ sig = uci.cpu_sig.sig;
+ pf = uci.cpu_sig.pf;
+ rev = uci.cpu_sig.rev;
+ pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
+
+ for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
+ struct microcode_header_intel *mc_saved_header;
+ struct extended_sigtable *ext_header;
+ int ext_sigcount;
+ struct extended_signature *ext_sig;
+
+ mc_saved_header = (struct microcode_header_intel *)
+ mc_saved_data.mc_saved[i];
+ sig = mc_saved_header->sig;
+ pf = mc_saved_header->pf;
+ rev = mc_saved_header->rev;
+ total_size = get_totalsize(mc_saved_header);
+ data_size = get_datasize(mc_saved_header);
+ date = mc_saved_header->date;
+
+ pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n",
+ i, sig, pf, rev, total_size,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+
+ /* Look for ext. headers: */
+ if (total_size <= data_size + MC_HEADER_SIZE)
+ continue;
+
+ ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+ for (j = 0; j < ext_sigcount; j++) {
+ sig = ext_sig->sig;
+ pf = ext_sig->pf;
+
+ pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
+ j, sig, pf);
+
+ ext_sig++;
+ }
+
+ }
+#endif
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static DEFINE_MUTEX(x86_cpu_microcode_mutex);
+/*
+ * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
+ * hot added or resumes.
+ *
+ * Please make sure this mc should be a valid microcode patch before calling
+ * this function.
+ */
+int save_mc_for_early(u8 *mc)
+{
+ struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
+ unsigned int mc_saved_count_init;
+ unsigned int mc_saved_count;
+ struct microcode_intel **mc_saved;
+ int ret = 0;
+ int i;
+
+ /*
+ * Hold hotplug lock so mc_saved_data is not accessed by a CPU in
+ * hotplug.
+ */
+ mutex_lock(&x86_cpu_microcode_mutex);
+
+ mc_saved_count_init = mc_saved_data.mc_saved_count;
+ mc_saved_count = mc_saved_data.mc_saved_count;
+ mc_saved = mc_saved_data.mc_saved;
+
+ if (mc_saved && mc_saved_count)
+ memcpy(mc_saved_tmp, mc_saved,
+ mc_saved_count * sizeof(struct microcode_intel *));
+ /*
+ * Save the microcode patch mc in mc_save_tmp structure if it's a newer
+ * version.
+ */
+ mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
+
+ /*
+ * Save the mc_save_tmp in global mc_saved_data.
+ */
+ ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
+ if (ret) {
+ pr_err("Cannot save microcode patch.\n");
+ goto out;
+ }
+
+ show_saved_mc();
+
+ /*
+ * Free old saved microcode data.
+ */
+ if (mc_saved) {
+ for (i = 0; i < mc_saved_count_init; i++)
+ kfree(mc_saved[i]);
+ kfree(mc_saved);
+ }
+
+out:
+ mutex_unlock(&x86_cpu_microcode_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(save_mc_for_early);
+#endif
+
+static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
+{
+#ifdef CONFIG_X86_64
+ unsigned int eax = 0x00000001, ebx, ecx = 0, edx;
+ unsigned int family, model, stepping;
+ char name[30];
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ family = __x86_family(eax);
+ model = x86_model(eax);
+ stepping = eax & 0xf;
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping);
+
+ return get_builtin_firmware(cp, name);
+#else
+ return false;
+#endif
+}
+
+static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
+static __init enum ucode_state
+scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long start, unsigned long size,
+ struct ucode_cpu_info *uci)
+{
+ struct cpio_data cd;
+ long offset = 0;
+#ifdef CONFIG_X86_32
+ char *p = (char *)__pa_nodebug(ucode_name);
+#else
+ char *p = ucode_name;
+#endif
+
+ cd.data = NULL;
+ cd.size = 0;
+
+ cd = find_cpio_data(p, (void *)start, size, &offset);
+ if (!cd.data) {
+ if (!load_builtin_intel_microcode(&cd))
+ return UCODE_ERROR;
+ }
+
+ return get_matching_model_microcode(0, start, cd.data, cd.size,
+ mc_saved_data, initrd, uci);
+}
+
+/*
+ * Print ucode update info.
+ */
+static void
+print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
+{
+ int cpu = smp_processor_id();
+
+ pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
+ cpu,
+ uci->cpu_sig.rev,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+}
+
+#ifdef CONFIG_X86_32
+
+static int delay_ucode_info;
+static int current_mc_date;
+
+/*
+ * Print early updated ucode info after printk works. This is delayed info dump.
+ */
+void show_ucode_info_early(void)
+{
+ struct ucode_cpu_info uci;
+
+ if (delay_ucode_info) {
+ collect_cpu_info_early(&uci);
+ print_ucode_info(&uci, current_mc_date);
+ delay_ucode_info = 0;
+ }
+}
+
+/*
+ * At this point, we can not call printk() yet. Keep microcode patch number in
+ * mc_saved_data.mc_saved and delay printing microcode info in
+ * show_ucode_info_early() until printk() works.
+ */
+static void print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_intel;
+ int *delay_ucode_info_p;
+ int *current_mc_date_p;
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return;
+
+ delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
+ current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
+
+ *delay_ucode_info_p = 1;
+ *current_mc_date_p = mc_intel->hdr.date;
+}
+#else
+
+/*
+ * Flush global tlb. We only do this in x86_64 where paging has been enabled
+ * already and PGE should be enabled as well.
+ */
+static inline void flush_tlb_early(void)
+{
+ __native_flush_tlb_global_irq_disabled();
+}
+
+static inline void print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc_intel;
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return;
+
+ print_ucode_info(uci, mc_intel->hdr.date);
+}
+#endif
+
+static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
+{
+ struct microcode_intel *mc_intel;
+ unsigned int val[2];
+
+ mc_intel = uci->mc;
+ if (mc_intel == NULL)
+ return 0;
+
+ /* write microcode via MSR 0x79 */
+ native_wrmsr(MSR_IA32_UCODE_WRITE,
+ (unsigned long) mc_intel->bits,
+ (unsigned long) mc_intel->bits >> 16 >> 16);
+ native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+ /* As documented in the SDM: Do a CPUID 1 here */
+ sync_core();
+
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+ if (val[1] != mc_intel->hdr.rev)
+ return -1;
+
+#ifdef CONFIG_X86_64
+ /* Flush global tlb. This is precaution. */
+ flush_tlb_early();
+#endif
+ uci->cpu_sig.rev = val[1];
+
+ if (early)
+ print_ucode(uci);
+ else
+ print_ucode_info(uci, mc_intel->hdr.date);
+
+ return 0;
+}
+
+/*
+ * This function converts microcode patch offsets previously stored in
+ * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
+ */
+int __init save_microcode_in_initrd_intel(void)
+{
+ unsigned int count = mc_saved_data.mc_saved_count;
+ struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
+ int ret = 0;
+
+ if (count == 0)
+ return ret;
+
+ copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
+ ret = save_microcode(&mc_saved_data, mc_saved, count);
+ if (ret)
+ pr_err("Cannot save microcode patches from initrd.\n");
+
+ show_saved_mc();
+
+ return ret;
+}
+
+static void __init
+_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
+ unsigned long *initrd,
+ unsigned long start, unsigned long size)
+{
+ struct ucode_cpu_info uci;
+ enum ucode_state ret;
+
+ collect_cpu_info_early(&uci);
+
+ ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
+ if (ret != UCODE_OK)
+ return;
+
+ ret = load_microcode(mc_saved_data, initrd, start, &uci);
+ if (ret != UCODE_OK)
+ return;
+
+ apply_microcode_early(&uci, true);
+}
+
+void __init load_ucode_intel_bsp(void)
+{
+ u64 start, size;
+#ifdef CONFIG_X86_32
+ struct boot_params *p;
+
+ p = (struct boot_params *)__pa_nodebug(&boot_params);
+ start = p->hdr.ramdisk_image;
+ size = p->hdr.ramdisk_size;
+
+ _load_ucode_intel_bsp(
+ (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+ (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+ start, size);
+#else
+ start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
+ size = boot_params.hdr.ramdisk_size;
+
+ _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
+#endif
+}
+
+void load_ucode_intel_ap(void)
+{
+ struct mc_saved_data *mc_saved_data_p;
+ struct ucode_cpu_info uci;
+ unsigned long *mc_saved_in_initrd_p;
+ unsigned long initrd_start_addr;
+ enum ucode_state ret;
+#ifdef CONFIG_X86_32
+ unsigned long *initrd_start_p;
+
+ mc_saved_in_initrd_p =
+ (unsigned long *)__pa_nodebug(mc_saved_in_initrd);
+ mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
+ initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
+ initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
+#else
+ mc_saved_data_p = &mc_saved_data;
+ mc_saved_in_initrd_p = mc_saved_in_initrd;
+ initrd_start_addr = initrd_start;
+#endif
+
+ /*
+ * If there is no valid ucode previously saved in memory, no need to
+ * update ucode on this AP.
+ */
+ if (mc_saved_data_p->mc_saved_count == 0)
+ return;
+
+ collect_cpu_info_early(&uci);
+ ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+ initrd_start_addr, &uci);
+
+ if (ret != UCODE_OK)
+ return;
+
+ apply_microcode_early(&uci, true);
+}
+
+void reload_ucode_intel(void)
+{
+ struct ucode_cpu_info uci;
+ enum ucode_state ret;
+
+ if (!mc_saved_data.mc_saved_count)
+ return;
+
+ collect_cpu_info_early(&uci);
+
+ ret = load_microcode_early(mc_saved_data.mc_saved,
+ mc_saved_data.mc_saved_count, &uci);
+ if (ret != UCODE_OK)
+ return;
+
+ apply_microcode_early(&uci, false);
+}
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
@@ -124,7 +840,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
cpf = cpu_sig.pf;
crev = cpu_sig.rev;
- return get_matching_microcode(csig, cpf, crev, mc_intel);
+ return has_newer_microcode(mc_intel, csig, cpf, crev);
}
static int apply_microcode_intel(int cpu)
@@ -226,7 +942,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
csig = uci->cpu_sig.sig;
cpf = uci->cpu_sig.pf;
- if (get_matching_microcode(csig, cpf, new_rev, mc)) {
+ if (has_newer_microcode(mc, csig, cpf, new_rev)) {
vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
@@ -325,7 +1041,7 @@ static struct microcode_ops microcode_intel_ops = {
struct microcode_ops * __init init_intel_microcode(void)
{
- struct cpuinfo_x86 *c = &cpu_data(0);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
cpu_has(c, X86_FEATURE_IA64)) {
diff --git a/kernel/arch/x86/kernel/cpu/microcode/intel_early.c b/kernel/arch/x86/kernel/cpu/microcode/intel_early.c
deleted file mode 100644
index 2f49ab4ac..000000000
--- a/kernel/arch/x86/kernel/cpu/microcode/intel_early.c
+++ /dev/null
@@ -1,786 +0,0 @@
-/*
- * Intel CPU microcode early update for Linux
- *
- * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- * H Peter Anvin" <hpa@zytor.com>
- *
- * This allows to early upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
- * Software Developer's Manual.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-/*
- * This needs to be before all headers so that pr_debug in printk.h doesn't turn
- * printk calls into no_printk().
- *
- *#define DEBUG
- */
-
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/earlycpio.h>
-#include <linux/initrd.h>
-#include <linux/cpu.h>
-#include <asm/msr.h>
-#include <asm/microcode_intel.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/setup.h>
-
-#undef pr_fmt
-#define pr_fmt(fmt) "microcode: " fmt
-
-static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
-static struct mc_saved_data {
- unsigned int mc_saved_count;
- struct microcode_intel **mc_saved;
-} mc_saved_data;
-
-static enum ucode_state
-load_microcode_early(struct microcode_intel **saved,
- unsigned int num_saved, struct ucode_cpu_info *uci)
-{
- struct microcode_intel *ucode_ptr, *new_mc = NULL;
- struct microcode_header_intel *mc_hdr;
- int new_rev, ret, i;
-
- new_rev = uci->cpu_sig.rev;
-
- for (i = 0; i < num_saved; i++) {
- ucode_ptr = saved[i];
- mc_hdr = (struct microcode_header_intel *)ucode_ptr;
-
- ret = get_matching_microcode(uci->cpu_sig.sig,
- uci->cpu_sig.pf,
- new_rev,
- ucode_ptr);
- if (!ret)
- continue;
-
- new_rev = mc_hdr->rev;
- new_mc = ucode_ptr;
- }
-
- if (!new_mc)
- return UCODE_NFOUND;
-
- uci->mc = (struct microcode_intel *)new_mc;
- return UCODE_OK;
-}
-
-static inline void
-copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
- unsigned long off, int num_saved)
-{
- int i;
-
- for (i = 0; i < num_saved; i++)
- mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
-}
-
-#ifdef CONFIG_X86_32
-static void
-microcode_phys(struct microcode_intel **mc_saved_tmp,
- struct mc_saved_data *mc_saved_data)
-{
- int i;
- struct microcode_intel ***mc_saved;
-
- mc_saved = (struct microcode_intel ***)
- __pa_nodebug(&mc_saved_data->mc_saved);
- for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
- struct microcode_intel *p;
-
- p = *(struct microcode_intel **)
- __pa_nodebug(mc_saved_data->mc_saved + i);
- mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
- }
-}
-#endif
-
-static enum ucode_state
-load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
- unsigned long initrd_start, struct ucode_cpu_info *uci)
-{
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int count = mc_saved_data->mc_saved_count;
-
- if (!mc_saved_data->mc_saved) {
- copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
-
- return load_microcode_early(mc_saved_tmp, count, uci);
- } else {
-#ifdef CONFIG_X86_32
- microcode_phys(mc_saved_tmp, mc_saved_data);
- return load_microcode_early(mc_saved_tmp, count, uci);
-#else
- return load_microcode_early(mc_saved_data->mc_saved,
- count, uci);
-#endif
- }
-}
-
-/*
- * Given CPU signature and a microcode patch, this function finds if the
- * microcode patch has matching family and model with the CPU.
- */
-static enum ucode_state
-matching_model_microcode(struct microcode_header_intel *mc_header,
- unsigned long sig)
-{
- unsigned int fam, model;
- unsigned int fam_ucode, model_ucode;
- struct extended_sigtable *ext_header;
- unsigned long total_size = get_totalsize(mc_header);
- unsigned long data_size = get_datasize(mc_header);
- int ext_sigcount, i;
- struct extended_signature *ext_sig;
-
- fam = __x86_family(sig);
- model = x86_model(sig);
-
- fam_ucode = __x86_family(mc_header->sig);
- model_ucode = x86_model(mc_header->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return UCODE_OK;
-
- /* Look for ext. headers: */
- if (total_size <= data_size + MC_HEADER_SIZE)
- return UCODE_NFOUND;
-
- ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
- ext_sigcount = ext_header->count;
-
- for (i = 0; i < ext_sigcount; i++) {
- fam_ucode = __x86_family(ext_sig->sig);
- model_ucode = x86_model(ext_sig->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return UCODE_OK;
-
- ext_sig++;
- }
- return UCODE_NFOUND;
-}
-
-static int
-save_microcode(struct mc_saved_data *mc_saved_data,
- struct microcode_intel **mc_saved_src,
- unsigned int mc_saved_count)
-{
- int i, j;
- struct microcode_intel **saved_ptr;
- int ret;
-
- if (!mc_saved_count)
- return -EINVAL;
-
- /*
- * Copy new microcode data.
- */
- saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
- if (!saved_ptr)
- return -ENOMEM;
-
- for (i = 0; i < mc_saved_count; i++) {
- struct microcode_header_intel *mc_hdr;
- struct microcode_intel *mc;
- unsigned long size;
-
- if (!mc_saved_src[i]) {
- ret = -EINVAL;
- goto err;
- }
-
- mc = mc_saved_src[i];
- mc_hdr = &mc->hdr;
- size = get_totalsize(mc_hdr);
-
- saved_ptr[i] = kmalloc(size, GFP_KERNEL);
- if (!saved_ptr[i]) {
- ret = -ENOMEM;
- goto err;
- }
-
- memcpy(saved_ptr[i], mc, size);
- }
-
- /*
- * Point to newly saved microcode.
- */
- mc_saved_data->mc_saved = saved_ptr;
- mc_saved_data->mc_saved_count = mc_saved_count;
-
- return 0;
-
-err:
- for (j = 0; j <= i; j++)
- kfree(saved_ptr[j]);
- kfree(saved_ptr);
-
- return ret;
-}
-
-/*
- * A microcode patch in ucode_ptr is saved into mc_saved
- * - if it has matching signature and newer revision compared to an existing
- * patch mc_saved.
- * - or if it is a newly discovered microcode patch.
- *
- * The microcode patch should have matching model with CPU.
- *
- * Returns: The updated number @num_saved of saved microcode patches.
- */
-static unsigned int _save_mc(struct microcode_intel **mc_saved,
- u8 *ucode_ptr, unsigned int num_saved)
-{
- struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- unsigned int sig, pf, new_rev;
- int found = 0, i;
-
- mc_hdr = (struct microcode_header_intel *)ucode_ptr;
-
- for (i = 0; i < num_saved; i++) {
- mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
- sig = mc_saved_hdr->sig;
- pf = mc_saved_hdr->pf;
- new_rev = mc_hdr->rev;
-
- if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
- continue;
-
- found = 1;
-
- if (!revision_is_newer(mc_hdr, new_rev))
- continue;
-
- /*
- * Found an older ucode saved earlier. Replace it with
- * this newer one.
- */
- mc_saved[i] = (struct microcode_intel *)ucode_ptr;
- break;
- }
-
- /* Newly detected microcode, save it to memory. */
- if (i >= num_saved && !found)
- mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
-
- return num_saved;
-}
-
-/*
- * Get microcode matching with BSP's model. Only CPUs with the same model as
- * BSP can stay in the platform.
- */
-static enum ucode_state __init
-get_matching_model_microcode(int cpu, unsigned long start,
- void *data, size_t size,
- struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- struct ucode_cpu_info *uci)
-{
- u8 *ucode_ptr = data;
- unsigned int leftover = size;
- enum ucode_state state = UCODE_OK;
- unsigned int mc_size;
- struct microcode_header_intel *mc_header;
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
- int i;
-
- while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) {
-
- if (leftover < sizeof(mc_header))
- break;
-
- mc_header = (struct microcode_header_intel *)ucode_ptr;
-
- mc_size = get_totalsize(mc_header);
- if (!mc_size || mc_size > leftover ||
- microcode_sanity_check(ucode_ptr, 0) < 0)
- break;
-
- leftover -= mc_size;
-
- /*
- * Since APs with same family and model as the BSP may boot in
- * the platform, we need to find and save microcode patches
- * with the same family and model as the BSP.
- */
- if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
- UCODE_OK) {
- ucode_ptr += mc_size;
- continue;
- }
-
- mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
-
- ucode_ptr += mc_size;
- }
-
- if (leftover) {
- state = UCODE_ERROR;
- goto out;
- }
-
- if (mc_saved_count == 0) {
- state = UCODE_NFOUND;
- goto out;
- }
-
- for (i = 0; i < mc_saved_count; i++)
- mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
-
- mc_saved_data->mc_saved_count = mc_saved_count;
-out:
- return state;
-}
-
-static int collect_cpu_info_early(struct ucode_cpu_info *uci)
-{
- unsigned int val[2];
- unsigned int family, model;
- struct cpu_signature csig;
- unsigned int eax, ebx, ecx, edx;
-
- csig.sig = 0;
- csig.pf = 0;
- csig.rev = 0;
-
- memset(uci, 0, sizeof(*uci));
-
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- csig.sig = eax;
-
- family = __x86_family(csig.sig);
- model = x86_model(csig.sig);
-
- if ((model >= 5) || (family > 6)) {
- /* get processor flags from MSR 0x17 */
- native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig.pf = 1 << ((val[1] >> 18) & 7);
- }
- native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
- /* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
-
- /* get the current revision from MSR 0x8B */
- native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
- csig.rev = val[1];
-
- uci->cpu_sig = csig;
- uci->valid = 1;
-
- return 0;
-}
-
-#ifdef DEBUG
-static void __ref show_saved_mc(void)
-{
- int i, j;
- unsigned int sig, pf, rev, total_size, data_size, date;
- struct ucode_cpu_info uci;
-
- if (mc_saved_data.mc_saved_count == 0) {
- pr_debug("no microcode data saved.\n");
- return;
- }
- pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
-
- collect_cpu_info_early(&uci);
-
- sig = uci.cpu_sig.sig;
- pf = uci.cpu_sig.pf;
- rev = uci.cpu_sig.rev;
- pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
-
- for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
- struct microcode_header_intel *mc_saved_header;
- struct extended_sigtable *ext_header;
- int ext_sigcount;
- struct extended_signature *ext_sig;
-
- mc_saved_header = (struct microcode_header_intel *)
- mc_saved_data.mc_saved[i];
- sig = mc_saved_header->sig;
- pf = mc_saved_header->pf;
- rev = mc_saved_header->rev;
- total_size = get_totalsize(mc_saved_header);
- data_size = get_datasize(mc_saved_header);
- date = mc_saved_header->date;
-
- pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n",
- i, sig, pf, rev, total_size,
- date & 0xffff,
- date >> 24,
- (date >> 16) & 0xff);
-
- /* Look for ext. headers: */
- if (total_size <= data_size + MC_HEADER_SIZE)
- continue;
-
- ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
- ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
-
- for (j = 0; j < ext_sigcount; j++) {
- sig = ext_sig->sig;
- pf = ext_sig->pf;
-
- pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
- j, sig, pf);
-
- ext_sig++;
- }
-
- }
-}
-#else
-static inline void show_saved_mc(void)
-{
-}
-#endif
-
-#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
-static DEFINE_MUTEX(x86_cpu_microcode_mutex);
-/*
- * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
- * hot added or resumes.
- *
- * Please make sure this mc should be a valid microcode patch before calling
- * this function.
- */
-int save_mc_for_early(u8 *mc)
-{
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int mc_saved_count_init;
- unsigned int mc_saved_count;
- struct microcode_intel **mc_saved;
- int ret = 0;
- int i;
-
- /*
- * Hold hotplug lock so mc_saved_data is not accessed by a CPU in
- * hotplug.
- */
- mutex_lock(&x86_cpu_microcode_mutex);
-
- mc_saved_count_init = mc_saved_data.mc_saved_count;
- mc_saved_count = mc_saved_data.mc_saved_count;
- mc_saved = mc_saved_data.mc_saved;
-
- if (mc_saved && mc_saved_count)
- memcpy(mc_saved_tmp, mc_saved,
- mc_saved_count * sizeof(struct microcode_intel *));
- /*
- * Save the microcode patch mc in mc_save_tmp structure if it's a newer
- * version.
- */
- mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
-
- /*
- * Save the mc_save_tmp in global mc_saved_data.
- */
- ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
- if (ret) {
- pr_err("Cannot save microcode patch.\n");
- goto out;
- }
-
- show_saved_mc();
-
- /*
- * Free old saved microcode data.
- */
- if (mc_saved) {
- for (i = 0; i < mc_saved_count_init; i++)
- kfree(mc_saved[i]);
- kfree(mc_saved);
- }
-
-out:
- mutex_unlock(&x86_cpu_microcode_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(save_mc_for_early);
-#endif
-
-static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
-static __init enum ucode_state
-scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
- unsigned long start, unsigned long size,
- struct ucode_cpu_info *uci)
-{
- struct cpio_data cd;
- long offset = 0;
-#ifdef CONFIG_X86_32
- char *p = (char *)__pa_nodebug(ucode_name);
-#else
- char *p = ucode_name;
-#endif
-
- cd.data = NULL;
- cd.size = 0;
-
- cd = find_cpio_data(p, (void *)start, size, &offset);
- if (!cd.data)
- return UCODE_ERROR;
-
- return get_matching_model_microcode(0, start, cd.data, cd.size,
- mc_saved_data, initrd, uci);
-}
-
-/*
- * Print ucode update info.
- */
-static void
-print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
-{
- int cpu = smp_processor_id();
-
- pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
- cpu,
- uci->cpu_sig.rev,
- date & 0xffff,
- date >> 24,
- (date >> 16) & 0xff);
-}
-
-#ifdef CONFIG_X86_32
-
-static int delay_ucode_info;
-static int current_mc_date;
-
-/*
- * Print early updated ucode info after printk works. This is delayed info dump.
- */
-void show_ucode_info_early(void)
-{
- struct ucode_cpu_info uci;
-
- if (delay_ucode_info) {
- collect_cpu_info_early(&uci);
- print_ucode_info(&uci, current_mc_date);
- delay_ucode_info = 0;
- }
-}
-
-/*
- * At this point, we can not call printk() yet. Keep microcode patch number in
- * mc_saved_data.mc_saved and delay printing microcode info in
- * show_ucode_info_early() until printk() works.
- */
-static void print_ucode(struct ucode_cpu_info *uci)
-{
- struct microcode_intel *mc_intel;
- int *delay_ucode_info_p;
- int *current_mc_date_p;
-
- mc_intel = uci->mc;
- if (mc_intel == NULL)
- return;
-
- delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
- current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
-
- *delay_ucode_info_p = 1;
- *current_mc_date_p = mc_intel->hdr.date;
-}
-#else
-
-/*
- * Flush global tlb. We only do this in x86_64 where paging has been enabled
- * already and PGE should be enabled as well.
- */
-static inline void flush_tlb_early(void)
-{
- __native_flush_tlb_global_irq_disabled();
-}
-
-static inline void print_ucode(struct ucode_cpu_info *uci)
-{
- struct microcode_intel *mc_intel;
-
- mc_intel = uci->mc;
- if (mc_intel == NULL)
- return;
-
- print_ucode_info(uci, mc_intel->hdr.date);
-}
-#endif
-
-static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
-{
- struct microcode_intel *mc_intel;
- unsigned int val[2];
-
- mc_intel = uci->mc;
- if (mc_intel == NULL)
- return 0;
-
- /* write microcode via MSR 0x79 */
- native_wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) mc_intel->bits,
- (unsigned long) mc_intel->bits >> 16 >> 16);
- native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
- /* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
-
- /* get the current revision from MSR 0x8B */
- native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- if (val[1] != mc_intel->hdr.rev)
- return -1;
-
-#ifdef CONFIG_X86_64
- /* Flush global tlb. This is precaution. */
- flush_tlb_early();
-#endif
- uci->cpu_sig.rev = val[1];
-
- if (early)
- print_ucode(uci);
- else
- print_ucode_info(uci, mc_intel->hdr.date);
-
- return 0;
-}
-
-/*
- * This function converts microcode patch offsets previously stored in
- * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
- */
-int __init save_microcode_in_initrd_intel(void)
-{
- unsigned int count = mc_saved_data.mc_saved_count;
- struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
- int ret = 0;
-
- if (count == 0)
- return ret;
-
- copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
- ret = save_microcode(&mc_saved_data, mc_saved, count);
- if (ret)
- pr_err("Cannot save microcode patches from initrd.\n");
-
- show_saved_mc();
-
- return ret;
-}
-
-static void __init
-_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
- unsigned long *initrd,
- unsigned long start, unsigned long size)
-{
- struct ucode_cpu_info uci;
- enum ucode_state ret;
-
- collect_cpu_info_early(&uci);
-
- ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
- if (ret != UCODE_OK)
- return;
-
- ret = load_microcode(mc_saved_data, initrd, start, &uci);
- if (ret != UCODE_OK)
- return;
-
- apply_microcode_early(&uci, true);
-}
-
-void __init load_ucode_intel_bsp(void)
-{
- u64 start, size;
-#ifdef CONFIG_X86_32
- struct boot_params *p;
-
- p = (struct boot_params *)__pa_nodebug(&boot_params);
- start = p->hdr.ramdisk_image;
- size = p->hdr.ramdisk_size;
-
- _load_ucode_intel_bsp(
- (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
- (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
- start, size);
-#else
- start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
- size = boot_params.hdr.ramdisk_size;
-
- _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
-#endif
-}
-
-void load_ucode_intel_ap(void)
-{
- struct mc_saved_data *mc_saved_data_p;
- struct ucode_cpu_info uci;
- unsigned long *mc_saved_in_initrd_p;
- unsigned long initrd_start_addr;
- enum ucode_state ret;
-#ifdef CONFIG_X86_32
- unsigned long *initrd_start_p;
-
- mc_saved_in_initrd_p =
- (unsigned long *)__pa_nodebug(mc_saved_in_initrd);
- mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
- initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start);
- initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p);
-#else
- mc_saved_data_p = &mc_saved_data;
- mc_saved_in_initrd_p = mc_saved_in_initrd;
- initrd_start_addr = initrd_start;
-#endif
-
- /*
- * If there is no valid ucode previously saved in memory, no need to
- * update ucode on this AP.
- */
- if (mc_saved_data_p->mc_saved_count == 0)
- return;
-
- collect_cpu_info_early(&uci);
- ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
- initrd_start_addr, &uci);
-
- if (ret != UCODE_OK)
- return;
-
- apply_microcode_early(&uci, true);
-}
-
-void reload_ucode_intel(void)
-{
- struct ucode_cpu_info uci;
- enum ucode_state ret;
-
- if (!mc_saved_data.mc_saved_count)
- return;
-
- collect_cpu_info_early(&uci);
-
- ret = load_microcode_early(mc_saved_data.mc_saved,
- mc_saved_data.mc_saved_count, &uci);
- if (ret != UCODE_OK)
- return;
-
- apply_microcode_early(&uci, false);
-}
diff --git a/kernel/arch/x86/kernel/cpu/microcode/intel_lib.c b/kernel/arch/x86/kernel/cpu/microcode/intel_lib.c
index cd47a510a..b96896bcb 100644
--- a/kernel/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/kernel/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -25,17 +25,23 @@
#include <linux/firmware.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <asm/microcode_intel.h>
#include <asm/processor.h>
#include <asm/msr.h>
-static inline int
-update_match_cpu(unsigned int csig, unsigned int cpf,
- unsigned int sig, unsigned int pf)
+static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
+ unsigned int s2, unsigned int p2)
{
- return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
+ if (s1 != s2)
+ return false;
+
+ /* Processor flags are either both 0 ... */
+ if (!p1 && !p2)
+ return true;
+
+ /* ... or they intersect. */
+ return p1 & p2;
}
int microcode_sanity_check(void *mc, int print_err)
@@ -124,27 +130,25 @@ EXPORT_SYMBOL_GPL(microcode_sanity_check);
/*
* Returns 1 if update has been found, 0 otherwise.
*/
-int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
+int find_matching_signature(void *mc, unsigned int csig, int cpf)
{
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header;
- unsigned long total_size = get_totalsize(mc_header);
- int ext_sigcount, i;
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_sigtable *ext_hdr;
struct extended_signature *ext_sig;
+ int i;
- if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
+ if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
return 1;
/* Look for ext. headers: */
- if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+ if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
return 0;
- ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
- ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+ ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
- for (i = 0; i < ext_sigcount; i++) {
- if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
return 1;
ext_sig++;
}
@@ -154,13 +158,13 @@ int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
/*
* Returns 1 if update has been found, 0 otherwise.
*/
-int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
+int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
{
struct microcode_header_intel *mc_hdr = mc;
- if (!revision_is_newer(mc_hdr, rev))
+ if (mc_hdr->rev <= new_rev)
return 0;
- return get_matching_sig(csig, cpf, rev, mc);
+ return find_matching_signature(mc, csig, cpf);
}
-EXPORT_SYMBOL_GPL(get_matching_microcode);
+EXPORT_SYMBOL_GPL(has_newer_microcode);
diff --git a/kernel/arch/x86/kernel/cpu/mshyperv.c b/kernel/arch/x86/kernel/cpu/mshyperv.c
index 939155ffd..20e242ea1 100644
--- a/kernel/arch/x86/kernel/cpu/mshyperv.c
+++ b/kernel/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
#include <linux/efi.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
+#include <linux/kexec.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv.h>
@@ -28,25 +29,26 @@
#include <asm/i8259.h>
#include <asm/apic.h>
#include <asm/timer.h>
+#include <asm/reboot.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
#if IS_ENABLED(CONFIG_HYPERV)
static void (*vmbus_handler)(void);
+static void (*hv_kexec_handler)(void);
+static void (*hv_crash_handler)(struct pt_regs *regs);
void hyperv_vector_handler(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- irq_enter();
- exit_idle();
-
+ entering_irq();
inc_irq_stat(irq_hv_callback_count);
if (vmbus_handler)
vmbus_handler();
- irq_exit();
+ exiting_irq();
set_irq_regs(old_regs);
}
@@ -69,7 +71,47 @@ void hv_remove_vmbus_irq(void)
}
EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
-#endif
+
+void hv_setup_kexec_handler(void (*handler)(void))
+{
+ hv_kexec_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);
+
+void hv_remove_kexec_handler(void)
+{
+ hv_kexec_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
+
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
+{
+ hv_crash_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_crash_handler);
+
+void hv_remove_crash_handler(void)
+{
+ hv_crash_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_crash_handler);
+
+#ifdef CONFIG_KEXEC_CORE
+static void hv_machine_shutdown(void)
+{
+ if (kexec_in_progress && hv_kexec_handler)
+ hv_kexec_handler();
+ native_machine_shutdown();
+}
+
+static void hv_machine_crash_shutdown(struct pt_regs *regs)
+{
+ if (hv_crash_handler)
+ hv_crash_handler(regs);
+ native_machine_crash_shutdown(regs);
+}
+#endif /* CONFIG_KEXEC_CORE */
+#endif /* CONFIG_HYPERV */
static uint32_t __init ms_hyperv_platform(void)
{
@@ -116,6 +158,7 @@ static void __init ms_hyperv_init_platform(void)
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
@@ -143,6 +186,11 @@ static void __init ms_hyperv_init_platform(void)
no_timer_check = 1;
#endif
+#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE)
+ machine_ops.shutdown = hv_machine_shutdown;
+ machine_ops.crash_shutdown = hv_machine_crash_shutdown;
+#endif
+ mark_tsc_unstable("running on Hyper-V");
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
diff --git a/kernel/arch/x86/kernel/cpu/mtrr/cleanup.c b/kernel/arch/x86/kernel/cpu/mtrr/cleanup.c
index 5f90b85ff..70d7c93f4 100644
--- a/kernel/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/kernel/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -98,7 +98,8 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
continue;
base = range_state[i].base_pfn;
if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
- (mtrr_state.enabled & 1)) {
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
/* Var MTRR contains UC entry below 1M? Skip it: */
printk(BIOS_BUG_MSG, i);
if (base + size <= (1<<(20-PAGE_SHIFT)))
diff --git a/kernel/arch/x86/kernel/cpu/mtrr/generic.c b/kernel/arch/x86/kernel/cpu/mtrr/generic.c
index 7d74f7b3c..3b533cf37 100644
--- a/kernel/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/kernel/arch/x86/kernel/cpu/mtrr/generic.c
@@ -102,59 +102,76 @@ static int check_type_overlap(u8 *prev, u8 *curr)
return 0;
}
-/*
- * Error/Semi-error returns:
- * 0xFF - when MTRR is not enabled
- * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
- * corresponds only to [start:*partial_end].
- * Caller has to lookup again for [*partial_end:end].
+/**
+ * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries
+ *
+ * Return the MTRR fixed memory type of 'start'.
+ *
+ * MTRR fixed entries are divided into the following ways:
+ * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges
+ * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges
+ * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - Matched memory type
+ * MTRR_TYPE_INVALID - Unmatched
+ */
+static u8 mtrr_type_lookup_fixed(u64 start, u64 end)
+{
+ int idx;
+
+ if (start >= 0x100000)
+ return MTRR_TYPE_INVALID;
+
+ /* 0x0 - 0x7FFFF */
+ if (start < 0x80000) {
+ idx = 0;
+ idx += (start >> 16);
+ return mtrr_state.fixed_ranges[idx];
+ /* 0x80000 - 0xBFFFF */
+ } else if (start < 0xC0000) {
+ idx = 1 * 8;
+ idx += ((start - 0x80000) >> 14);
+ return mtrr_state.fixed_ranges[idx];
+ }
+
+ /* 0xC0000 - 0xFFFFF */
+ idx = 3 * 8;
+ idx += ((start - 0xC0000) >> 12);
+ return mtrr_state.fixed_ranges[idx];
+}
+
+/**
+ * mtrr_type_lookup_variable - look up memory type in MTRR variable entries
+ *
+ * Return Value:
+ * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched)
+ *
+ * Output Arguments:
+ * repeat - Set to 1 when [start:end] spanned across MTRR range and type
+ * returned corresponds only to [start:*partial_end]. Caller has
+ * to lookup again for [*partial_end:end].
+ *
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ * region is fully covered by a single MTRR entry or the default
+ * type.
*/
-static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
+static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
+ int *repeat, u8 *uniform)
{
int i;
u64 base, mask;
u8 prev_match, curr_match;
*repeat = 0;
- if (!mtrr_state_set)
- return 0xFF;
-
- if (!mtrr_state.enabled)
- return 0xFF;
+ *uniform = 1;
- /* Make end inclusive end, instead of exclusive */
+ /* Make end inclusive instead of exclusive */
end--;
- /* Look in fixed ranges. Just return the type as per start */
- if (mtrr_state.have_fixed && (start < 0x100000)) {
- int idx;
-
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0x1000000) {
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state.fixed_ranges[idx];
- }
- }
-
- /*
- * Look in variable ranges
- * Look of multiple ranges matching this address and pick type
- * as per MTRR precedence
- */
- if (!(mtrr_state.enabled & 2))
- return mtrr_state.def_type;
-
- prev_match = 0xFF;
+ prev_match = MTRR_TYPE_INVALID;
for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state;
+ unsigned short start_state, end_state, inclusive;
if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
continue;
@@ -166,20 +183,29 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
start_state = ((start & mask) == (base & mask));
end_state = ((end & mask) == (base & mask));
+ inclusive = ((start < base) && (end > base));
- if (start_state != end_state) {
+ if ((start_state != end_state) || inclusive) {
/*
* We have start:end spanning across an MTRR.
- * We split the region into
- * either
- * (start:mtrr_end) (mtrr_end:end)
- * or
- * (start:mtrr_start) (mtrr_start:end)
+ * We split the region into either
+ *
+ * - start_state:1
+ * (start:mtrr_end)(mtrr_end:end)
+ * - end_state:1
+ * (start:mtrr_start)(mtrr_start:end)
+ * - inclusive:1
+ * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end)
+ *
* depending on kind of overlap.
- * Return the type for first region and a pointer to
- * the start of second region so that caller will
- * lookup again on the second region.
- * Note: This way we handle multiple overlaps as well.
+ *
+ * Return the type of the first region and a pointer
+ * to the start of next region so that caller will be
+ * advised to lookup again after having adjusted start
+ * and end.
+ *
+ * Note: This way we handle overlaps with multiple
+ * entries and the default type properly.
*/
if (start_state)
*partial_end = base + get_mtrr_size(mask);
@@ -193,59 +219,94 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
end = *partial_end - 1; /* end is inclusive */
*repeat = 1;
+ *uniform = 0;
}
if ((start & mask) != (base & mask))
continue;
curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
- if (prev_match == 0xFF) {
+ if (prev_match == MTRR_TYPE_INVALID) {
prev_match = curr_match;
continue;
}
+ *uniform = 0;
if (check_type_overlap(&prev_match, &curr_match))
return curr_match;
}
- if (mtrr_tom2) {
- if (start >= (1ULL<<32) && (end < mtrr_tom2))
- return MTRR_TYPE_WRBACK;
- }
-
- if (prev_match != 0xFF)
+ if (prev_match != MTRR_TYPE_INVALID)
return prev_match;
return mtrr_state.def_type;
}
-/*
- * Returns the effective MTRR type for the region
- * Error return:
- * 0xFF - when MTRR is not enabled
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ *
+ * Output Argument:
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ * region is fully covered by a single MTRR entry or the default
+ * type.
*/
-u8 mtrr_type_lookup(u64 start, u64 end)
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
{
- u8 type, prev_type;
+ u8 type, prev_type, is_uniform = 1, dummy;
int repeat;
u64 partial_end;
- type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+ if (!mtrr_state_set)
+ return MTRR_TYPE_INVALID;
+
+ if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
+ return MTRR_TYPE_INVALID;
+
+ /*
+ * Look up the fixed ranges first, which take priority over
+ * the variable ranges.
+ */
+ if ((start < 0x100000) &&
+ (mtrr_state.have_fixed) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
+ is_uniform = 0;
+ type = mtrr_type_lookup_fixed(start, end);
+ goto out;
+ }
+
+ /*
+ * Look up the variable ranges. Look of multiple ranges matching
+ * this address and pick type as per MTRR precedence.
+ */
+ type = mtrr_type_lookup_variable(start, end, &partial_end,
+ &repeat, &is_uniform);
/*
* Common path is with repeat = 0.
* However, we can have cases where [start:end] spans across some
- * MTRR range. Do repeated lookups for that case here.
+ * MTRR ranges and/or the default type. Do repeated lookups for
+ * that case here.
*/
while (repeat) {
prev_type = type;
start = partial_end;
- type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+ is_uniform = 0;
+ type = mtrr_type_lookup_variable(start, end, &partial_end,
+ &repeat, &dummy);
if (check_type_overlap(&prev_type, &type))
- return type;
+ goto out;
}
+ if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2))
+ type = MTRR_TYPE_WRBACK;
+
+out:
+ *uniform = is_uniform;
return type;
}
@@ -347,7 +408,9 @@ static void __init print_mtrr_state(void)
mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
pr_debug("MTRR fixed ranges %sabled:\n",
- mtrr_state.enabled & 1 ? "en" : "dis");
+ ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ?
+ "en" : "dis");
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -360,7 +423,7 @@ static void __init print_mtrr_state(void)
print_fixed_last();
}
pr_debug("MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & 2 ? "en" : "dis");
+ mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis");
high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
@@ -382,7 +445,7 @@ static void __init print_mtrr_state(void)
}
/* Grab all of the MTRR state for this CPU into *state */
-void __init get_mtrr_state(void)
+bool __init get_mtrr_state(void)
{
struct mtrr_var_range *vrs;
unsigned long flags;
@@ -426,6 +489,8 @@ void __init get_mtrr_state(void)
post_set();
local_irq_restore(flags);
+
+ return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
}
/* Some BIOS's are messed up and don't set all MTRRs the same! */
diff --git a/kernel/arch/x86/kernel/cpu/mtrr/main.c b/kernel/arch/x86/kernel/cpu/mtrr/main.c
index ea5f363a1..f891b4750 100644
--- a/kernel/arch/x86/kernel/cpu/mtrr/main.c
+++ b/kernel/arch/x86/kernel/cpu/mtrr/main.c
@@ -59,6 +59,12 @@
#define MTRR_TO_PHYS_WC_OFFSET 1000
u32 num_var_ranges;
+static bool __mtrr_enabled;
+
+static bool mtrr_enabled(void)
+{
+ return __mtrr_enabled;
+}
unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
@@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
int i, replace, error;
mtrr_type ltype;
- if (!mtrr_if)
+ if (!mtrr_enabled())
return -ENXIO;
error = mtrr_if->validate_add_page(base, size, type);
@@ -435,12 +441,13 @@ static int mtrr_check(unsigned long base, unsigned long size)
int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
bool increment)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
increment);
}
-EXPORT_SYMBOL(mtrr_add);
/**
* mtrr_del_page - delete a memory type region
@@ -463,8 +470,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
unsigned long lbase, lsize;
int error = -EINVAL;
- if (!mtrr_if)
- return -ENXIO;
+ if (!mtrr_enabled())
+ return -ENODEV;
max = num_var_ranges;
/* No CPU hotplug when we change MTRR entries */
@@ -523,11 +530,12 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
*/
int mtrr_del(int reg, unsigned long base, unsigned long size)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
}
-EXPORT_SYMBOL(mtrr_del);
/**
* arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
@@ -538,6 +546,9 @@ EXPORT_SYMBOL(mtrr_del);
* attempts to add a WC MTRR covering size bytes starting at base and
* logs an error if this fails.
*
+ * The called should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
* Drivers must store the return value to pass to mtrr_del_wc_if_needed,
* but drivers should not try to interpret that return value.
*/
@@ -545,7 +556,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size)
{
int ret;
- if (pat_enabled)
+ if (pat_enabled() || !mtrr_enabled())
return 0; /* Success! (We don't need to do anything.) */
ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
@@ -577,7 +588,7 @@ void arch_phys_wc_del(int handle)
EXPORT_SYMBOL(arch_phys_wc_del);
/*
- * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
* @handle: Return value from arch_phys_wc_add
*
* This will turn the return value from arch_phys_wc_add into an mtrr
@@ -587,14 +598,14 @@ EXPORT_SYMBOL(arch_phys_wc_del);
* in printk line. Alas there is an illegitimate use in some ancient
* drm ioctls.
*/
-int phys_wc_to_mtrr_index(int handle)
+int arch_phys_wc_index(int handle)
{
if (handle < MTRR_TO_PHYS_WC_OFFSET)
return -1;
else
return handle - MTRR_TO_PHYS_WC_OFFSET;
}
-EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
/*
* HACK ALERT!
@@ -734,10 +745,12 @@ void __init mtrr_bp_init(void)
}
if (mtrr_if) {
+ __mtrr_enabled = true;
set_num_var_ranges();
init_table();
if (use_intel()) {
- get_mtrr_state();
+ /* BIOS may override */
+ __mtrr_enabled = get_mtrr_state();
if (mtrr_cleanup(phys_addr)) {
changed_by_mtrr_cleanup = 1;
@@ -745,10 +758,16 @@ void __init mtrr_bp_init(void)
}
}
}
+
+ if (!mtrr_enabled())
+ pr_info("MTRR: Disabled\n");
}
void mtrr_ap_init(void)
{
+ if (!mtrr_enabled())
+ return;
+
if (!use_intel() || mtrr_aps_delayed_init)
return;
/*
@@ -774,6 +793,9 @@ void mtrr_save_state(void)
{
int first_cpu;
+ if (!mtrr_enabled())
+ return;
+
get_online_cpus();
first_cpu = cpumask_first(cpu_online_mask);
smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
@@ -782,6 +804,8 @@ void mtrr_save_state(void)
void set_mtrr_aps_delayed_init(void)
{
+ if (!mtrr_enabled())
+ return;
if (!use_intel())
return;
@@ -793,7 +817,7 @@ void set_mtrr_aps_delayed_init(void)
*/
void mtrr_aps_init(void)
{
- if (!use_intel())
+ if (!use_intel() || !mtrr_enabled())
return;
/*
@@ -810,7 +834,7 @@ void mtrr_aps_init(void)
void mtrr_bp_restore(void)
{
- if (!use_intel())
+ if (!use_intel() || !mtrr_enabled())
return;
mtrr_if->set_all();
@@ -818,7 +842,7 @@ void mtrr_bp_restore(void)
static int __init mtrr_init_finialize(void)
{
- if (!mtrr_if)
+ if (!mtrr_enabled())
return 0;
if (use_intel()) {
diff --git a/kernel/arch/x86/kernel/cpu/mtrr/mtrr.h b/kernel/arch/x86/kernel/cpu/mtrr/mtrr.h
index df5e41f31..951884dcc 100644
--- a/kernel/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/kernel/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
-void get_mtrr_state(void);
+bool get_mtrr_state(void);
extern void set_mtrr_ops(const struct mtrr_ops *ops);
diff --git a/kernel/arch/x86/kernel/cpu/perf_event.c b/kernel/arch/x86/kernel/cpu/perf_event.c
index 4cc98a4e8..2bf79d7c9 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event.c
+++ b/kernel/arch/x86/kernel/cpu/perf_event.c
@@ -5,7 +5,7 @@
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
* Copyright (C) 2009 Google, Inc., Stephane Eranian
*
@@ -135,6 +135,7 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
}
static atomic_t active_events;
+static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);
#ifdef CONFIG_X86_LOCAL_APIC
@@ -271,6 +272,7 @@ msr_fail:
static void hw_perf_event_destroy(struct perf_event *event)
{
x86_release_hardware();
+ atomic_dec(&active_events);
}
void hw_perf_lbr_event_destroy(struct perf_event *event)
@@ -324,16 +326,16 @@ int x86_reserve_hardware(void)
{
int err = 0;
- if (!atomic_inc_not_zero(&active_events)) {
+ if (!atomic_inc_not_zero(&pmc_refcount)) {
mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&active_events) == 0) {
+ if (atomic_read(&pmc_refcount) == 0) {
if (!reserve_pmc_hardware())
err = -EBUSY;
else
reserve_ds_buffers();
}
if (!err)
- atomic_inc(&active_events);
+ atomic_inc(&pmc_refcount);
mutex_unlock(&pmc_reserve_mutex);
}
@@ -342,7 +344,7 @@ int x86_reserve_hardware(void)
void x86_release_hardware(void)
{
- if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
+ if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
release_pmc_hardware();
release_ds_buffers();
mutex_unlock(&pmc_reserve_mutex);
@@ -355,28 +357,30 @@ void x86_release_hardware(void)
*/
int x86_add_exclusive(unsigned int what)
{
- int ret = -EBUSY, i;
-
- if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
- return 0;
+ int i;
- mutex_lock(&pmc_reserve_mutex);
- for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
- if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
- goto out;
+ if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
+ mutex_lock(&pmc_reserve_mutex);
+ for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
+ if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+ goto fail_unlock;
+ }
+ atomic_inc(&x86_pmu.lbr_exclusive[what]);
+ mutex_unlock(&pmc_reserve_mutex);
}
- atomic_inc(&x86_pmu.lbr_exclusive[what]);
- ret = 0;
+ atomic_inc(&active_events);
+ return 0;
-out:
+fail_unlock:
mutex_unlock(&pmc_reserve_mutex);
- return ret;
+ return -EBUSY;
}
void x86_del_exclusive(unsigned int what)
{
atomic_dec(&x86_pmu.lbr_exclusive[what]);
+ atomic_dec(&active_events);
}
int x86_setup_perfctr(struct perf_event *event)
@@ -557,6 +561,7 @@ static int __x86_pmu_event_init(struct perf_event *event)
if (err)
return err;
+ atomic_inc(&active_events);
event->destroy = hw_perf_event_destroy;
event->hw.idx = -1;
@@ -895,10 +900,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
if (x86_pmu.commit_scheduling)
x86_pmu.commit_scheduling(cpuc, i, assign[i]);
}
- }
-
- if (!assign || unsched) {
-
+ } else {
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
/*
@@ -1111,13 +1113,16 @@ int x86_perf_event_set_period(struct perf_event *event)
per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
- /*
- * The hw event starts counting from this event offset,
- * mark it to be able to extra future deltas:
- */
- local64_set(&hwc->prev_count, (u64)-left);
+ if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
+ local64_read(&hwc->prev_count) != (u64)-left) {
+ /*
+ * The hw event starts counting from this event offset,
+ * mark it to be able to extra future deltas:
+ */
+ local64_set(&hwc->prev_count, (u64)-left);
- wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+ wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+ }
/*
* Due to erratum on certan cpu we need
@@ -1170,7 +1175,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
* skip the schedulability test here, it will be performed
* at commit time (->commit_txn) as a whole.
*/
- if (cpuc->group_flag & PERF_EVENT_TXN)
+ if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto done_collect;
ret = x86_pmu.schedule_events(cpuc, n, assign);
@@ -1321,7 +1326,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
* XXX assumes any ->del() called during a TXN will only be on
* an event added during that same TXN.
*/
- if (cpuc->group_flag & PERF_EVENT_TXN)
+ if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
return;
/*
@@ -1429,6 +1434,10 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
u64 finish_clock;
int ret;
+ /*
+ * All PMUs/events that share this PMI handler should make sure to
+ * increment active_events for their events.
+ */
if (!atomic_read(&active_events))
return NMI_DONE;
@@ -1542,7 +1551,7 @@ static void __init filter_events(struct attribute **attrs)
}
/* Merge two pointer arrays */
-static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
+__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
{
struct attribute **new;
int j, i;
@@ -1739,11 +1748,22 @@ static inline void x86_pmu_read(struct perf_event *event)
* Start group events scheduling transaction
* Set the flag to make pmu::enable() not perform the
* schedulability test, it will be performed at commit time
+ *
+ * We only support PERF_PMU_TXN_ADD transactions. Save the
+ * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
+ * transactions.
*/
-static void x86_pmu_start_txn(struct pmu *pmu)
+static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */
+
+ cpuc->txn_flags = txn_flags;
+ if (txn_flags & ~PERF_PMU_TXN_ADD)
+ return;
+
perf_pmu_disable(pmu);
- __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
__this_cpu_write(cpu_hw_events.n_txn, 0);
}
@@ -1754,7 +1774,16 @@ static void x86_pmu_start_txn(struct pmu *pmu)
*/
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
- __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
+ unsigned int txn_flags;
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+ txn_flags = cpuc->txn_flags;
+ cpuc->txn_flags = 0;
+ if (txn_flags & ~PERF_PMU_TXN_ADD)
+ return;
+
/*
* Truncate collected array by the number of events added in this
* transaction. See x86_pmu_add() and x86_pmu_*_txn().
@@ -1777,6 +1806,13 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
int assign[X86_PMC_IDX_MAX];
int n, ret;
+ WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+ if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
+ cpuc->txn_flags = 0;
+ return 0;
+ }
+
n = cpuc->n_events;
if (!x86_pmu_initialized())
@@ -1792,7 +1828,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
- cpuc->group_flag &= ~PERF_EVENT_TXN;
+ cpuc->txn_flags = 0;
perf_pmu_enable(pmu);
return 0;
}
@@ -2170,6 +2206,7 @@ static unsigned long get_segment_base(unsigned int segment)
int idx = segment >> 3;
if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
if (idx > LDT_ENTRIES)
@@ -2181,6 +2218,9 @@ static unsigned long get_segment_base(unsigned int segment)
return 0;
desc = &ldt->entries[idx];
+#else
+ return 0;
+#endif
} else {
if (idx > GDT_ENTRIES)
return 0;
@@ -2191,7 +2231,7 @@ static unsigned long get_segment_base(unsigned int segment)
return get_desc_base(desc);
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_IA32_EMULATION
#include <asm/compat.h>
diff --git a/kernel/arch/x86/kernel/cpu/perf_event.h b/kernel/arch/x86/kernel/cpu/perf_event.h
index f068695ea..d0e35ebb2 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event.h
+++ b/kernel/arch/x86/kernel/cpu/perf_event.h
@@ -5,7 +5,7 @@
* Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
* Copyright (C) 2009 Google, Inc., Stephane Eranian
*
@@ -47,6 +47,7 @@ enum extra_reg_type {
EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
EXTRA_REG_LBR = 2, /* lbr_select */
EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
+ EXTRA_REG_FE = 4, /* fe_* */
EXTRA_REG_MAX /* number of entries needed */
};
@@ -75,6 +76,8 @@ struct event_constraint {
#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */
#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */
#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */
+#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */
struct amd_nb {
@@ -88,6 +91,18 @@ struct amd_nb {
#define MAX_PEBS_EVENTS 8
/*
+ * Flags PEBS can handle without an PMI.
+ *
+ * TID can only be handled by flushing at context switch.
+ *
+ */
+#define PEBS_FREERUNNING_FLAGS \
+ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+ PERF_SAMPLE_TRANSACTION)
+
+/*
* A debug store configuration.
*
* We only support architectures that use 64bit fields.
@@ -133,7 +148,6 @@ enum intel_excl_state_type {
};
struct intel_excl_states {
- enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
enum intel_excl_state_type state[X86_PMC_IDX_MAX];
bool sched_started; /* true if scheduling has started */
};
@@ -152,7 +166,7 @@ struct intel_excl_cntrs {
unsigned core_id; /* per-core: core id */
};
-#define MAX_LBR_ENTRIES 16
+#define MAX_LBR_ENTRIES 32
enum {
X86_PERF_KFREE_SHARED = 0,
@@ -182,7 +196,7 @@ struct cpu_hw_events {
int n_excl; /* the number of exclusive events */
- unsigned int group_flag;
+ unsigned int txn_flags;
int is_fake;
/*
@@ -373,7 +387,7 @@ struct cpu_hw_events {
/* Check flags and event code/umask, and set the HSW N/A flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
__EVENT_CONSTRAINT(code, n, \
- INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
@@ -527,10 +541,10 @@ struct x86_pmu {
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
- void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
-
void (*start_scheduling)(struct cpu_hw_events *cpuc);
+ void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
+
void (*stop_scheduling)(struct cpu_hw_events *cpuc);
struct event_constraint *event_constraints;
@@ -581,6 +595,7 @@ struct x86_pmu {
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
int max_pebs_events;
+ unsigned long free_running_flags;
/*
* Intel LBR
@@ -611,6 +626,8 @@ struct x86_pmu {
struct x86_perf_task_context {
u64 lbr_from[MAX_LBR_ENTRIES];
u64 lbr_to[MAX_LBR_ENTRIES];
+ u64 lbr_info[MAX_LBR_ENTRIES];
+ int tos;
int lbr_callstack_users;
int lbr_stack_state;
};
@@ -780,6 +797,8 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
ssize_t intel_event_sysfs_show(char *page, u64 config);
+struct attribute **merge_attr(struct attribute **a, struct attribute **b);
+
#ifdef CONFIG_CPU_SUP_AMD
int amd_pmu_init(void);
@@ -795,20 +814,6 @@ static inline int amd_pmu_init(void)
#ifdef CONFIG_CPU_SUP_INTEL
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
- /* user explicitly requested branch sampling */
- if (has_branch_stack(event))
- return true;
-
- /* implicit branch sampling to correct PEBS skid */
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2)
- return true;
-
- return false;
-}
-
static inline bool intel_pmu_has_bts(struct perf_event *event)
{
if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
@@ -860,6 +865,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[];
extern struct event_constraint intel_hsw_pebs_event_constraints[];
+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_enable(struct perf_event *event);
@@ -870,6 +877,8 @@ void intel_pmu_pebs_enable_all(void);
void intel_pmu_pebs_disable_all(void);
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+
void intel_ds_init(void);
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
@@ -896,6 +905,8 @@ void intel_pmu_lbr_init_snb(void);
void intel_pmu_lbr_init_hsw(void);
+void intel_pmu_lbr_init_skl(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
@@ -919,6 +930,7 @@ static inline int is_ht_workaround_enabled(void)
{
return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
}
+
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel.c b/kernel/arch/x86/kernel/cpu/perf_event_intel.c
index 2813ea0f1..e2a430021 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,7 +12,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/export.h>
-#include <linux/watchdog.h>
+#include <linux/nmi.h>
#include <asm/cpufeature.h>
#include <asm/hardirq.h>
@@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_skl_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ EVENT_CONSTRAINT_END
+};
+
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -193,6 +201,18 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ /*
+ * Note the low 8 bits eventsel code is not a continuous field, containing
+ * some #GPing bits. These are masked out.
+ */
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ EVENT_EXTRA_END
+};
+
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
@@ -212,7 +232,7 @@ static struct event_constraint intel_hsw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
- INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
/* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
@@ -235,7 +255,7 @@ struct event_constraint intel_bdw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */
+ INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
EVENT_CONSTRAINT_END
};
@@ -244,6 +264,200 @@ static u64 intel_pmu_event_map(int hw_event)
return intel_perfmon_event_map[hw_event];
}
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD BIT_ULL(0)
+#define SKL_DEMAND_RFO BIT_ULL(1)
+#define SKL_ANY_RESPONSE BIT_ULL(16)
+#define SKL_SUPPLIER_NONE BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29)
+#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT BIT_ULL(30)
+#define SKL_SNOOP_NONE BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define SKL_SNOOP_MISS BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define SKL_SNOOP_HITM BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM BIT_ULL(37)
+#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
#define SNB_DMND_DATA_RD (1ULL << 0)
#define SNB_DMND_RFO (1ULL << 1)
#define SNB_DMND_IFETCH (1ULL << 2)
@@ -1114,7 +1328,7 @@ static struct extra_reg intel_slm_extra_regs[] __read_mostly =
{
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
- INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
EVENT_EXTRA_END
};
@@ -1594,6 +1808,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
loops = 0;
again:
+ intel_pmu_lbr_read();
intel_pmu_ack_status(status);
if (++loops > 100) {
static bool warned = false;
@@ -1608,16 +1823,16 @@ again:
inc_irq_stat(apic_perf_irqs);
- intel_pmu_lbr_read();
/*
- * CondChgd bit 63 doesn't mean any overflow status. Ignore
- * and clear the bit.
+ * Ignore a range of extra bits in status that do not indicate
+ * overflow by themselves.
*/
- if (__test_and_clear_bit(63, (unsigned long *)&status)) {
- if (!status)
- goto done;
- }
+ status &= ~(GLOBAL_STATUS_COND_CHG |
+ GLOBAL_STATUS_ASIF |
+ GLOBAL_STATUS_LBRS_FROZEN);
+ if (!status)
+ goto done;
/*
* PEBS overflow sets bit 62 in the global status register
@@ -1699,18 +1914,22 @@ intel_bts_constraints(struct perf_event *event)
return NULL;
}
-static int intel_alt_er(int idx)
+static int intel_alt_er(int idx, u64 config)
{
+ int alt_idx;
if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
return idx;
if (idx == EXTRA_REG_RSP_0)
- return EXTRA_REG_RSP_1;
+ alt_idx = EXTRA_REG_RSP_1;
if (idx == EXTRA_REG_RSP_1)
- return EXTRA_REG_RSP_0;
+ alt_idx = EXTRA_REG_RSP_0;
+
+ if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+ return idx;
- return idx;
+ return alt_idx;
}
static void intel_fixup_er(struct perf_event *event, int idx)
@@ -1799,7 +2018,7 @@ again:
*/
c = NULL;
} else {
- idx = intel_alt_er(idx);
+ idx = intel_alt_er(idx, reg->config);
if (idx != reg->idx) {
raw_spin_unlock_irqrestore(&era->lock, flags);
goto again;
@@ -1903,9 +2122,8 @@ static void
intel_start_scheduling(struct cpu_hw_events *cpuc)
{
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xl, *xlo;
+ struct intel_excl_states *xl;
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid; /* sibling thread */
/*
* nothing needed if in group validation mode
@@ -1916,10 +2134,9 @@ intel_start_scheduling(struct cpu_hw_events *cpuc)
/*
* no exclusion needed
*/
- if (!excl_cntrs)
+ if (WARN_ON_ONCE(!excl_cntrs))
return;
- xlo = &excl_cntrs->states[o_tid];
xl = &excl_cntrs->states[tid];
xl->sched_started = true;
@@ -1928,22 +2145,41 @@ intel_start_scheduling(struct cpu_hw_events *cpuc)
* in stop_event_scheduling()
* makes scheduling appear as a transaction
*/
- WARN_ON_ONCE(!irqs_disabled());
raw_spin_lock(&excl_cntrs->lock);
+}
- /*
- * save initial state of sibling thread
- */
- memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct event_constraint *c = cpuc->event_constraint[idx];
+ struct intel_excl_states *xl;
+ int tid = cpuc->excl_thread_id;
+
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return;
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+ return;
+
+ xl = &excl_cntrs->states[tid];
+
+ lockdep_assert_held(&excl_cntrs->lock);
+
+ if (c->flags & PERF_X86_EVENT_EXCL)
+ xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
+ else
+ xl->state[cntr] = INTEL_EXCL_SHARED;
}
static void
intel_stop_scheduling(struct cpu_hw_events *cpuc)
{
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xl, *xlo;
+ struct intel_excl_states *xl;
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid; /* sibling thread */
/*
* nothing needed if in group validation mode
@@ -1953,17 +2189,11 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc)
/*
* no exclusion needed
*/
- if (!excl_cntrs)
+ if (WARN_ON_ONCE(!excl_cntrs))
return;
- xlo = &excl_cntrs->states[o_tid];
xl = &excl_cntrs->states[tid];
- /*
- * make new sibling thread state visible
- */
- memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
-
xl->sched_started = false;
/*
* release shared state lock (acquired in intel_start_scheduling())
@@ -1975,12 +2205,10 @@ static struct event_constraint *
intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
int idx, struct event_constraint *c)
{
- struct event_constraint *cx;
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xl, *xlo;
- int is_excl, i;
+ struct intel_excl_states *xlo;
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid; /* alternate */
+ int is_excl, i;
/*
* validating a group does not require
@@ -1992,27 +2220,8 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
/*
* no exclusion needed
*/
- if (!excl_cntrs)
+ if (WARN_ON_ONCE(!excl_cntrs))
return c;
- /*
- * event requires exclusive counter access
- * across HT threads
- */
- is_excl = c->flags & PERF_X86_EVENT_EXCL;
- if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
- event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
- if (!cpuc->n_excl++)
- WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
- }
-
- /*
- * xl = state of current HT
- * xlo = state of sibling HT
- */
- xl = &excl_cntrs->states[tid];
- xlo = &excl_cntrs->states[o_tid];
-
- cx = c;
/*
* because we modify the constraint, we need
@@ -2023,10 +2232,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
* been cloned (marked dynamic)
*/
if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
-
- /* sanity check */
- if (idx < 0)
- return &emptyconstraint;
+ struct event_constraint *cx;
/*
* grab pre-allocated constraint entry
@@ -2037,13 +2243,14 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
* initialize dynamic constraint
* with static constraint
*/
- memcpy(cx, c, sizeof(*cx));
+ *cx = *c;
/*
* mark constraint as dynamic, so we
* can free it later on
*/
cx->flags |= PERF_X86_EVENT_DYNAMIC;
+ c = cx;
}
/*
@@ -2054,6 +2261,22 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
*/
/*
+ * state of sibling HT
+ */
+ xlo = &excl_cntrs->states[tid ^ 1];
+
+ /*
+ * event requires exclusive counter access
+ * across HT threads
+ */
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+ if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
+ event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+ if (!cpuc->n_excl++)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+ }
+
+ /*
* Modify static constraint with current dynamic
* state of thread
*
@@ -2061,46 +2284,49 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
* SHARED : sibling counter measuring non-exclusive event
* UNUSED : sibling counter unused
*/
- for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+ for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
/*
* exclusive event in sibling counter
* our corresponding counter cannot be used
* regardless of our event
*/
- if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
- __clear_bit(i, cx->idxmsk);
+ if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
+ __clear_bit(i, c->idxmsk);
/*
* if measuring an exclusive event, sibling
* measuring non-exclusive, then counter cannot
* be used
*/
- if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
- __clear_bit(i, cx->idxmsk);
+ if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
+ __clear_bit(i, c->idxmsk);
}
/*
* recompute actual bit weight for scheduling algorithm
*/
- cx->weight = hweight64(cx->idxmsk64);
+ c->weight = hweight64(c->idxmsk64);
/*
* if we return an empty mask, then switch
* back to static empty constraint to avoid
* the cost of freeing later on
*/
- if (cx->weight == 0)
- cx = &emptyconstraint;
+ if (c->weight == 0)
+ c = &emptyconstraint;
- return cx;
+ return c;
}
static struct event_constraint *
intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
{
- struct event_constraint *c1 = cpuc->event_constraint[idx];
+ struct event_constraint *c1 = NULL;
struct event_constraint *c2;
+ if (idx >= 0) /* fake does < 0 */
+ c1 = cpuc->event_constraint[idx];
+
/*
* first time only
* - static constraint: no change across incremental scheduling calls
@@ -2124,10 +2350,8 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
{
struct hw_perf_event *hwc = &event->hw;
struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct intel_excl_states *xlo, *xl;
- unsigned long flags = 0; /* keep compiler happy */
int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid;
+ struct intel_excl_states *xl;
/*
* nothing needed if in group validation mode
@@ -2135,13 +2359,9 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
if (cpuc->is_fake)
return;
- WARN_ON_ONCE(!excl_cntrs);
-
- if (!excl_cntrs)
+ if (WARN_ON_ONCE(!excl_cntrs))
return;
- xl = &excl_cntrs->states[tid];
- xlo = &excl_cntrs->states[o_tid];
if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
if (!--cpuc->n_excl)
@@ -2149,22 +2369,25 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
}
/*
- * put_constraint may be called from x86_schedule_events()
- * which already has the lock held so here make locking
- * conditional
+ * If event was actually assigned, then mark the counter state as
+ * unused now.
*/
- if (!xl->sched_started)
- raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+ if (hwc->idx >= 0) {
+ xl = &excl_cntrs->states[tid];
- /*
- * if event was actually assigned, then mark the
- * counter state as unused now
- */
- if (hwc->idx >= 0)
- xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+ /*
+ * put_constraint may be called from x86_schedule_events()
+ * which already has the lock held so here make locking
+ * conditional.
+ */
+ if (!xl->sched_started)
+ raw_spin_lock(&excl_cntrs->lock);
+
+ xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
- if (!xl->sched_started)
- raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
+ if (!xl->sched_started)
+ raw_spin_unlock(&excl_cntrs->lock);
+ }
}
static void
@@ -2196,41 +2419,6 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
intel_put_excl_constraints(cpuc, event);
}
-static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
-{
- struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
- struct event_constraint *c = cpuc->event_constraint[idx];
- struct intel_excl_states *xlo, *xl;
- int tid = cpuc->excl_thread_id;
- int o_tid = 1 - tid;
- int is_excl;
-
- if (cpuc->is_fake || !c)
- return;
-
- is_excl = c->flags & PERF_X86_EVENT_EXCL;
-
- if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
- return;
-
- WARN_ON_ONCE(!excl_cntrs);
-
- if (!excl_cntrs)
- return;
-
- xl = &excl_cntrs->states[tid];
- xlo = &excl_cntrs->states[o_tid];
-
- WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
-
- if (cntr >= 0) {
- if (is_excl)
- xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
- else
- xlo->init_state[cntr] = INTEL_EXCL_SHARED;
- }
-}
-
static void intel_pebs_aliases_core2(struct perf_event *event)
{
if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
@@ -2287,6 +2475,15 @@ static void intel_pebs_aliases_snb(struct perf_event *event)
}
}
+static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
+{
+ unsigned long flags = x86_pmu.free_running_flags;
+
+ if (event->attr.use_clockid)
+ flags &= ~PERF_SAMPLE_TIME;
+ return flags;
+}
+
static int intel_pmu_hw_config(struct perf_event *event)
{
int ret = x86_pmu_hw_config(event);
@@ -2294,8 +2491,16 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (ret)
return ret;
- if (event->attr.precise_ip && x86_pmu.pebs_aliases)
- x86_pmu.pebs_aliases(event);
+ if (event->attr.precise_ip) {
+ if (!event->attr.freq) {
+ event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
+ if (!(event->attr.sample_type &
+ ~intel_pmu_free_running_flags(event)))
+ event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
+ }
+ if (x86_pmu.pebs_aliases)
+ x86_pmu.pebs_aliases(event);
+ }
if (needs_branch_stack(event)) {
ret = intel_pmu_setup_lbr_filter(event);
@@ -2544,19 +2749,11 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
{
struct intel_excl_cntrs *c;
- int i;
c = kzalloc_node(sizeof(struct intel_excl_cntrs),
GFP_KERNEL, cpu_to_node(cpu));
if (c) {
raw_spin_lock_init(&c->lock);
- for (i = 0; i < X86_PMC_IDX_MAX; i++) {
- c->states[0].state[i] = INTEL_EXCL_UNUSED;
- c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
-
- c->states[1].state[i] = INTEL_EXCL_UNUSED;
- c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
- }
c->core_id = -1;
}
return c;
@@ -2569,7 +2766,7 @@ static int intel_pmu_cpu_prepare(int cpu)
if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
cpuc->shared_regs = allocate_shared_regs(cpu);
if (!cpuc->shared_regs)
- return NOTIFY_BAD;
+ goto err;
}
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
@@ -2577,18 +2774,27 @@ static int intel_pmu_cpu_prepare(int cpu)
cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
if (!cpuc->constraint_list)
- return NOTIFY_BAD;
+ goto err_shared_regs;
cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
- if (!cpuc->excl_cntrs) {
- kfree(cpuc->constraint_list);
- kfree(cpuc->shared_regs);
- return NOTIFY_BAD;
- }
+ if (!cpuc->excl_cntrs)
+ goto err_constraint_list;
+
cpuc->excl_thread_id = 0;
}
return NOTIFY_OK;
+
+err_constraint_list:
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
+
+err_shared_regs:
+ kfree(cpuc->shared_regs);
+ cpuc->shared_regs = NULL;
+
+err:
+ return NOTIFY_BAD;
}
static void intel_pmu_cpu_starting(int cpu)
@@ -2611,7 +2817,7 @@ static void intel_pmu_cpu_starting(int cpu)
if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
- for_each_cpu(i, topology_thread_cpumask(cpu)) {
+ for_each_cpu(i, topology_sibling_cpumask(cpu)) {
struct intel_shared_regs *pc;
pc = per_cpu(cpu_hw_events, i).shared_regs;
@@ -2629,7 +2835,7 @@ static void intel_pmu_cpu_starting(int cpu)
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
- for_each_cpu(i, topology_thread_cpumask(cpu)) {
+ for_each_cpu(i, topology_sibling_cpumask(cpu)) {
struct intel_excl_cntrs *c;
c = per_cpu(cpu_hw_events, i).excl_cntrs;
@@ -2677,10 +2883,21 @@ static void intel_pmu_cpu_dying(int cpu)
fini_debug_store_on_cpu(cpu);
}
+static void intel_pmu_sched_task(struct perf_event_context *ctx,
+ bool sched_in)
+{
+ if (x86_pmu.pebs_active)
+ intel_pmu_pebs_sched_task(ctx, sched_in);
+ if (x86_pmu.lbr_nr)
+ intel_pmu_lbr_sched_task(ctx, sched_in);
+}
+
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
PMU_FORMAT_ATTR(ldlat, "config1:0-15");
+PMU_FORMAT_ATTR(frontend, "config1:0-23");
+
static struct attribute *intel_arch3_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
@@ -2697,6 +2914,11 @@ static struct attribute *intel_arch3_formats_attr[] = {
NULL,
};
+static struct attribute *skl_format_attr[] = {
+ &format_attr_frontend.attr,
+ NULL,
+};
+
static __initconst const struct x86_pmu core_pmu = {
.name = "core",
.handle_irq = x86_pmu_handle_irq,
@@ -2711,6 +2933,8 @@ static __initconst const struct x86_pmu core_pmu = {
.event_map = intel_pmu_event_map,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
+ .free_running_flags = PEBS_FREERUNNING_FLAGS,
+
/*
* Intel PMCs cannot be accessed sanely above 32-bit width,
* so we install an artificial 1<<31 period regardless of
@@ -2749,6 +2973,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.event_map = intel_pmu_event_map,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
+ .free_running_flags = PEBS_FREERUNNING_FLAGS,
/*
* Intel PMCs cannot be accessed sanely above 32 bit width,
* so we install an artificial 1<<31 period regardless of
@@ -2766,7 +2991,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
- .sched_task = intel_pmu_lbr_sched_task,
+ .sched_task = intel_pmu_sched_task,
};
static __init void intel_clovertown_quirk(void)
@@ -2939,8 +3164,8 @@ static __init void intel_ht_bug(void)
{
x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
- x86_pmu.commit_scheduling = intel_commit_scheduling;
x86_pmu.start_scheduling = intel_start_scheduling;
+ x86_pmu.commit_scheduling = intel_commit_scheduling;
x86_pmu.stop_scheduling = intel_stop_scheduling;
}
@@ -3286,6 +3511,30 @@ __init int intel_pmu_init(void)
pr_cont("Broadwell events, ");
break;
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_skl();
+
+ x86_pmu.event_constraints = intel_skl_event_constraints;
+ x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_skl_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
+ skl_format_attr);
+ WARN_ON(!x86_pmu.format_attrs);
+ x86_pmu.cpu_events = hsw_events_attrs;
+ pr_cont("Skylake events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -3355,7 +3604,7 @@ __init int intel_pmu_init(void)
*/
if (x86_pmu.extra_regs) {
for (er = x86_pmu.extra_regs; er->msr; er++) {
- er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+ er->extra_msr_access = check_msr(er->msr, 0x11UL);
/* Disable LBR select mapping */
if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
x86_pmu.lbr_sel_map = NULL;
@@ -3388,21 +3637,24 @@ static __init int fixup_ht_bug(void)
if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
return 0;
- w = cpumask_weight(topology_thread_cpumask(cpu));
+ w = cpumask_weight(topology_sibling_cpumask(cpu));
if (w > 1) {
pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
return 0;
}
- watchdog_nmi_disable_all();
+ if (lockup_detector_suspend() != 0) {
+ pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+ return 0;
+ }
x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
- x86_pmu.commit_scheduling = NULL;
x86_pmu.start_scheduling = NULL;
+ x86_pmu.commit_scheduling = NULL;
x86_pmu.stop_scheduling = NULL;
- watchdog_nmi_enable_all();
+ lockup_detector_resume();
get_online_cpus();
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_bts.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_bts.c
index 7795f3f8b..2cad71d1b 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -62,9 +62,6 @@ struct bts_buffer {
struct pmu bts_pmu;
-void intel_pmu_enable_bts(u64 config);
-void intel_pmu_disable_bts(void);
-
static size_t buf_size(struct page *page)
{
return 1 << (PAGE_SHIFT + page_private(page));
@@ -225,6 +222,7 @@ static void __bts_event_start(struct perf_event *event)
if (!buf || bts_buffer_is_full(buf, bts))
return;
+ event->hw.itrace_started = 1;
event->hw.state = 0;
if (!buf->snapshot)
@@ -497,6 +495,19 @@ static int bts_event_init(struct perf_event *event)
if (x86_add_exclusive(x86_lbr_exclusive_bts))
return -EBUSY;
+ /*
+ * BTS leaks kernel addresses even when CPL0 tracing is
+ * disabled, so disallow intel_bts driver for unprivileged
+ * users on paranoid systems since it provides trace data
+ * to the user in a zero-copy fashion.
+ *
+ * Note that the default paranoia setting permits unprivileged
+ * users to profile the kernel.
+ */
+ if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
+ !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
ret = x86_reserve_hardware();
if (ret) {
x86_del_exclusive(x86_lbr_exclusive_bts);
@@ -530,5 +541,4 @@ static __init int bts_init(void)
return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
-
-module_init(bts_init);
+arch_initcall(bts_init);
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index cb77b11bc..a316ca96f 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -13,16 +13,35 @@
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
-static unsigned int cqm_max_rmid = -1;
+static u32 cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
-struct intel_cqm_state {
- raw_spinlock_t lock;
- int rmid;
- int cnt;
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid: The cached Resource Monitoring ID
+ * @closid: The cached Class Of Service ID
+ * @rmid_usecnt: The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+ u32 rmid;
+ u32 closid;
+ int rmid_usecnt;
};
-static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Both functions which modify the state
+ * (intel_cqm_event_start and intel_cqm_event_stop) are called with
+ * interrupts disabled, which is sufficient for the protection.
+ */
+static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
/*
* Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
@@ -57,7 +76,7 @@ static cpumask_t cqm_cpumask;
* near-zero occupancy value, i.e. no cachelines are tagged with this
* RMID, once __intel_cqm_rmid_rotate() returns.
*/
-static unsigned int intel_cqm_rotation_rmid;
+static u32 intel_cqm_rotation_rmid;
#define INVALID_RMID (-1)
@@ -69,7 +88,7 @@ static unsigned int intel_cqm_rotation_rmid;
* Likewise, an rmid value of -1 is used to indicate "no rmid currently
* assigned" and is used as part of the rotation code.
*/
-static inline bool __rmid_valid(unsigned int rmid)
+static inline bool __rmid_valid(u32 rmid)
{
if (!rmid || rmid == INVALID_RMID)
return false;
@@ -77,7 +96,7 @@ static inline bool __rmid_valid(unsigned int rmid)
return true;
}
-static u64 __rmid_read(unsigned int rmid)
+static u64 __rmid_read(u32 rmid)
{
u64 val;
@@ -102,7 +121,7 @@ enum rmid_recycle_state {
};
struct cqm_rmid_entry {
- unsigned int rmid;
+ u32 rmid;
enum rmid_recycle_state state;
struct list_head list;
unsigned long queue_time;
@@ -147,7 +166,7 @@ static LIST_HEAD(cqm_rmid_limbo_lru);
*/
static struct cqm_rmid_entry **cqm_rmid_ptrs;
-static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
{
struct cqm_rmid_entry *entry;
@@ -162,7 +181,7 @@ static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
*
* We expect to be called with cache_mutex held.
*/
-static int __get_rmid(void)
+static u32 __get_rmid(void)
{
struct cqm_rmid_entry *entry;
@@ -177,7 +196,7 @@ static int __get_rmid(void)
return entry->rmid;
}
-static void __put_rmid(unsigned int rmid)
+static void __put_rmid(u32 rmid)
{
struct cqm_rmid_entry *entry;
@@ -279,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
{
if (event->attach_state & PERF_ATTACH_TASK)
- return perf_cgroup_from_task(event->hw.target);
+ return perf_cgroup_from_task(event->hw.target, event->ctx);
return event->cgrp;
}
@@ -372,7 +391,7 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
}
struct rmid_read {
- unsigned int rmid;
+ u32 rmid;
atomic64_t value;
};
@@ -381,12 +400,11 @@ static void __intel_cqm_event_count(void *info);
/*
* Exchange the RMID of a group of events.
*/
-static unsigned int
-intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
{
struct perf_event *event;
- unsigned int old_rmid = group->hw.cqm_rmid;
struct list_head *head = &group->hw.cqm_group_entry;
+ u32 old_rmid = group->hw.cqm_rmid;
lockdep_assert_held(&cache_mutex);
@@ -451,7 +469,7 @@ static void intel_cqm_stable(void *arg)
* If we have group events waiting for an RMID that don't conflict with
* events already running, assign @rmid.
*/
-static bool intel_cqm_sched_in_event(unsigned int rmid)
+static bool intel_cqm_sched_in_event(u32 rmid)
{
struct perf_event *leader, *event;
@@ -598,7 +616,7 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available)
static void __intel_cqm_pick_and_rotate(struct perf_event *next)
{
struct perf_event *rotor;
- unsigned int rmid;
+ u32 rmid;
lockdep_assert_held(&cache_mutex);
@@ -626,7 +644,7 @@ static void __intel_cqm_pick_and_rotate(struct perf_event *next)
static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
{
struct perf_event *group, *g;
- unsigned int rmid;
+ u32 rmid;
lockdep_assert_held(&cache_mutex);
@@ -828,8 +846,8 @@ static void intel_cqm_setup_event(struct perf_event *event,
struct perf_event **group)
{
struct perf_event *iter;
- unsigned int rmid;
bool conflict = false;
+ u32 rmid;
list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
rmid = iter->hw.cqm_rmid;
@@ -860,7 +878,7 @@ static void intel_cqm_setup_event(struct perf_event *event,
static void intel_cqm_event_read(struct perf_event *event)
{
unsigned long flags;
- unsigned int rmid;
+ u32 rmid;
u64 val;
/*
@@ -969,55 +987,48 @@ out:
static void intel_cqm_event_start(struct perf_event *event, int mode)
{
- struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
- unsigned int rmid = event->hw.cqm_rmid;
- unsigned long flags;
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ u32 rmid = event->hw.cqm_rmid;
if (!(event->hw.cqm_state & PERF_HES_STOPPED))
return;
event->hw.cqm_state &= ~PERF_HES_STOPPED;
- raw_spin_lock_irqsave(&state->lock, flags);
-
- if (state->cnt++)
- WARN_ON_ONCE(state->rmid != rmid);
- else
+ if (state->rmid_usecnt++) {
+ if (!WARN_ON_ONCE(state->rmid != rmid))
+ return;
+ } else {
WARN_ON_ONCE(state->rmid);
+ }
state->rmid = rmid;
- wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
-
- raw_spin_unlock_irqrestore(&state->lock, flags);
+ wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
}
static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
- struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
- unsigned long flags;
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
if (event->hw.cqm_state & PERF_HES_STOPPED)
return;
event->hw.cqm_state |= PERF_HES_STOPPED;
- raw_spin_lock_irqsave(&state->lock, flags);
intel_cqm_event_read(event);
- if (!--state->cnt) {
+ if (!--state->rmid_usecnt) {
state->rmid = 0;
- wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+ wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
} else {
WARN_ON_ONCE(!state->rmid);
}
-
- raw_spin_unlock_irqrestore(&state->lock, flags);
}
static int intel_cqm_event_add(struct perf_event *event, int mode)
{
unsigned long flags;
- unsigned int rmid;
+ u32 rmid;
raw_spin_lock_irqsave(&cache_lock, flags);
@@ -1032,11 +1043,6 @@ static int intel_cqm_event_add(struct perf_event *event, int mode)
return 0;
}
-static void intel_cqm_event_del(struct perf_event *event, int mode)
-{
- intel_cqm_event_stop(event, mode);
-}
-
static void intel_cqm_event_destroy(struct perf_event *event)
{
struct perf_event *group_other = NULL;
@@ -1065,7 +1071,7 @@ static void intel_cqm_event_destroy(struct perf_event *event)
list_replace(&event->hw.cqm_groups_entry,
&group_other->hw.cqm_groups_entry);
} else {
- unsigned int rmid = event->hw.cqm_rmid;
+ u32 rmid = event->hw.cqm_rmid;
if (__rmid_valid(rmid))
__put_rmid(rmid);
@@ -1229,7 +1235,7 @@ static struct pmu intel_cqm_pmu = {
.task_ctx_nr = perf_sw_context,
.event_init = intel_cqm_event_init,
.add = intel_cqm_event_add,
- .del = intel_cqm_event_del,
+ .del = intel_cqm_event_stop,
.start = intel_cqm_event_start,
.stop = intel_cqm_event_stop,
.read = intel_cqm_event_read,
@@ -1249,14 +1255,14 @@ static inline void cqm_pick_event_reader(int cpu)
cpumask_set_cpu(cpu, &cqm_cpumask);
}
-static void intel_cqm_cpu_prepare(unsigned int cpu)
+static void intel_cqm_cpu_starting(unsigned int cpu)
{
- struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+ struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
struct cpuinfo_x86 *c = &cpu_data(cpu);
- raw_spin_lock_init(&state->lock);
state->rmid = 0;
- state->cnt = 0;
+ state->closid = 0;
+ state->rmid_usecnt = 0;
WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
@@ -1290,13 +1296,11 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,
unsigned int cpu = (unsigned long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- intel_cqm_cpu_prepare(cpu);
- break;
case CPU_DOWN_PREPARE:
intel_cqm_cpu_exit(cpu);
break;
case CPU_STARTING:
+ intel_cqm_cpu_starting(cpu);
cqm_pick_event_reader(cpu);
break;
}
@@ -1367,7 +1371,7 @@ static int __init intel_cqm_init(void)
goto out;
for_each_online_cpu(i) {
- intel_cqm_cpu_prepare(i);
+ intel_cqm_cpu_starting(i);
cqm_pick_event_reader(i);
}
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_cstate.c
new file mode 100644
index 000000000..75a38b5a2
--- /dev/null
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_cstate.c
@@ -0,0 +1,694 @@
+/*
+ * perf_event_intel_cstate.c: support cstate residency counters
+ *
+ * Copyright (C) 2015, Intel Corp.
+ * Author: Kan Liang (kan.liang@intel.com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ */
+
+/*
+ * This file export cstate related free running (read-only) counters
+ * for perf. These counters may be use simultaneously by other tools,
+ * such as turbostat. However, it still make sense to implement them
+ * in perf. Because we can conveniently collect them together with
+ * other events, and allow to use them from tools without special MSR
+ * access code.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it is not supported by the hardware.
+ *
+ * According to counters' scope and category, two PMUs are registered
+ * with the perf_event core subsystem.
+ * - 'cstate_core': The counter is available for each physical core.
+ * The counters include CORE_C*_RESIDENCY.
+ * - 'cstate_pkg': The counter is available for each physical package.
+ * The counters include PKG_C*_RESIDENCY.
+ *
+ * All of these counters are specified in the Intel® 64 and IA-32
+ * Architectures Software Developer.s Manual Vol3b.
+ *
+ * Model specific counters:
+ * MSR_CORE_C1_RES: CORE C1 Residency Counter
+ * perf code: 0x00
+ * Available model: SLM,AMT
+ * Scope: Core (each processor core has a MSR)
+ * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
+ * perf code: 0x01
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ * Scope: Core
+ * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
+ * perf code: 0x02
+ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ * Scope: Core
+ * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
+ * perf code: 0x03
+ * Available model: SNB,IVB,HSW,BDW,SKL
+ * Scope: Core
+ * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
+ * perf code: 0x00
+ * Available model: SNB,IVB,HSW,BDW,SKL
+ * Scope: Package (physical package)
+ * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
+ * perf code: 0x01
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ * Scope: Package (physical package)
+ * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
+ * perf code: 0x02
+ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ * Scope: Package (physical package)
+ * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
+ * perf code: 0x03
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL
+ * Scope: Package (physical package)
+ * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
+ * perf code: 0x04
+ * Available model: HSW ULT only
+ * Scope: Package (physical package)
+ * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
+ * perf code: 0x05
+ * Available model: HSW ULT only
+ * Scope: Package (physical package)
+ * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
+ * perf code: 0x06
+ * Available model: HSW ULT only
+ * Scope: Package (physical package)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \
+static ssize_t __cstate_##_var##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct kobj_attribute format_attr_##_var = \
+ __ATTR(_name, 0444, __cstate_##_var##_show, NULL)
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr,
+ char *buf);
+
+struct perf_cstate_msr {
+ u64 msr;
+ struct perf_pmu_events_attr *attr;
+ bool (*test)(int idx);
+};
+
+
+/* cstate_core PMU */
+
+static struct pmu cstate_core_pmu;
+static bool has_cstate_core;
+
+enum perf_cstate_core_id {
+ /*
+ * cstate_core events
+ */
+ PERF_CSTATE_CORE_C1_RES = 0,
+ PERF_CSTATE_CORE_C3_RES,
+ PERF_CSTATE_CORE_C6_RES,
+ PERF_CSTATE_CORE_C7_RES,
+
+ PERF_CSTATE_CORE_EVENT_MAX,
+};
+
+bool test_core(int idx)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return false;
+
+ switch (boot_cpu_data.x86_model) {
+ case 30: /* 45nm Nehalem */
+ case 26: /* 45nm Nehalem-EP */
+ case 46: /* 45nm Nehalem-EX */
+
+ case 37: /* 32nm Westmere */
+ case 44: /* 32nm Westmere-EP */
+ case 47: /* 32nm Westmere-EX */
+ if (idx == PERF_CSTATE_CORE_C3_RES ||
+ idx == PERF_CSTATE_CORE_C6_RES)
+ return true;
+ break;
+ case 42: /* 32nm SandyBridge */
+ case 45: /* 32nm SandyBridge-E/EN/EP */
+
+ case 58: /* 22nm IvyBridge */
+ case 62: /* 22nm IvyBridge-EP/EX */
+
+ case 60: /* 22nm Haswell Core */
+ case 63: /* 22nm Haswell Server */
+ case 69: /* 22nm Haswell ULT */
+ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+ case 79: /* 14nm Broadwell Server */
+
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ if (idx == PERF_CSTATE_CORE_C3_RES ||
+ idx == PERF_CSTATE_CORE_C6_RES ||
+ idx == PERF_CSTATE_CORE_C7_RES)
+ return true;
+ break;
+ case 55: /* 22nm Atom "Silvermont" */
+ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+ case 76: /* 14nm Atom "Airmont" */
+ if (idx == PERF_CSTATE_CORE_C1_RES ||
+ idx == PERF_CSTATE_CORE_C6_RES)
+ return true;
+ break;
+ }
+
+ return false;
+}
+
+PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03");
+
+static struct perf_cstate_msr core_msr[] = {
+ [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, },
+ [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, },
+ [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, },
+ [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, },
+};
+
+static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = {
+ NULL,
+};
+
+static struct attribute_group core_events_attr_group = {
+ .name = "events",
+ .attrs = core_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63");
+static struct attribute *core_format_attrs[] = {
+ &format_attr_core_event.attr,
+ NULL,
+};
+
+static struct attribute_group core_format_attr_group = {
+ .name = "format",
+ .attrs = core_format_attrs,
+};
+
+static cpumask_t cstate_core_cpu_mask;
+static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
+
+static struct attribute *cstate_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+ .attrs = cstate_cpumask_attrs,
+};
+
+static const struct attribute_group *core_attr_groups[] = {
+ &core_events_attr_group,
+ &core_format_attr_group,
+ &cpumask_attr_group,
+ NULL,
+};
+
+/* cstate_core PMU end */
+
+
+/* cstate_pkg PMU */
+
+static struct pmu cstate_pkg_pmu;
+static bool has_cstate_pkg;
+
+enum perf_cstate_pkg_id {
+ /*
+ * cstate_pkg events
+ */
+ PERF_CSTATE_PKG_C2_RES = 0,
+ PERF_CSTATE_PKG_C3_RES,
+ PERF_CSTATE_PKG_C6_RES,
+ PERF_CSTATE_PKG_C7_RES,
+ PERF_CSTATE_PKG_C8_RES,
+ PERF_CSTATE_PKG_C9_RES,
+ PERF_CSTATE_PKG_C10_RES,
+
+ PERF_CSTATE_PKG_EVENT_MAX,
+};
+
+bool test_pkg(int idx)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return false;
+
+ switch (boot_cpu_data.x86_model) {
+ case 30: /* 45nm Nehalem */
+ case 26: /* 45nm Nehalem-EP */
+ case 46: /* 45nm Nehalem-EX */
+
+ case 37: /* 32nm Westmere */
+ case 44: /* 32nm Westmere-EP */
+ case 47: /* 32nm Westmere-EX */
+ if (idx == PERF_CSTATE_CORE_C3_RES ||
+ idx == PERF_CSTATE_CORE_C6_RES ||
+ idx == PERF_CSTATE_CORE_C7_RES)
+ return true;
+ break;
+ case 42: /* 32nm SandyBridge */
+ case 45: /* 32nm SandyBridge-E/EN/EP */
+
+ case 58: /* 22nm IvyBridge */
+ case 62: /* 22nm IvyBridge-EP/EX */
+
+ case 60: /* 22nm Haswell Core */
+ case 63: /* 22nm Haswell Server */
+ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+ case 79: /* 14nm Broadwell Server */
+
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ if (idx == PERF_CSTATE_PKG_C2_RES ||
+ idx == PERF_CSTATE_PKG_C3_RES ||
+ idx == PERF_CSTATE_PKG_C6_RES ||
+ idx == PERF_CSTATE_PKG_C7_RES)
+ return true;
+ break;
+ case 55: /* 22nm Atom "Silvermont" */
+ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+ case 76: /* 14nm Atom "Airmont" */
+ if (idx == PERF_CSTATE_CORE_C6_RES)
+ return true;
+ break;
+ case 69: /* 22nm Haswell ULT */
+ if (idx == PERF_CSTATE_PKG_C2_RES ||
+ idx == PERF_CSTATE_PKG_C3_RES ||
+ idx == PERF_CSTATE_PKG_C6_RES ||
+ idx == PERF_CSTATE_PKG_C7_RES ||
+ idx == PERF_CSTATE_PKG_C8_RES ||
+ idx == PERF_CSTATE_PKG_C9_RES ||
+ idx == PERF_CSTATE_PKG_C10_RES)
+ return true;
+ break;
+ }
+
+ return false;
+}
+
+PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03");
+PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04");
+PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05");
+PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06");
+
+static struct perf_cstate_msr pkg_msr[] = {
+ [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, },
+ [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, },
+ [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, },
+ [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, },
+ [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, },
+ [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, },
+ [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, },
+};
+
+static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = {
+ NULL,
+};
+
+static struct attribute_group pkg_events_attr_group = {
+ .name = "events",
+ .attrs = pkg_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63");
+static struct attribute *pkg_format_attrs[] = {
+ &format_attr_pkg_event.attr,
+ NULL,
+};
+static struct attribute_group pkg_format_attr_group = {
+ .name = "format",
+ .attrs = pkg_format_attrs,
+};
+
+static cpumask_t cstate_pkg_cpu_mask;
+
+static const struct attribute_group *pkg_attr_groups[] = {
+ &pkg_events_attr_group,
+ &pkg_format_attr_group,
+ &cpumask_attr_group,
+ NULL,
+};
+
+/* cstate_pkg PMU end*/
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ if (pmu == &cstate_core_pmu)
+ return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
+ else if (pmu == &cstate_pkg_pmu)
+ return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
+ else
+ return 0;
+}
+
+static int cstate_pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+ int ret = 0;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ if (event->pmu == &cstate_core_pmu) {
+ if (cfg >= PERF_CSTATE_CORE_EVENT_MAX)
+ return -EINVAL;
+ if (!core_msr[cfg].attr)
+ return -EINVAL;
+ event->hw.event_base = core_msr[cfg].msr;
+ } else if (event->pmu == &cstate_pkg_pmu) {
+ if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
+ return -EINVAL;
+ if (!pkg_msr[cfg].attr)
+ return -EINVAL;
+ event->hw.event_base = pkg_msr[cfg].msr;
+ } else
+ return -ENOENT;
+
+ /* must be done before validate_group */
+ event->hw.config = cfg;
+ event->hw.idx = -1;
+
+ return ret;
+}
+
+static inline u64 cstate_pmu_read_counter(struct perf_event *event)
+{
+ u64 val;
+
+ rdmsrl(event->hw.event_base, val);
+ return val;
+}
+
+static void cstate_pmu_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_raw_count, new_raw_count;
+
+again:
+ prev_raw_count = local64_read(&hwc->prev_count);
+ new_raw_count = cstate_pmu_read_counter(event);
+
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ goto again;
+
+ local64_add(new_raw_count - prev_raw_count, &event->count);
+}
+
+static void cstate_pmu_event_start(struct perf_event *event, int mode)
+{
+ local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event));
+}
+
+static void cstate_pmu_event_stop(struct perf_event *event, int mode)
+{
+ cstate_pmu_event_update(event);
+}
+
+static void cstate_pmu_event_del(struct perf_event *event, int mode)
+{
+ cstate_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int cstate_pmu_event_add(struct perf_event *event, int mode)
+{
+ if (mode & PERF_EF_START)
+ cstate_pmu_event_start(event, mode);
+
+ return 0;
+}
+
+static void cstate_cpu_exit(int cpu)
+{
+ int i, id, target;
+
+ /* cpu exit for cstate core */
+ if (has_cstate_core) {
+ id = topology_core_id(cpu);
+ target = -1;
+
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+ if (id == topology_core_id(i)) {
+ target = i;
+ break;
+ }
+ }
+ if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0)
+ cpumask_set_cpu(target, &cstate_core_cpu_mask);
+ WARN_ON(cpumask_empty(&cstate_core_cpu_mask));
+ if (target >= 0)
+ perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
+ }
+
+ /* cpu exit for cstate pkg */
+ if (has_cstate_pkg) {
+ id = topology_physical_package_id(cpu);
+ target = -1;
+
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+ if (id == topology_physical_package_id(i)) {
+ target = i;
+ break;
+ }
+ }
+ if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0)
+ cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
+ WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask));
+ if (target >= 0)
+ perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
+ }
+}
+
+static void cstate_cpu_init(int cpu)
+{
+ int i, id;
+
+ /* cpu init for cstate core */
+ if (has_cstate_core) {
+ id = topology_core_id(cpu);
+ for_each_cpu(i, &cstate_core_cpu_mask) {
+ if (id == topology_core_id(i))
+ break;
+ }
+ if (i >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
+ }
+
+ /* cpu init for cstate pkg */
+ if (has_cstate_pkg) {
+ id = topology_physical_package_id(cpu);
+ for_each_cpu(i, &cstate_pkg_cpu_mask) {
+ if (id == topology_physical_package_id(i))
+ break;
+ }
+ if (i >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
+ }
+}
+
+static int cstate_cpu_notifier(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ break;
+ case CPU_STARTING:
+ cstate_cpu_init(cpu);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DYING:
+ break;
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ break;
+ case CPU_DOWN_PREPARE:
+ cstate_cpu_exit(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+/*
+ * Probe the cstate events and insert the available one into sysfs attrs
+ * Return false if there is no available events.
+ */
+static bool cstate_probe_msr(struct perf_cstate_msr *msr,
+ struct attribute **events_attrs,
+ int max_event_nr)
+{
+ int i, j = 0;
+ u64 val;
+
+ /* Probe the cstate events. */
+ for (i = 0; i < max_event_nr; i++) {
+ if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+ msr[i].attr = NULL;
+ }
+
+ /* List remaining events in the sysfs attrs. */
+ for (i = 0; i < max_event_nr; i++) {
+ if (msr[i].attr)
+ events_attrs[j++] = &msr[i].attr->attr.attr;
+ }
+ events_attrs[j] = NULL;
+
+ return (j > 0) ? true : false;
+}
+
+static int __init cstate_init(void)
+{
+ /* SLM has different MSR for PKG C6 */
+ switch (boot_cpu_data.x86_model) {
+ case 55:
+ case 76:
+ case 77:
+ pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
+ }
+
+ if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX))
+ has_cstate_core = true;
+
+ if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX))
+ has_cstate_pkg = true;
+
+ return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
+}
+
+static void __init cstate_cpumask_init(void)
+{
+ int cpu;
+
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(cpu)
+ cstate_cpu_init(cpu);
+
+ __perf_cpu_notifier(cstate_cpu_notifier);
+
+ cpu_notifier_register_done();
+}
+
+static struct pmu cstate_core_pmu = {
+ .attr_groups = core_attr_groups,
+ .name = "cstate_core",
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = cstate_pmu_event_init,
+ .add = cstate_pmu_event_add, /* must have */
+ .del = cstate_pmu_event_del, /* must have */
+ .start = cstate_pmu_event_start,
+ .stop = cstate_pmu_event_stop,
+ .read = cstate_pmu_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static struct pmu cstate_pkg_pmu = {
+ .attr_groups = pkg_attr_groups,
+ .name = "cstate_pkg",
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = cstate_pmu_event_init,
+ .add = cstate_pmu_event_add, /* must have */
+ .del = cstate_pmu_event_del, /* must have */
+ .start = cstate_pmu_event_start,
+ .stop = cstate_pmu_event_stop,
+ .read = cstate_pmu_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static void __init cstate_pmus_register(void)
+{
+ int err;
+
+ if (has_cstate_core) {
+ err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
+ if (WARN_ON(err))
+ pr_info("Failed to register PMU %s error %d\n",
+ cstate_core_pmu.name, err);
+ }
+
+ if (has_cstate_pkg) {
+ err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1);
+ if (WARN_ON(err))
+ pr_info("Failed to register PMU %s error %d\n",
+ cstate_pkg_pmu.name, err);
+ }
+}
+
+static int __init cstate_pmu_init(void)
+{
+ int err;
+
+ if (cpu_has_hypervisor)
+ return -ENODEV;
+
+ err = cstate_init();
+ if (err)
+ return err;
+
+ cstate_cpumask_init();
+
+ cstate_pmus_register();
+
+ return 0;
+}
+
+device_initcall(cstate_pmu_init);
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_ds.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 7f73b3553..5db1c7755 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -11,7 +11,7 @@
#define BTS_RECORD_SIZE 24
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE PAGE_SIZE
+#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_FIXUP_SIZE PAGE_SIZE
/*
@@ -224,6 +224,19 @@ union hsw_tsx_tuning {
#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL
+/* Same as HSW, plus TSC */
+
+struct pebs_record_skl {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+ u64 real_ip, tsx_tuning;
+ u64 tsc;
+};
+
void init_debug_store_on_cpu(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -250,7 +263,7 @@ static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
int node = cpu_to_node(cpu);
- int max, thresh = 1; /* always use a single PEBS record */
+ int max;
void *buffer, *ibuffer;
if (!x86_pmu.pebs)
@@ -280,9 +293,6 @@ static int alloc_pebs_buffer(int cpu)
ds->pebs_absolute_maximum = ds->pebs_buffer_base +
max * x86_pmu.pebs_record_size;
- ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
- thresh * x86_pmu.pebs_record_size;
-
return 0;
}
@@ -500,10 +510,11 @@ int intel_pmu_drain_bts_buffer(void)
u64 flags;
};
struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
- struct bts_record *at, *top;
+ struct bts_record *at, *base, *top;
struct perf_output_handle handle;
struct perf_event_header header;
struct perf_sample_data data;
+ unsigned long skip = 0;
struct pt_regs regs;
if (!event)
@@ -512,10 +523,10 @@ int intel_pmu_drain_bts_buffer(void)
if (!x86_pmu.bts_active)
return 0;
- at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
- top = (struct bts_record *)(unsigned long)ds->bts_index;
+ base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
- if (top <= at)
+ if (top <= base)
return 0;
memset(&regs, 0, sizeof(regs));
@@ -525,16 +536,43 @@ int intel_pmu_drain_bts_buffer(void)
perf_sample_data_init(&data, 0, event->hw.last_period);
/*
+ * BTS leaks kernel addresses in branches across the cpl boundary,
+ * such as traps or system calls, so unless the user is asking for
+ * kernel tracing (and right now it's not possible), we'd need to
+ * filter them out. But first we need to count how many of those we
+ * have in the current batch. This is an extra O(n) pass, however,
+ * it's much faster than the other one especially considering that
+ * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the
+ * alloc_bts_buffer()).
+ */
+ for (at = base; at < top; at++) {
+ /*
+ * Note that right now *this* BTS code only works if
+ * attr::exclude_kernel is set, but let's keep this extra
+ * check here in case that changes.
+ */
+ if (event->attr.exclude_kernel &&
+ (kernel_ip(at->from) || kernel_ip(at->to)))
+ skip++;
+ }
+
+ /*
* Prepare a generic sample, i.e. fill in the invariant fields.
* We will overwrite the from and to address before we output
* the sample.
*/
perf_prepare_sample(&header, &data, event, &regs);
- if (perf_output_begin(&handle, event, header.size * (top - at)))
+ if (perf_output_begin(&handle, event, header.size *
+ (top - base - skip)))
return 1;
- for (; at < top; at++) {
+ for (at = base; at < top; at++) {
+ /* Filter out any records that contain kernel addresses. */
+ if (event->attr.exclude_kernel &&
+ (kernel_ip(at->from) || kernel_ip(at->to)))
+ continue;
+
data.ip = at->from;
data.addr = at->to;
@@ -549,6 +587,19 @@ int intel_pmu_drain_bts_buffer(void)
return 1;
}
+static inline void intel_pmu_drain_pebs_buffer(void)
+{
+ struct pt_regs regs;
+
+ x86_pmu.drain_pebs(&regs);
+}
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ if (!sched_in)
+ intel_pmu_drain_pebs_buffer();
+}
+
/*
* PEBS
*/
@@ -665,6 +716,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_skl_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *c;
@@ -684,25 +757,71 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
return &emptyconstraint;
}
+static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+{
+ return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+}
+
void intel_pmu_pebs_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
+ struct debug_store *ds = cpuc->ds;
+ bool first_pebs;
+ u64 threshold;
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+ first_pebs = !pebs_is_enabled(cpuc);
cpuc->pebs_enabled |= 1ULL << hwc->idx;
if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled |= 1ULL << 63;
+
+ /*
+ * When the event is constrained enough we can use a larger
+ * threshold and run the event with less frequent PMI.
+ */
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
+ threshold = ds->pebs_absolute_maximum -
+ x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+
+ if (first_pebs)
+ perf_sched_cb_inc(event->ctx->pmu);
+ } else {
+ threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+ /*
+ * If not all events can use larger buffer,
+ * roll back to threshold = 1
+ */
+ if (!first_pebs &&
+ (ds->pebs_interrupt_threshold > threshold))
+ perf_sched_cb_dec(event->ctx->pmu);
+ }
+
+ /* Use auto-reload if possible to save a MSR write in the PMI */
+ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+ ds->pebs_event_reset[hwc->idx] =
+ (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
+ }
+
+ if (first_pebs || ds->pebs_interrupt_threshold > threshold)
+ ds->pebs_interrupt_threshold = threshold;
}
void intel_pmu_pebs_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
+ struct debug_store *ds = cpuc->ds;
+ bool large_pebs = ds->pebs_interrupt_threshold >
+ ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+ if (large_pebs)
+ intel_pmu_drain_pebs_buffer();
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
@@ -711,6 +830,9 @@ void intel_pmu_pebs_disable(struct perf_event *event)
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled &= ~(1ULL << 63);
+ if (large_pebs && !pebs_is_enabled(cpuc))
+ perf_sched_cb_dec(event->ctx->pmu);
+
if (cpuc->enabled)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
@@ -827,7 +949,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
return 0;
}
-static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
{
if (pebs->tsx_tuning) {
union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
@@ -836,7 +958,7 @@ static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
return 0;
}
-static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
{
u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
@@ -846,8 +968,10 @@ static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
return txn;
}
-static void __intel_pmu_pebs_event(struct perf_event *event,
- struct pt_regs *iregs, void *__pebs)
+static void setup_pebs_sample_data(struct perf_event *event,
+ struct pt_regs *iregs, void *__pebs,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
#define PERF_X86_EVENT_PEBS_HSW_PREC \
(PERF_X86_EVENT_PEBS_ST_HSW | \
@@ -858,14 +982,12 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
* unconditionally access the 'extra' entries.
*/
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- struct pebs_record_hsw *pebs = __pebs;
- struct perf_sample_data data;
- struct pt_regs regs;
+ struct pebs_record_skl *pebs = __pebs;
u64 sample_type;
int fll, fst, dsrc;
int fl = event->hw.flags;
- if (!intel_pmu_save_and_restart(event))
+ if (pebs == NULL)
return;
sample_type = event->attr.sample_type;
@@ -874,15 +996,15 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
- perf_sample_data_init(&data, 0, event->hw.last_period);
+ perf_sample_data_init(data, 0, event->hw.last_period);
- data.period = event->hw.last_period;
+ data->period = event->hw.last_period;
/*
* Use latency for weight (only avail with PEBS-LL)
*/
if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
- data.weight = pebs->lat;
+ data->weight = pebs->lat;
/*
* data.data_src encodes the data source
@@ -895,7 +1017,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
val = precise_datala_hsw(event, pebs->dse);
else if (fst)
val = precise_store_data(pebs->dse);
- data.data_src.val = val;
+ data->data_src.val = val;
}
/*
@@ -908,61 +1030,133 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
* PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
* A possible PERF_SAMPLE_REGS will have to transfer all regs.
*/
- regs = *iregs;
- regs.flags = pebs->flags;
- set_linear_ip(&regs, pebs->ip);
- regs.bp = pebs->bp;
- regs.sp = pebs->sp;
+ *regs = *iregs;
+ regs->flags = pebs->flags;
+ set_linear_ip(regs, pebs->ip);
+ regs->bp = pebs->bp;
+ regs->sp = pebs->sp;
if (sample_type & PERF_SAMPLE_REGS_INTR) {
- regs.ax = pebs->ax;
- regs.bx = pebs->bx;
- regs.cx = pebs->cx;
- regs.dx = pebs->dx;
- regs.si = pebs->si;
- regs.di = pebs->di;
- regs.bp = pebs->bp;
- regs.sp = pebs->sp;
-
- regs.flags = pebs->flags;
+ regs->ax = pebs->ax;
+ regs->bx = pebs->bx;
+ regs->cx = pebs->cx;
+ regs->dx = pebs->dx;
+ regs->si = pebs->si;
+ regs->di = pebs->di;
+ regs->bp = pebs->bp;
+ regs->sp = pebs->sp;
+
+ regs->flags = pebs->flags;
#ifndef CONFIG_X86_32
- regs.r8 = pebs->r8;
- regs.r9 = pebs->r9;
- regs.r10 = pebs->r10;
- regs.r11 = pebs->r11;
- regs.r12 = pebs->r12;
- regs.r13 = pebs->r13;
- regs.r14 = pebs->r14;
- regs.r15 = pebs->r15;
+ regs->r8 = pebs->r8;
+ regs->r9 = pebs->r9;
+ regs->r10 = pebs->r10;
+ regs->r11 = pebs->r11;
+ regs->r12 = pebs->r12;
+ regs->r13 = pebs->r13;
+ regs->r14 = pebs->r14;
+ regs->r15 = pebs->r15;
#endif
}
if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
- regs.ip = pebs->real_ip;
- regs.flags |= PERF_EFLAGS_EXACT;
- } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
- regs.flags |= PERF_EFLAGS_EXACT;
+ regs->ip = pebs->real_ip;
+ regs->flags |= PERF_EFLAGS_EXACT;
+ } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs))
+ regs->flags |= PERF_EFLAGS_EXACT;
else
- regs.flags &= ~PERF_EFLAGS_EXACT;
+ regs->flags &= ~PERF_EFLAGS_EXACT;
if ((sample_type & PERF_SAMPLE_ADDR) &&
x86_pmu.intel_cap.pebs_format >= 1)
- data.addr = pebs->dla;
+ data->addr = pebs->dla;
if (x86_pmu.intel_cap.pebs_format >= 2) {
/* Only set the TSX weight when no memory weight. */
if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
- data.weight = intel_hsw_weight(pebs);
+ data->weight = intel_hsw_weight(pebs);
if (sample_type & PERF_SAMPLE_TRANSACTION)
- data.txn = intel_hsw_transaction(pebs);
+ data->txn = intel_hsw_transaction(pebs);
}
+ /*
+ * v3 supplies an accurate time stamp, so we use that
+ * for the time stamp.
+ *
+ * We can only do this for the default trace clock.
+ */
+ if (x86_pmu.intel_cap.pebs_format >= 3 &&
+ event->attr.use_clockid == 0)
+ data->time = native_sched_clock_from_tsc(pebs->tsc);
+
if (has_branch_stack(event))
- data.br_stack = &cpuc->lbr_stack;
+ data->br_stack = &cpuc->lbr_stack;
+}
+
+static inline void *
+get_next_pebs_record_by_bit(void *base, void *top, int bit)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ void *at;
+ u64 pebs_status;
+
+ if (base == NULL)
+ return NULL;
+
+ for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+ struct pebs_record_nhm *p = at;
+
+ if (test_bit(bit, (unsigned long *)&p->status)) {
+ /* PEBS v3 has accurate status bits */
+ if (x86_pmu.intel_cap.pebs_format >= 3)
+ return at;
+
+ if (p->status == (1 << bit))
+ return at;
+
+ /* clear non-PEBS bit and re-check */
+ pebs_status = p->status & cpuc->pebs_enabled;
+ pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+ if (pebs_status == (1 << bit))
+ return at;
+ }
+ }
+ return NULL;
+}
+
+static void __intel_pmu_pebs_event(struct perf_event *event,
+ struct pt_regs *iregs,
+ void *base, void *top,
+ int bit, int count)
+{
+ struct perf_sample_data data;
+ struct pt_regs regs;
+ void *at = get_next_pebs_record_by_bit(base, top, bit);
+
+ if (!intel_pmu_save_and_restart(event) &&
+ !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
+ return;
+
+ while (count > 1) {
+ setup_pebs_sample_data(event, iregs, at, &data, &regs);
+ perf_event_output(event, &data, &regs);
+ at += x86_pmu.pebs_record_size;
+ at = get_next_pebs_record_by_bit(at, top, bit);
+ count--;
+ }
+
+ setup_pebs_sample_data(event, iregs, at, &data, &regs);
- if (perf_event_overflow(event, &data, &regs))
+ /*
+ * All but the last records are processed.
+ * The last one is left to be able to call the overflow handler.
+ */
+ if (perf_event_overflow(event, &data, &regs)) {
x86_pmu_stop(event, 0);
+ return;
+ }
+
}
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -992,72 +1186,99 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
if (!event->attr.precise_ip)
return;
- n = top - at;
+ n = (top - at) / x86_pmu.pebs_record_size;
if (n <= 0)
return;
- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
- at += n - 1;
-
- __intel_pmu_pebs_event(event, iregs, at);
+ __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
}
static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
- struct perf_event *event = NULL;
- void *at, *top;
- u64 status = 0;
- int bit;
+ struct perf_event *event;
+ void *base, *at, *top;
+ short counts[MAX_PEBS_EVENTS] = {};
+ short error[MAX_PEBS_EVENTS] = {};
+ int bit, i;
if (!x86_pmu.pebs_active)
return;
- at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+ base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
ds->pebs_index = ds->pebs_buffer_base;
- if (unlikely(at > top))
+ if (unlikely(base >= top))
return;
- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size,
- "Unexpected number of pebs records %ld\n",
- (long)(top - at) / x86_pmu.pebs_record_size);
-
- for (; at < top; at += x86_pmu.pebs_record_size) {
+ for (at = base; at < top; at += x86_pmu.pebs_record_size) {
struct pebs_record_nhm *p = at;
+ u64 pebs_status;
- for_each_set_bit(bit, (unsigned long *)&p->status,
- x86_pmu.max_pebs_events) {
- event = cpuc->events[bit];
- if (!test_bit(bit, cpuc->active_mask))
- continue;
+ /* PEBS v3 has accurate status bits */
+ if (x86_pmu.intel_cap.pebs_format >= 3) {
+ for_each_set_bit(bit, (unsigned long *)&p->status,
+ MAX_PEBS_EVENTS)
+ counts[bit]++;
- WARN_ON_ONCE(!event);
+ continue;
+ }
- if (!event->attr.precise_ip)
- continue;
+ pebs_status = p->status & cpuc->pebs_enabled;
+ pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
- if (__test_and_set_bit(bit, (unsigned long *)&status))
- continue;
+ bit = find_first_bit((unsigned long *)&pebs_status,
+ x86_pmu.max_pebs_events);
+ if (WARN(bit >= x86_pmu.max_pebs_events,
+ "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx",
+ (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled,
+ *(unsigned long long *)cpuc->active_mask))
+ continue;
- break;
+ /*
+ * The PEBS hardware does not deal well with the situation
+ * when events happen near to each other and multiple bits
+ * are set. But it should happen rarely.
+ *
+ * If these events include one PEBS and multiple non-PEBS
+ * events, it doesn't impact PEBS record. The record will
+ * be handled normally. (slow path)
+ *
+ * If these events include two or more PEBS events, the
+ * records for the events can be collapsed into a single
+ * one, and it's not possible to reconstruct all events
+ * that caused the PEBS record. It's called collision.
+ * If collision happened, the record will be dropped.
+ */
+ if (p->status != (1ULL << bit)) {
+ for_each_set_bit(i, (unsigned long *)&pebs_status,
+ x86_pmu.max_pebs_events)
+ error[i]++;
+ continue;
}
- if (!event || bit >= x86_pmu.max_pebs_events)
+ counts[bit]++;
+ }
+
+ for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+ if ((counts[bit] == 0) && (error[bit] == 0))
continue;
- __intel_pmu_pebs_event(event, iregs, at);
+ event = cpuc->events[bit];
+ WARN_ON_ONCE(!event);
+ WARN_ON_ONCE(!event->attr.precise_ip);
+
+ /* log dropped samples number */
+ if (error[bit])
+ perf_log_lost_samples(event, error[bit]);
+
+ if (counts[bit]) {
+ __intel_pmu_pebs_event(event, iregs, base,
+ top, bit, counts[bit]);
+ }
}
}
@@ -1098,6 +1319,14 @@ void __init intel_ds_init(void)
x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
break;
+ case 3:
+ pr_cont("PEBS fmt3%c, ", pebs_type);
+ x86_pmu.pebs_record_size =
+ sizeof(struct pebs_record_skl);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
+ break;
+
default:
printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
x86_pmu.pebs = 0;
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 94e5b506c..659f01e16 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -13,7 +13,8 @@ enum {
LBR_FORMAT_EIP = 0x02,
LBR_FORMAT_EIP_FLAGS = 0x03,
LBR_FORMAT_EIP_FLAGS2 = 0x04,
- LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2,
+ LBR_FORMAT_INFO = 0x05,
+ LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO,
};
static enum {
@@ -96,6 +97,7 @@ enum {
X86_BR_NO_TX = 1 << 14,/* not in transaction */
X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
X86_BR_CALL_STACK = 1 << 16,/* call stack */
+ X86_BR_IND_JMP = 1 << 17,/* indirect jump */
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -113,6 +115,7 @@ enum {
X86_BR_IRQ |\
X86_BR_ABORT |\
X86_BR_IND_CALL |\
+ X86_BR_IND_JMP |\
X86_BR_ZERO_CALL)
#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
@@ -138,13 +141,20 @@ static void __intel_pmu_lbr_enable(bool pmi)
u64 debugctl, lbr_select = 0, orig_debugctl;
/*
+ * No need to unfreeze manually, as v4 can do that as part
+ * of the GLOBAL_STATUS ack.
+ */
+ if (pmi && x86_pmu.version >= 4)
+ return;
+
+ /*
* No need to reprogram LBR_SELECT in a PMI, as it
* did not change.
*/
- if (cpuc->lbr_sel && !pmi) {
+ if (cpuc->lbr_sel)
lbr_select = cpuc->lbr_sel->config;
+ if (!pmi)
wrmsrl(MSR_LBR_SELECT, lbr_select);
- }
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
orig_debugctl = debugctl;
@@ -184,6 +194,8 @@ static void intel_pmu_lbr_reset_64(void)
for (i = 0; i < x86_pmu.lbr_nr; i++) {
wrmsrl(x86_pmu.lbr_from + i, 0);
wrmsrl(x86_pmu.lbr_to + i, 0);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + i, 0);
}
}
@@ -227,12 +239,15 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
}
mask = x86_pmu.lbr_nr - 1;
- tos = intel_pmu_lbr_tos();
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ tos = task_ctx->tos;
+ for (i = 0; i < tos; i++) {
lbr_idx = (tos - i) & mask;
wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
+ wrmsrl(x86_pmu.lbr_tos, tos);
task_ctx->lbr_stack_state = LBR_NONE;
}
@@ -249,11 +264,14 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
mask = x86_pmu.lbr_nr - 1;
tos = intel_pmu_lbr_tos();
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ for (i = 0; i < tos; i++) {
lbr_idx = (tos - i) & mask;
rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
+ task_ctx->tos = tos;
task_ctx->lbr_stack_state = LBR_VALID;
}
@@ -262,9 +280,6 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
- if (!x86_pmu.lbr_nr)
- return;
-
/*
* If LBR callstack feature is enabled and the stack was saved when
* the task was scheduled out, restore the stack. Otherwise flush
@@ -412,16 +427,31 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
u64 tos = intel_pmu_lbr_tos();
int i;
int out = 0;
+ int num = x86_pmu.lbr_nr;
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+ num = tos;
+
+ for (i = 0; i < num; i++) {
unsigned long lbr_idx = (tos - i) & mask;
u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
int skip = 0;
+ u16 cycles = 0;
int lbr_flags = lbr_desc[lbr_format];
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
+ if (lbr_format == LBR_FORMAT_INFO) {
+ u64 info;
+
+ rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
+ mis = !!(info & LBR_INFO_MISPRED);
+ pred = !mis;
+ in_tx = !!(info & LBR_INFO_IN_TX);
+ abort = !!(info & LBR_INFO_ABORT);
+ cycles = (info & LBR_INFO_CYCLES);
+ }
if (lbr_flags & LBR_EIP_FLAGS) {
mis = !!(from & LBR_FROM_FLAG_MISPRED);
pred = !mis;
@@ -451,6 +481,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[out].predicted = pred;
cpuc->lbr_entries[out].in_tx = in_tx;
cpuc->lbr_entries[out].abort = abort;
+ cpuc->lbr_entries[out].cycles = cycles;
cpuc->lbr_entries[out].reserved = 0;
out++;
}
@@ -523,6 +554,11 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
X86_BR_CALL_STACK;
}
+ if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+ mask |= X86_BR_IND_JMP;
+
+ if (br_type & PERF_SAMPLE_BRANCH_CALL)
+ mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
@@ -736,7 +772,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
break;
case 4:
case 5:
- ret = X86_BR_JMP;
+ ret = X86_BR_IND_JMP;
break;
}
break;
@@ -844,6 +880,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
*/
[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
};
static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
@@ -856,6 +893,8 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
| LBR_FAR,
[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL,
};
static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
@@ -870,6 +909,8 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
| LBR_RETURN | LBR_CALL_STACK,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL,
};
/* core */
@@ -942,6 +983,26 @@ void intel_pmu_lbr_init_hsw(void)
pr_cont("16-deep LBR, ");
}
+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+ x86_pmu.lbr_nr = 32;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - support syscall, sysret capture.
+ * That requires LBR_FAR but that means far
+ * jmp need to be filtered out
+ */
+ pr_cont("32-deep LBR, ");
+}
+
/* atom */
void __init intel_pmu_lbr_init_atom(void)
{
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_pt.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_pt.c
index 123ff1bb2..868e11943 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -65,15 +65,21 @@ static struct pt_cap_desc {
} pt_caps[] = {
PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff),
PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)),
+ PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)),
+ PT_CAP(mtc, 0, CR_EBX, BIT(3)),
PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
+ PT_CAP(single_range_output, 0, CR_ECX, BIT(2)),
PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)),
+ PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000),
+ PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff),
+ PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000),
};
static u32 pt_cap_get(enum pt_capabilities cap)
{
struct pt_cap_desc *cd = &pt_caps[cap];
- u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
+ u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
unsigned int shift = __ffs(cd->mask);
return (c & cd->mask) >> shift;
@@ -94,12 +100,22 @@ static struct attribute_group pt_cap_group = {
.name = "caps",
};
+PMU_FORMAT_ATTR(cyc, "config:1" );
+PMU_FORMAT_ATTR(mtc, "config:9" );
PMU_FORMAT_ATTR(tsc, "config:10" );
PMU_FORMAT_ATTR(noretcomp, "config:11" );
+PMU_FORMAT_ATTR(mtc_period, "config:14-17" );
+PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" );
+PMU_FORMAT_ATTR(psb_period, "config:24-27" );
static struct attribute *pt_formats_attr[] = {
+ &format_attr_cyc.attr,
+ &format_attr_mtc.attr,
&format_attr_tsc.attr,
&format_attr_noretcomp.attr,
+ &format_attr_mtc_period.attr,
+ &format_attr_cyc_thresh.attr,
+ &format_attr_psb_period.attr,
NULL,
};
@@ -123,16 +139,13 @@ static int __init pt_pmu_hw_init(void)
long i;
attrs = NULL;
- ret = -ENODEV;
- if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
- goto fail;
for (i = 0; i < PT_CPUID_LEAVES; i++) {
cpuid_count(20, i,
- &pt_pmu.caps[CR_EAX + i*4],
- &pt_pmu.caps[CR_EBX + i*4],
- &pt_pmu.caps[CR_ECX + i*4],
- &pt_pmu.caps[CR_EDX + i*4]);
+ &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
}
ret = -ENOMEM;
@@ -170,15 +183,65 @@ fail:
return ret;
}
-#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
+#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \
+ RTIT_CTL_CYC_THRESH | \
+ RTIT_CTL_PSB_FREQ)
+
+#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \
+ RTIT_CTL_MTC_RANGE)
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \
+ RTIT_CTL_DISRETC | \
+ RTIT_CTL_CYC_PSB | \
+ RTIT_CTL_MTC)
static bool pt_event_valid(struct perf_event *event)
{
u64 config = event->attr.config;
+ u64 allowed, requested;
if ((config & PT_CONFIG_MASK) != config)
return false;
+ if (config & RTIT_CTL_CYC_PSB) {
+ if (!pt_cap_get(PT_CAP_psb_cyc))
+ return false;
+
+ allowed = pt_cap_get(PT_CAP_psb_periods);
+ requested = (config & RTIT_CTL_PSB_FREQ) >>
+ RTIT_CTL_PSB_FREQ_OFFSET;
+ if (requested && (!(allowed & BIT(requested))))
+ return false;
+
+ allowed = pt_cap_get(PT_CAP_cycle_thresholds);
+ requested = (config & RTIT_CTL_CYC_THRESH) >>
+ RTIT_CTL_CYC_THRESH_OFFSET;
+ if (requested && (!(allowed & BIT(requested))))
+ return false;
+ }
+
+ if (config & RTIT_CTL_MTC) {
+ /*
+ * In the unlikely case that CPUID lists valid mtc periods,
+ * but not the mtc capability, drop out here.
+ *
+ * Spec says that setting mtc period bits while mtc bit in
+ * CPUID is 0 will #GP, so better safe than sorry.
+ */
+ if (!pt_cap_get(PT_CAP_mtc))
+ return false;
+
+ allowed = pt_cap_get(PT_CAP_mtc_periods);
+ if (!allowed)
+ return false;
+
+ requested = (config & RTIT_CTL_MTC_RANGE) >>
+ RTIT_CTL_MTC_RANGE_OFFSET;
+
+ if (!(allowed & BIT(requested)))
+ return false;
+ }
+
return true;
}
@@ -187,19 +250,15 @@ static bool pt_event_valid(struct perf_event *event)
* These all are cpu affine and operate on a local PT
*/
-static bool pt_is_running(void)
-{
- u64 ctl;
-
- rdmsrl(MSR_IA32_RTIT_CTL, ctl);
-
- return !!(ctl & RTIT_CTL_TRACEEN);
-}
-
static void pt_config(struct perf_event *event)
{
u64 reg;
+ if (!event->hw.itrace_started) {
+ event->hw.itrace_started = 1;
+ wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+ }
+
reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
if (!event->attr.exclude_kernel)
@@ -609,7 +668,12 @@ static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
* @handle: Current output handle.
*
* Place INT and STOP marks to prevent overwriting old data that the consumer
- * hasn't yet collected.
+ * hasn't yet collected and waking up the consumer after a certain fraction of
+ * the buffer has filled up. Only needed and sensible for non-snapshot counters.
+ *
+ * This obviously relies on buf::head to figure out buffer markers, so it has
+ * to be called after pt_buffer_reset_offsets() and before the hardware tracing
+ * is enabled.
*/
static int pt_buffer_reset_markers(struct pt_buffer *buf,
struct perf_output_handle *handle)
@@ -618,9 +682,6 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
unsigned long head = local64_read(&buf->head);
unsigned long idx, npages, wakeup;
- if (buf->snapshot)
- return 0;
-
/* can't stop in the middle of an output region */
if (buf->output_off + handle->size + 1 <
sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
@@ -674,7 +735,7 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
struct topa *cur = buf->first, *prev = buf->last;
struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
*te_prev = TOPA_ENTRY(prev, prev->last - 1);
- int pg = 0, idx = 0, ntopa = 0;
+ int pg = 0, idx = 0;
while (pg < buf->nr_pages) {
int tidx;
@@ -689,9 +750,9 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
/* advance to next topa table */
idx = 0;
cur = list_entry(cur->list.next, struct topa, list);
- ntopa++;
- } else
+ } else {
idx++;
+ }
te_cur = TOPA_ENTRY(cur, idx);
}
@@ -703,7 +764,14 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
* @head: Write pointer (aux_head) from AUX buffer.
*
* Find the ToPA table and entry corresponding to given @head and set buffer's
- * "current" pointers accordingly.
+ * "current" pointers accordingly. This is done after we have obtained the
+ * current aux_head position from a successful call to perf_aux_output_begin()
+ * to make sure the hardware is writing to the right place.
+ *
+ * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
+ * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
+ * which are used to determine INT and STOP markers' locations by a subsequent
+ * call to pt_buffer_reset_markers().
*/
static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
{
@@ -901,6 +969,7 @@ void intel_pt_interrupt(void)
}
pt_buffer_reset_offsets(buf, pt->handle.head);
+ /* snapshot counters don't use PMI, so it's safe */
ret = pt_buffer_reset_markers(buf, &pt->handle);
if (ret) {
perf_aux_output_end(&pt->handle, 0, true);
@@ -909,7 +978,6 @@ void intel_pt_interrupt(void)
pt_config_buffer(buf->cur->table, buf->cur_idx,
buf->output_off);
- wrmsrl(MSR_IA32_RTIT_STATUS, 0);
pt_config(event);
}
}
@@ -923,7 +991,7 @@ static void pt_event_start(struct perf_event *event, int mode)
struct pt *pt = this_cpu_ptr(&pt_ctx);
struct pt_buffer *buf = perf_get_aux(&pt->handle);
- if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+ if (!buf || pt_buffer_is_full(buf, pt)) {
event->hw.state = PERF_HES_STOPPED;
return;
}
@@ -933,7 +1001,6 @@ static void pt_event_start(struct perf_event *event, int mode)
pt_config_buffer(buf->cur->table, buf->cur_idx,
buf->output_off);
- wrmsrl(MSR_IA32_RTIT_STATUS, 0);
pt_config(event);
}
@@ -954,7 +1021,6 @@ static void pt_event_stop(struct perf_event *event, int mode)
event->hw.state = PERF_HES_STOPPED;
if (mode & PERF_EF_UPDATE) {
- struct pt *pt = this_cpu_ptr(&pt_ctx);
struct pt_buffer *buf = perf_get_aux(&pt->handle);
if (!buf)
@@ -1061,6 +1127,10 @@ static __init int pt_init(void)
int ret, cpu, prior_warn = 0;
BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+ return -ENODEV;
+
get_online_cpus();
for_each_online_cpu(cpu) {
u64 ctl;
@@ -1106,5 +1176,4 @@ static __init int pt_init(void)
return ret;
}
-
-module_init(pt_init);
+arch_initcall(pt_init);
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 358c54ad2..ed446bdcb 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -86,6 +86,10 @@ static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
1<<RAPL_IDX_RAM_NRG_STAT|\
1<<RAPL_IDX_PP1_NRG_STAT)
+/* Knights Landing has PKG, RAM */
+#define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT)
+
/*
* event code: LSB 8 bits, passed in attr->config
* any other bit is reserved
@@ -103,12 +107,6 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __rapl_##_var##_show, NULL)
-#define RAPL_EVENT_DESC(_name, _config) \
-{ \
- .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
- .config = _config, \
-}
-
#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
#define RAPL_EVENT_ATTR_STR(_name, v, str) \
@@ -204,9 +202,8 @@ again:
static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
- __hrtimer_start_range_ns(&pmu->hrtimer,
- pmu->timer_interval, 0,
- HRTIMER_MODE_REL_PINNED, 0);
+ hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
+ HRTIMER_MODE_REL_PINNED);
}
static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
@@ -487,6 +484,18 @@ static struct attribute *rapl_events_hsw_attr[] = {
NULL,
};
+static struct attribute *rapl_events_knl_attr[] = {
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_ram),
+
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_ram_unit),
+
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_ram_scale),
+ NULL,
+};
+
static struct attribute_group rapl_pmu_events_group = {
.name = "events",
.attrs = NULL, /* patched at runtime */
@@ -731,6 +740,10 @@ static int __init rapl_pmu_init(void)
rapl_cntr_mask = RAPL_IDX_SRV;
rapl_pmu_events_group.attrs = rapl_events_srv_attr;
break;
+ case 87: /* Knights Landing */
+ rapl_add_quirk(rapl_hsw_server_quirk);
+ rapl_cntr_mask = RAPL_IDX_KNL;
+ rapl_pmu_events_group.attrs = rapl_events_knl_attr;
default:
/* unsupported */
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 90b7c501c..61215a69b 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
static bool pcidrv_registered;
struct pci_driver *uncore_pci_driver;
/* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,59 @@ static struct event_constraint uncore_constraint_fixed =
struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+ struct pci2phy_map *map;
+ int phys_id = -1;
+
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == pci_domain_nr(bus)) {
+ phys_id = map->pbus_to_physid[bus->number];
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ return phys_id;
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+ struct pci2phy_map *map, *alloc = NULL;
+ int i;
+
+ lockdep_assert_held(&pci2phy_map_lock);
+
+lookup:
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == segment)
+ goto end;
+ }
+
+ if (!alloc) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+ raw_spin_lock(&pci2phy_map_lock);
+
+ if (!alloc)
+ return NULL;
+
+ goto lookup;
+ }
+
+ map = alloc;
+ alloc = NULL;
+ map->segment = segment;
+ for (i = 0; i < 256; i++)
+ map->pbus_to_physid[i] = -1;
+ list_add_tail(&map->list, &pci2phy_map_head);
+
+end:
+ kfree(alloc);
+ return map;
+}
+
ssize_t uncore_event_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -233,9 +287,8 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
{
- __hrtimer_start_range_ns(&box->hrtimer,
- ns_to_ktime(box->hrtimer_duration), 0,
- HRTIMER_MODE_REL_PINNED, 0);
+ hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
+ HRTIMER_MODE_REL_PINNED);
}
void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
@@ -810,7 +863,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
int phys_id;
bool first_box = false;
- phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+ phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
@@ -857,9 +910,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
- int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+ int i, cpu, phys_id;
bool last_box = false;
+ phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
@@ -912,6 +966,9 @@ static int __init uncore_pci_init(void)
case 63: /* Haswell-EP */
ret = hswep_uncore_pci_init();
break;
+ case 86: /* BDX-DE */
+ ret = bdx_uncore_pci_init();
+ break;
case 42: /* Sandy Bridge */
ret = snb_uncore_pci_init();
break;
@@ -922,6 +979,9 @@ static int __init uncore_pci_init(void)
case 69: /* Haswell Celeron */
ret = hsw_uncore_pci_init();
break;
+ case 61: /* Broadwell */
+ ret = bdw_uncore_pci_init();
+ break;
default:
return 0;
}
@@ -1207,6 +1267,11 @@ static int __init uncore_cpu_init(void)
break;
case 42: /* Sandy Bridge */
case 58: /* Ivy Bridge */
+ case 60: /* Haswell */
+ case 69: /* Haswell */
+ case 70: /* Haswell */
+ case 61: /* Broadwell */
+ case 71: /* Broadwell */
snb_uncore_cpu_init();
break;
case 45: /* Sandy Bridge-EP */
@@ -1222,6 +1287,9 @@ static int __init uncore_cpu_init(void)
case 63: /* Haswell-EP */
hswep_uncore_cpu_init();
break;
+ case 86: /* BDX-DE */
+ bdx_uncore_cpu_init();
+ break;
default:
return 0;
}
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index ceac8f5dc..2f0a4a98e 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,15 @@ struct uncore_event_desc {
const char *config;
};
+struct pci2phy_map {
+ struct list_head list;
+ int segment;
+ int pbus_to_physid[256];
+};
+
+int uncore_pcibus_to_physid(struct pci_bus *bus);
+struct pci2phy_map *__find_pci2phy_map(int segment);
+
ssize_t uncore_event_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
@@ -317,7 +326,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
extern struct intel_uncore_type **uncore_msr_uncores;
extern struct intel_uncore_type **uncore_pci_uncores;
extern struct pci_driver *uncore_pci_driver;
-extern int uncore_pcibus_to_physid[256];
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
extern struct event_constraint uncore_constraint_empty;
@@ -325,6 +335,7 @@ extern struct event_constraint uncore_constraint_empty;
int snb_uncore_pci_init(void);
int ivb_uncore_pci_init(void);
int hsw_uncore_pci_init(void);
+int bdw_uncore_pci_init(void);
void snb_uncore_cpu_init(void);
void nhm_uncore_cpu_init(void);
@@ -335,6 +346,8 @@ int ivbep_uncore_pci_init(void);
void ivbep_uncore_cpu_init(void);
int hswep_uncore_pci_init(void);
void hswep_uncore_cpu_init(void);
+int bdx_uncore_pci_init(void);
+void bdx_uncore_cpu_init(void);
/* perf_event_intel_uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 4562e9e22..845256158 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -7,6 +7,7 @@
#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00
#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04
+#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -44,6 +45,11 @@
#define SNB_UNC_CBO_0_PER_CTR0 0x706
#define SNB_UNC_CBO_MSR_OFFSET 0x10
+/* SNB ARB register */
+#define SNB_UNC_ARB_PER_CTR0 0x3b0
+#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2
+#define SNB_UNC_ARB_MSR_OFFSET 0x10
+
/* NHM global control register */
#define NHM_UNC_PERF_GLOBAL_CTL 0x391
#define NHM_UNC_FIXED_CTR 0x394
@@ -114,7 +120,7 @@ static struct intel_uncore_ops snb_uncore_msr_ops = {
.read_counter = uncore_msr_read_counter,
};
-static struct event_constraint snb_uncore_cbox_constraints[] = {
+static struct event_constraint snb_uncore_arb_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
EVENT_CONSTRAINT_END
@@ -133,14 +139,28 @@ static struct intel_uncore_type snb_uncore_cbox = {
.single_fixed = 1,
.event_mask = SNB_UNC_RAW_EVENT_MASK,
.msr_offset = SNB_UNC_CBO_MSR_OFFSET,
- .constraints = snb_uncore_cbox_constraints,
.ops = &snb_uncore_msr_ops,
.format_group = &snb_uncore_format_group,
.event_descs = snb_uncore_events,
};
+static struct intel_uncore_type snb_uncore_arb = {
+ .name = "arb",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .perf_ctr = SNB_UNC_ARB_PER_CTR0,
+ .event_ctl = SNB_UNC_ARB_PERFEVTSEL0,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_ARB_MSR_OFFSET,
+ .constraints = snb_uncore_arb_constraints,
+ .ops = &snb_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+};
+
static struct intel_uncore_type *snb_msr_uncores[] = {
&snb_uncore_cbox,
+ &snb_uncore_arb,
NULL,
};
@@ -400,15 +420,25 @@ static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
static int snb_pci2phy_map_init(int devid)
{
struct pci_dev *dev = NULL;
- int bus;
+ struct pci2phy_map *map;
+ int bus, segment;
dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
if (!dev)
return -ENOTTY;
bus = dev->bus->number;
-
- uncore_pcibus_to_physid[bus] = 0;
+ segment = pci_domain_nr(dev->bus);
+
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ pci_dev_put(dev);
+ return -ENOMEM;
+ }
+ map->pbus_to_physid[bus] = 0;
+ raw_spin_unlock(&pci2phy_map_lock);
pci_dev_put(dev);
@@ -486,6 +516,14 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
{ /* end: all zeroes */ },
};
+static const struct pci_device_id bdw_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
static struct pci_driver snb_uncore_pci_driver = {
.name = "snb_uncore",
.id_table = snb_uncore_pci_ids,
@@ -501,6 +539,11 @@ static struct pci_driver hsw_uncore_pci_driver = {
.id_table = hsw_uncore_pci_ids,
};
+static struct pci_driver bdw_uncore_pci_driver = {
+ .name = "bdw_uncore",
+ .id_table = bdw_uncore_pci_ids,
+};
+
struct imc_uncore_pci_dev {
__u32 pci_id;
struct pci_driver *driver;
@@ -514,6 +557,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */
IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */
+ IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */
{ /* end marker */ }
};
@@ -561,6 +605,11 @@ int hsw_uncore_pci_init(void)
return imc_uncore_pci_init();
}
+int bdw_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
/* end of Sandy Bridge uncore support */
/* Nehalem uncore support */
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 6d6e85dd5..f0f4fcba2 100644
--- a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -1087,7 +1087,8 @@ static struct pci_driver snbep_uncore_pci_driver = {
static int snbep_pci2phy_map_init(int devid)
{
struct pci_dev *ubox_dev = NULL;
- int i, bus, nodeid;
+ int i, bus, nodeid, segment;
+ struct pci2phy_map *map;
int err = 0;
u32 config = 0;
@@ -1106,16 +1107,27 @@ static int snbep_pci2phy_map_init(int devid)
err = pci_read_config_dword(ubox_dev, 0x54, &config);
if (err)
break;
+
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
+
/*
* every three bits in the Node ID mapping register maps
* to a particular node.
*/
for (i = 0; i < 8; i++) {
if (nodeid == ((config >> (3 * i)) & 0x7)) {
- uncore_pcibus_to_physid[bus] = i;
+ map->pbus_to_physid[bus] = i;
break;
}
}
+ raw_spin_unlock(&pci2phy_map_lock);
}
if (!err) {
@@ -1123,13 +1135,17 @@ static int snbep_pci2phy_map_init(int devid)
* For PCI bus with no UBOX device, find the next bus
* that has UBOX device and use its mapping.
*/
- i = -1;
- for (bus = 255; bus >= 0; bus--) {
- if (uncore_pcibus_to_physid[bus] >= 0)
- i = uncore_pcibus_to_physid[bus];
- else
- uncore_pcibus_to_physid[bus] = i;
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ i = -1;
+ for (bus = 255; bus >= 0; bus--) {
+ if (map->pbus_to_physid[bus] >= 0)
+ i = map->pbus_to_physid[bus];
+ else
+ map->pbus_to_physid[bus] = i;
+ }
}
+ raw_spin_unlock(&pci2phy_map_lock);
}
pci_dev_put(ubox_dev);
@@ -2215,7 +2231,7 @@ static struct intel_uncore_type *hswep_pci_uncores[] = {
NULL,
};
-static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = {
+static const struct pci_device_id hswep_uncore_pci_ids[] = {
{ /* Home Agent 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
@@ -2321,3 +2337,167 @@ int hswep_uncore_pci_init(void)
return 0;
}
/* end of Haswell-EP uncore support */
+
+/* BDX-DE uncore support */
+
+static struct intel_uncore_type bdx_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct event_constraint bdx_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = bdx_uncore_cbox_constraints,
+ .ops = &hswep_uncore_cbox_ops,
+ .format_group = &hswep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_type *bdx_msr_uncores[] = {
+ &bdx_uncore_ubox,
+ &bdx_uncore_cbox,
+ &hswep_uncore_pcu,
+ NULL,
+};
+
+void bdx_uncore_cpu_init(void)
+{
+ if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ uncore_msr_uncores = bdx_msr_uncores;
+}
+
+static struct intel_uncore_type bdx_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_imc = {
+ .name = "imc",
+ .num_counters = 5,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &hswep_uncore_irp_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+
+static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = bdx_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ BDX_PCI_UNCORE_HA,
+ BDX_PCI_UNCORE_IMC,
+ BDX_PCI_UNCORE_IRP,
+ BDX_PCI_UNCORE_R2PCIE,
+};
+
+static struct intel_uncore_type *bdx_pci_uncores[] = {
+ [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha,
+ [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc,
+ [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp,
+ [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
+ NULL,
+};
+
+static const struct pci_device_id bdx_uncore_pci_ids[] = {
+ { /* Home Agent 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
+ },
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver bdx_uncore_pci_driver = {
+ .name = "bdx_uncore",
+ .id_table = bdx_uncore_pci_ids,
+};
+
+int bdx_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x6f1e);
+
+ if (ret)
+ return ret;
+ uncore_pci_uncores = bdx_pci_uncores;
+ uncore_pci_driver = &bdx_uncore_pci_driver;
+ return 0;
+}
+
+/* end of BDX-DE uncore support */
diff --git a/kernel/arch/x86/kernel/cpu/perf_event_msr.c b/kernel/arch/x86/kernel/cpu/perf_event_msr.c
new file mode 100644
index 000000000..ec863b9a9
--- /dev/null
+++ b/kernel/arch/x86/kernel/cpu/perf_event_msr.c
@@ -0,0 +1,241 @@
+#include <linux/perf_event.h>
+
+enum perf_msr_id {
+ PERF_MSR_TSC = 0,
+ PERF_MSR_APERF = 1,
+ PERF_MSR_MPERF = 2,
+ PERF_MSR_PPERF = 3,
+ PERF_MSR_SMI = 4,
+
+ PERF_MSR_EVENT_MAX,
+};
+
+static bool test_aperfmperf(int idx)
+{
+ return boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+static bool test_intel(int idx)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return false;
+
+ switch (boot_cpu_data.x86_model) {
+ case 30: /* 45nm Nehalem */
+ case 26: /* 45nm Nehalem-EP */
+ case 46: /* 45nm Nehalem-EX */
+
+ case 37: /* 32nm Westmere */
+ case 44: /* 32nm Westmere-EP */
+ case 47: /* 32nm Westmere-EX */
+
+ case 42: /* 32nm SandyBridge */
+ case 45: /* 32nm SandyBridge-E/EN/EP */
+
+ case 58: /* 22nm IvyBridge */
+ case 62: /* 22nm IvyBridge-EP/EX */
+
+ case 60: /* 22nm Haswell Core */
+ case 63: /* 22nm Haswell Server */
+ case 69: /* 22nm Haswell ULT */
+ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+ case 79: /* 14nm Broadwell Server */
+
+ case 55: /* 22nm Atom "Silvermont" */
+ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+ case 76: /* 14nm Atom "Airmont" */
+ if (idx == PERF_MSR_SMI)
+ return true;
+ break;
+
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
+ return true;
+ break;
+ }
+
+ return false;
+}
+
+struct perf_msr {
+ u64 msr;
+ struct perf_pmu_events_attr *attr;
+ bool (*test)(int idx);
+};
+
+PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
+PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04");
+
+static struct perf_msr msr[] = {
+ [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, },
+ [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, },
+ [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, },
+ [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, },
+ [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, },
+};
+
+static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
+ NULL,
+};
+
+static struct attribute_group events_attr_group = {
+ .name = "events",
+ .attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+static struct attribute_group format_attr_group = {
+ .name = "format",
+ .attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &events_attr_group,
+ &format_attr_group,
+ NULL,
+};
+
+static int msr_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ if (cfg >= PERF_MSR_EVENT_MAX)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ if (!msr[cfg].attr)
+ return -EINVAL;
+
+ event->hw.idx = -1;
+ event->hw.event_base = msr[cfg].msr;
+ event->hw.config = cfg;
+
+ return 0;
+}
+
+static inline u64 msr_read_counter(struct perf_event *event)
+{
+ u64 now;
+
+ if (event->hw.event_base)
+ rdmsrl(event->hw.event_base, now);
+ else
+ rdtscll(now);
+
+ return now;
+}
+static void msr_event_update(struct perf_event *event)
+{
+ u64 prev, now;
+ s64 delta;
+
+ /* Careful, an NMI might modify the previous event value. */
+again:
+ prev = local64_read(&event->hw.prev_count);
+ now = msr_read_counter(event);
+
+ if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
+ goto again;
+
+ delta = now - prev;
+ if (unlikely(event->hw.event_base == MSR_SMI_COUNT))
+ delta = sign_extend64(delta, 31);
+
+ local64_add(now - prev, &event->count);
+}
+
+static void msr_event_start(struct perf_event *event, int flags)
+{
+ u64 now;
+
+ now = msr_read_counter(event);
+ local64_set(&event->hw.prev_count, now);
+}
+
+static void msr_event_stop(struct perf_event *event, int flags)
+{
+ msr_event_update(event);
+}
+
+static void msr_event_del(struct perf_event *event, int flags)
+{
+ msr_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int msr_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ msr_event_start(event, flags);
+
+ return 0;
+}
+
+static struct pmu pmu_msr = {
+ .task_ctx_nr = perf_sw_context,
+ .attr_groups = attr_groups,
+ .event_init = msr_event_init,
+ .add = msr_event_add,
+ .del = msr_event_del,
+ .start = msr_event_start,
+ .stop = msr_event_stop,
+ .read = msr_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static int __init msr_init(void)
+{
+ int i, j = 0;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC)) {
+ pr_cont("no MSR PMU driver.\n");
+ return 0;
+ }
+
+ /* Probe the MSRs. */
+ for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
+ u64 val;
+
+ /*
+ * Virt sucks arse; you cannot tell if a R/O MSR is present :/
+ */
+ if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+ msr[i].attr = NULL;
+ }
+
+ /* List remaining MSRs in the sysfs attrs. */
+ for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
+ if (msr[i].attr)
+ events_attrs[j++] = &msr[i].attr->attr.attr;
+ }
+ events_attrs[j] = NULL;
+
+ perf_pmu_register(&pmu_msr, "msr", -1);
+
+ return 0;
+}
+device_initcall(msr_init);
diff --git a/kernel/arch/x86/kernel/cpu/proc.c b/kernel/arch/x86/kernel/cpu/proc.c
index e7d8c7608..18ca99f27 100644
--- a/kernel/arch/x86/kernel/cpu/proc.c
+++ b/kernel/arch/x86/kernel/cpu/proc.c
@@ -12,7 +12,8 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
{
#ifdef CONFIG_SMP
seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
- seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu)));
+ seq_printf(m, "siblings\t: %d\n",
+ cpumask_weight(topology_core_cpumask(cpu)));
seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
seq_printf(m, "apicid\t\t: %d\n", c->apicid);
diff --git a/kernel/arch/x86/kernel/cpu/scattered.c b/kernel/arch/x86/kernel/cpu/scattered.c
index 3d423a101..608fb26c7 100644
--- a/kernel/arch/x86/kernel/cpu/scattered.c
+++ b/kernel/arch/x86/kernel/cpu/scattered.c
@@ -37,7 +37,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
{ X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
{ X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 },
- { X86_FEATURE_HWP_NOITFY, CR_EAX, 8, 0x00000006, 0 },
+ { X86_FEATURE_HWP_NOTIFY, CR_EAX, 8, 0x00000006, 0 },
{ X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 },
{ X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 },
{ X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 },