author | Yunhong Jiang <yunhong.jiang@intel.com> | 2015-08-04 12:17:53 -0700
---|---|---
committer | Yunhong Jiang <yunhong.jiang@intel.com> | 2015-08-04 15:44:42 -0700
commit | 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 (patch) |
tree | 1c9cafbcd35f783a87880a10f85d1a060db1a563 /kernel/arch/x86/oprofile |
parent | 98260f3884f4a202f9ca5eabed40b1354c489b29 (diff) |
Add the rt linux 4.1.3-rt3 as base
Import the RT Linux 4.1.3-rt3 tree as the OPNFV KVM base.
It is taken from the linux-4.1.y-rt branch of
git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git; the base commit is:
commit 0917f823c59692d751951bf5ea699a2d1e2f26a2
Author: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat Jul 25 12:13:34 2015 +0200
Prepare v4.1.3-rt3
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
We lose all of the git history this way, which is not ideal. A separate
OPNFV project repository should be used for this in the future.
Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423
Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Diffstat (limited to 'kernel/arch/x86/oprofile')
-rw-r--r-- | kernel/arch/x86/oprofile/Makefile | 11
-rw-r--r-- | kernel/arch/x86/oprofile/backtrace.c | 127
-rw-r--r-- | kernel/arch/x86/oprofile/init.c | 38
-rw-r--r-- | kernel/arch/x86/oprofile/nmi_int.c | 802
-rw-r--r-- | kernel/arch/x86/oprofile/op_counter.h | 30
-rw-r--r-- | kernel/arch/x86/oprofile/op_model_amd.c | 543
-rw-r--r-- | kernel/arch/x86/oprofile/op_model_p4.c | 723
-rw-r--r-- | kernel/arch/x86/oprofile/op_model_ppro.c | 245
-rw-r--r-- | kernel/arch/x86/oprofile/op_x86_model.h | 90
9 files changed, 2609 insertions(+), 0 deletions(-)
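One detail worth calling out before the diff bodies: the imported op_model_amd.c randomizes the IBS op counter with a 16-bit linear feedback shift register whose feedback polynomial is x^16 + x^14 + x^13 + x^11 + 1 (the superscripts in the original comment were flattened by extraction). Below is a minimal standalone sketch of that generator for reference; the lfsr_random() body, its 0xF00D seed, and its tap positions are copied from the imported source, while the main() driver and the printing are illustrative additions only, not part of the kernel code.

```c
#include <stdio.h>

/*
 * 16-bit LFSR as used by op_model_amd.c for IBS software randomization.
 * Feedback polynomial: x^16 + x^14 + x^13 + x^11 + 1
 * (taps at bits 0, 2, 3 and 5 of the current register value).
 */
static unsigned int lfsr_random(void)
{
	static unsigned int lfsr_value = 0xF00D;	/* non-zero seed */
	unsigned int bit;

	/* Compute the next bit to shift in */
	bit = ((lfsr_value >> 0) ^
	       (lfsr_value >> 2) ^
	       (lfsr_value >> 3) ^
	       (lfsr_value >> 5)) & 0x0001;

	/* Advance to the next register value */
	lfsr_value = (lfsr_value >> 1) | (bit << 15);

	return lfsr_value;
}

int main(void)
{
	/* Illustrative driver: print a few pseudo-random values */
	for (int i = 0; i < 8; i++)
		printf("0x%04x\n", lfsr_random());
	return 0;
}
```

In the driver below, the low 12 bits of this sequence perturb IbsOpCurCnt (or, on hardware that cannot write IbsOpCurCnt, a signed 8-bit slice perturbs IbsOpMaxCnt), which spreads IBS samples out and avoids lockstep sampling artifacts.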
diff --git a/kernel/arch/x86/oprofile/Makefile b/kernel/arch/x86/oprofile/Makefile new file mode 100644 index 000000000..1599f568f --- /dev/null +++ b/kernel/arch/x86/oprofile/Makefile @@ -0,0 +1,11 @@ +obj-$(CONFIG_OPROFILE) += oprofile.o + +DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ + oprof.o cpu_buffer.o buffer_sync.o \ + event_buffer.o oprofile_files.o \ + oprofilefs.o oprofile_stats.o \ + timer_int.o nmi_timer_int.o ) + +oprofile-y := $(DRIVER_OBJS) init.o backtrace.o +oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ + op_model_ppro.o op_model_p4.o diff --git a/kernel/arch/x86/oprofile/backtrace.c b/kernel/arch/x86/oprofile/backtrace.c new file mode 100644 index 000000000..4e664bdb5 --- /dev/null +++ b/kernel/arch/x86/oprofile/backtrace.c @@ -0,0 +1,127 @@ +/** + * @file backtrace.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author David Smith + */ + +#include <linux/oprofile.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/compat.h> +#include <linux/uaccess.h> + +#include <asm/ptrace.h> +#include <asm/stacktrace.h> + +static int backtrace_stack(void *data, char *name) +{ + /* Yes, we want all stacks */ + return 0; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + unsigned int *depth = data; + + if ((*depth)--) + oprofile_add_trace(addr); +} + +static struct stacktrace_ops backtrace_ops = { + .stack = backtrace_stack, + .address = backtrace_address, + .walk_stack = print_context_stack, +}; + +#ifdef CONFIG_COMPAT +static struct stack_frame_ia32 * +dump_user_backtrace_32(struct stack_frame_ia32 *head) +{ + /* Also check accessibility of one struct frame_head beyond: */ + struct stack_frame_ia32 bufhead[2]; + struct stack_frame_ia32 *fp; + unsigned long bytes; + + bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); + if (bytes != 0) + return NULL; + + fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); + + oprofile_add_trace(bufhead[0].return_address); + + /* frame pointers should strictly progress back up the stack + * (towards higher addresses) */ + if (head >= fp) + return NULL; + + return fp; +} + +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + struct stack_frame_ia32 *head; + + /* User process is IA32 */ + if (!current || !test_thread_flag(TIF_IA32)) + return 0; + + head = (struct stack_frame_ia32 *) regs->bp; + while (depth-- && head) + head = dump_user_backtrace_32(head); + + return 1; +} + +#else +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + return 0; +} +#endif /* CONFIG_COMPAT */ + +static struct stack_frame *dump_user_backtrace(struct stack_frame *head) +{ + /* Also check accessibility of one struct frame_head beyond: */ + struct stack_frame bufhead[2]; + unsigned long bytes; + + bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); + if (bytes != 0) + return NULL; + + oprofile_add_trace(bufhead[0].return_address); + + /* frame pointers should strictly progress back up the stack + * (towards higher addresses) */ + if (head >= bufhead[0].next_frame) + return NULL; + + return bufhead[0].next_frame; +} + +void +x86_backtrace(struct pt_regs * const regs, unsigned int depth) +{ + struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); + + if (!user_mode(regs)) { + unsigned long stack = kernel_stack_pointer(regs); + if (depth) + dump_trace(NULL, regs, (unsigned long *)stack, 0, + &backtrace_ops, 
&depth); + return; + } + + if (x86_backtrace_32(regs, depth)) + return; + + while (depth-- && head) + head = dump_user_backtrace(head); +} diff --git a/kernel/arch/x86/oprofile/init.c b/kernel/arch/x86/oprofile/init.c new file mode 100644 index 000000000..9e138d00a --- /dev/null +++ b/kernel/arch/x86/oprofile/init.c @@ -0,0 +1,38 @@ +/** + * @file init.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon <levon@movementarian.org> + */ + +#include <linux/oprofile.h> +#include <linux/init.h> +#include <linux/errno.h> + +/* + * We support CPUs that have performance counters like the Pentium Pro + * with the NMI mode driver. + */ + +#ifdef CONFIG_X86_LOCAL_APIC +extern int op_nmi_init(struct oprofile_operations *ops); +extern void op_nmi_exit(void); +#else +static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; } +static void op_nmi_exit(void) { } +#endif + +extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); + +int __init oprofile_arch_init(struct oprofile_operations *ops) +{ + ops->backtrace = x86_backtrace; + return op_nmi_init(ops); +} + +void oprofile_arch_exit(void) +{ + op_nmi_exit(); +} diff --git a/kernel/arch/x86/oprofile/nmi_int.c b/kernel/arch/x86/oprofile/nmi_int.c new file mode 100644 index 000000000..1d2e6392f --- /dev/null +++ b/kernel/arch/x86/oprofile/nmi_int.c @@ -0,0 +1,802 @@ +/** + * @file nmi_int.c + * + * @remark Copyright 2002-2009 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon <levon@movementarian.org> + * @author Robert Richter <robert.richter@amd.com> + * @author Barry Kasindorf <barry.kasindorf@amd.com> + * @author Jason Yeh <jason.yeh@amd.com> + * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> + */ + +#include <linux/init.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/oprofile.h> +#include <linux/syscore_ops.h> +#include <linux/slab.h> +#include <linux/moduleparam.h> +#include <linux/kdebug.h> +#include <linux/cpu.h> +#include <asm/nmi.h> +#include <asm/msr.h> +#include <asm/apic.h> + +#include "op_counter.h" +#include "op_x86_model.h" + +static struct op_x86_model_spec *model; +static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); +static DEFINE_PER_CPU(unsigned long, saved_lvtpc); + +/* must be protected with get_online_cpus()/put_online_cpus(): */ +static int nmi_enabled; +static int ctr_running; + +struct op_counter_config counter_config[OP_MAX_COUNTER]; + +/* common functions */ + +u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config) +{ + u64 val = 0; + u16 event = (u16)counter_config->event; + + val |= ARCH_PERFMON_EVENTSEL_INT; + val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; + val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; + val |= (counter_config->unit_mask & 0xFF) << 8; + counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV | + ARCH_PERFMON_EVENTSEL_EDGE | + ARCH_PERFMON_EVENTSEL_CMASK); + val |= counter_config->extra; + event &= model->event_mask ? 
model->event_mask : 0xFF; + val |= event & 0xFF; + val |= (u64)(event & 0x0F00) << 24; + + return val; +} + + +static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs) +{ + if (ctr_running) + model->check_ctrs(regs, this_cpu_ptr(&cpu_msrs)); + else if (!nmi_enabled) + return NMI_DONE; + else + model->stop(this_cpu_ptr(&cpu_msrs)); + return NMI_HANDLED; +} + +static void nmi_cpu_save_registers(struct op_msrs *msrs) +{ + struct op_msr *counters = msrs->counters; + struct op_msr *controls = msrs->controls; + unsigned int i; + + for (i = 0; i < model->num_counters; ++i) { + if (counters[i].addr) + rdmsrl(counters[i].addr, counters[i].saved); + } + + for (i = 0; i < model->num_controls; ++i) { + if (controls[i].addr) + rdmsrl(controls[i].addr, controls[i].saved); + } +} + +static void nmi_cpu_start(void *dummy) +{ + struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs); + if (!msrs->controls) + WARN_ON_ONCE(1); + else + model->start(msrs); +} + +static int nmi_start(void) +{ + get_online_cpus(); + ctr_running = 1; + /* make ctr_running visible to the nmi handler: */ + smp_mb(); + on_each_cpu(nmi_cpu_start, NULL, 1); + put_online_cpus(); + return 0; +} + +static void nmi_cpu_stop(void *dummy) +{ + struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs); + if (!msrs->controls) + WARN_ON_ONCE(1); + else + model->stop(msrs); +} + +static void nmi_stop(void) +{ + get_online_cpus(); + on_each_cpu(nmi_cpu_stop, NULL, 1); + ctr_running = 0; + put_online_cpus(); +} + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static DEFINE_PER_CPU(int, switch_index); + +static inline int has_mux(void) +{ + return !!model->switch_ctrl; +} + +inline int op_x86_phys_to_virt(int phys) +{ + return __this_cpu_read(switch_index) + phys; +} + +inline int op_x86_virt_to_phys(int virt) +{ + return virt % model->num_counters; +} + +static void nmi_shutdown_mux(void) +{ + int i; + + if (!has_mux()) + return; + + for_each_possible_cpu(i) { + kfree(per_cpu(cpu_msrs, i).multiplex); + per_cpu(cpu_msrs, i).multiplex = NULL; + per_cpu(switch_index, i) = 0; + } +} + +static int nmi_setup_mux(void) +{ + size_t multiplex_size = + sizeof(struct op_msr) * model->num_virt_counters; + int i; + + if (!has_mux()) + return 1; + + for_each_possible_cpu(i) { + per_cpu(cpu_msrs, i).multiplex = + kzalloc(multiplex_size, GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).multiplex) + return 0; + } + + return 1; +} + +static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) +{ + int i; + struct op_msr *multiplex = msrs->multiplex; + + if (!has_mux()) + return; + + for (i = 0; i < model->num_virt_counters; ++i) { + if (counter_config[i].enabled) { + multiplex[i].saved = -(u64)counter_config[i].count; + } else { + multiplex[i].saved = 0; + } + } + + per_cpu(switch_index, cpu) = 0; +} + +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *counters = msrs->counters; + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (counters[i].addr) + rdmsrl(counters[i].addr, multiplex[virt].saved); + } +} + +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *counters = msrs->counters; + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (counters[i].addr) + wrmsrl(counters[i].addr, multiplex[virt].saved); + } +} + +static void nmi_cpu_switch(void *dummy) +{ + int cpu = smp_processor_id(); + int si = 
per_cpu(switch_index, cpu); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + + nmi_cpu_stop(NULL); + nmi_cpu_save_mpx_registers(msrs); + + /* move to next set */ + si += model->num_counters; + if ((si >= model->num_virt_counters) || (counter_config[si].count == 0)) + per_cpu(switch_index, cpu) = 0; + else + per_cpu(switch_index, cpu) = si; + + model->switch_ctrl(model, msrs); + nmi_cpu_restore_mpx_registers(msrs); + + nmi_cpu_start(NULL); +} + + +/* + * Quick check to see if multiplexing is necessary. + * The check should be sufficient since counters are used + * in ordre. + */ +static int nmi_multiplex_on(void) +{ + return counter_config[model->num_counters].count ? 0 : -EINVAL; +} + +static int nmi_switch_event(void) +{ + if (!has_mux()) + return -ENOSYS; /* not implemented */ + if (nmi_multiplex_on() < 0) + return -EINVAL; /* not necessary */ + + get_online_cpus(); + if (ctr_running) + on_each_cpu(nmi_cpu_switch, NULL, 1); + put_online_cpus(); + + return 0; +} + +static inline void mux_init(struct oprofile_operations *ops) +{ + if (has_mux()) + ops->switch_events = nmi_switch_event; +} + +static void mux_clone(int cpu) +{ + if (!has_mux()) + return; + + memcpy(per_cpu(cpu_msrs, cpu).multiplex, + per_cpu(cpu_msrs, 0).multiplex, + sizeof(struct op_msr) * model->num_virt_counters); +} + +#else + +inline int op_x86_phys_to_virt(int phys) { return phys; } +inline int op_x86_virt_to_phys(int virt) { return virt; } +static inline void nmi_shutdown_mux(void) { } +static inline int nmi_setup_mux(void) { return 1; } +static inline void +nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } +static inline void mux_init(struct oprofile_operations *ops) { } +static void mux_clone(int cpu) { } + +#endif + +static void free_msrs(void) +{ + int i; + for_each_possible_cpu(i) { + kfree(per_cpu(cpu_msrs, i).counters); + per_cpu(cpu_msrs, i).counters = NULL; + kfree(per_cpu(cpu_msrs, i).controls); + per_cpu(cpu_msrs, i).controls = NULL; + } + nmi_shutdown_mux(); +} + +static int allocate_msrs(void) +{ + size_t controls_size = sizeof(struct op_msr) * model->num_controls; + size_t counters_size = sizeof(struct op_msr) * model->num_counters; + + int i; + for_each_possible_cpu(i) { + per_cpu(cpu_msrs, i).counters = kzalloc(counters_size, + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).counters) + goto fail; + per_cpu(cpu_msrs, i).controls = kzalloc(controls_size, + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).controls) + goto fail; + } + + if (!nmi_setup_mux()) + goto fail; + + return 1; + +fail: + free_msrs(); + return 0; +} + +static void nmi_cpu_setup(void *dummy) +{ + int cpu = smp_processor_id(); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + nmi_cpu_save_registers(msrs); + raw_spin_lock(&oprofilefs_lock); + model->setup_ctrs(model, msrs); + nmi_cpu_setup_mux(cpu, msrs); + raw_spin_unlock(&oprofilefs_lock); + per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); + apic_write(APIC_LVTPC, APIC_DM_NMI); +} + +static void nmi_cpu_restore_registers(struct op_msrs *msrs) +{ + struct op_msr *counters = msrs->counters; + struct op_msr *controls = msrs->controls; + unsigned int i; + + for (i = 0; i < model->num_controls; ++i) { + if (controls[i].addr) + wrmsrl(controls[i].addr, controls[i].saved); + } + + for (i = 0; i < model->num_counters; ++i) { + if (counters[i].addr) + wrmsrl(counters[i].addr, counters[i].saved); + } +} + +static void nmi_cpu_shutdown(void *dummy) +{ + unsigned int v; + int cpu = smp_processor_id(); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + + /* restoring APIC_LVTPC can 
trigger an apic error because the delivery + * mode and vector nr combination can be illegal. That's by design: on + * power on apic lvt contain a zero vector nr which are legal only for + * NMI delivery mode. So inhibit apic err before restoring lvtpc + */ + v = apic_read(APIC_LVTERR); + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); + apic_write(APIC_LVTERR, v); + nmi_cpu_restore_registers(msrs); +} + +static void nmi_cpu_up(void *dummy) +{ + if (nmi_enabled) + nmi_cpu_setup(dummy); + if (ctr_running) + nmi_cpu_start(dummy); +} + +static void nmi_cpu_down(void *dummy) +{ + if (ctr_running) + nmi_cpu_stop(dummy); + if (nmi_enabled) + nmi_cpu_shutdown(dummy); +} + +static int nmi_create_files(struct dentry *root) +{ + unsigned int i; + + for (i = 0; i < model->num_virt_counters; ++i) { + struct dentry *dir; + char buf[4]; + + /* quick little hack to _not_ expose a counter if it is not + * available for use. This should protect userspace app. + * NOTE: assumes 1:1 mapping here (that counters are organized + * sequentially in their struct assignment). + */ + if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i))) + continue; + + snprintf(buf, sizeof(buf), "%d", i); + dir = oprofilefs_mkdir(root, buf); + oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled); + oprofilefs_create_ulong(dir, "event", &counter_config[i].event); + oprofilefs_create_ulong(dir, "count", &counter_config[i].count); + oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask); + oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel); + oprofilefs_create_ulong(dir, "user", &counter_config[i].user); + oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra); + } + + return 0; +} + +static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, + void *data) +{ + int cpu = (unsigned long)data; + switch (action) { + case CPU_DOWN_FAILED: + case CPU_ONLINE: + smp_call_function_single(cpu, nmi_cpu_up, NULL, 0); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, nmi_cpu_down, NULL, 1); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block oprofile_cpu_nb = { + .notifier_call = oprofile_cpu_notifier +}; + +static int nmi_setup(void) +{ + int err = 0; + int cpu; + + if (!allocate_msrs()) + return -ENOMEM; + + /* We need to serialize save and setup for HT because the subset + * of msrs are distinct for save and setup operations + */ + + /* Assume saved/restored counters are the same on all CPUs */ + err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); + if (err) + goto fail; + + for_each_possible_cpu(cpu) { + if (!cpu) + continue; + + memcpy(per_cpu(cpu_msrs, cpu).counters, + per_cpu(cpu_msrs, 0).counters, + sizeof(struct op_msr) * model->num_counters); + + memcpy(per_cpu(cpu_msrs, cpu).controls, + per_cpu(cpu_msrs, 0).controls, + sizeof(struct op_msr) * model->num_controls); + + mux_clone(cpu); + } + + nmi_enabled = 0; + ctr_running = 0; + /* make variables visible to the nmi handler: */ + smp_mb(); + err = register_nmi_handler(NMI_LOCAL, profile_exceptions_notify, + 0, "oprofile"); + if (err) + goto fail; + + cpu_notifier_register_begin(); + + /* Use get/put_online_cpus() to protect 'nmi_enabled' */ + get_online_cpus(); + nmi_enabled = 1; + /* make nmi_enabled visible to the nmi handler: */ + smp_mb(); + on_each_cpu(nmi_cpu_setup, NULL, 1); + __register_cpu_notifier(&oprofile_cpu_nb); + put_online_cpus(); + + cpu_notifier_register_done(); + + return 0; +fail: + free_msrs(); + 
return err; +} + +static void nmi_shutdown(void) +{ + struct op_msrs *msrs; + + cpu_notifier_register_begin(); + + /* Use get/put_online_cpus() to protect 'nmi_enabled' & 'ctr_running' */ + get_online_cpus(); + on_each_cpu(nmi_cpu_shutdown, NULL, 1); + nmi_enabled = 0; + ctr_running = 0; + __unregister_cpu_notifier(&oprofile_cpu_nb); + put_online_cpus(); + + cpu_notifier_register_done(); + + /* make variables visible to the nmi handler: */ + smp_mb(); + unregister_nmi_handler(NMI_LOCAL, "oprofile"); + msrs = &get_cpu_var(cpu_msrs); + model->shutdown(msrs); + free_msrs(); + put_cpu_var(cpu_msrs); +} + +#ifdef CONFIG_PM + +static int nmi_suspend(void) +{ + /* Only one CPU left, just stop that one */ + if (nmi_enabled == 1) + nmi_cpu_stop(NULL); + return 0; +} + +static void nmi_resume(void) +{ + if (nmi_enabled == 1) + nmi_cpu_start(NULL); +} + +static struct syscore_ops oprofile_syscore_ops = { + .resume = nmi_resume, + .suspend = nmi_suspend, +}; + +static void __init init_suspend_resume(void) +{ + register_syscore_ops(&oprofile_syscore_ops); +} + +static void exit_suspend_resume(void) +{ + unregister_syscore_ops(&oprofile_syscore_ops); +} + +#else + +static inline void init_suspend_resume(void) { } +static inline void exit_suspend_resume(void) { } + +#endif /* CONFIG_PM */ + +static int __init p4_init(char **cpu_type) +{ + __u8 cpu_model = boot_cpu_data.x86_model; + + if (cpu_model > 6 || cpu_model == 5) + return 0; + +#ifndef CONFIG_SMP + *cpu_type = "i386/p4"; + model = &op_p4_spec; + return 1; +#else + switch (smp_num_siblings) { + case 1: + *cpu_type = "i386/p4"; + model = &op_p4_spec; + return 1; + + case 2: + *cpu_type = "i386/p4-ht"; + model = &op_p4_ht2_spec; + return 1; + } +#endif + + printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n"); + printk(KERN_INFO "oprofile: Reverting to timer mode.\n"); + return 0; +} + +enum __force_cpu_type { + reserved = 0, /* do not force */ + timer, + arch_perfmon, +}; + +static int force_cpu_type; + +static int set_cpu_type(const char *str, struct kernel_param *kp) +{ + if (!strcmp(str, "timer")) { + force_cpu_type = timer; + printk(KERN_INFO "oprofile: forcing NMI timer mode\n"); + } else if (!strcmp(str, "arch_perfmon")) { + force_cpu_type = arch_perfmon; + printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); + } else { + force_cpu_type = 0; + } + + return 0; +} +module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0); + +static int __init ppro_init(char **cpu_type) +{ + __u8 cpu_model = boot_cpu_data.x86_model; + struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ + + if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon) + return 0; + + /* + * Documentation on identifying Intel processors by CPU family + * and model can be found in the Intel Software Developer's + * Manuals (SDM): + * + * http://www.intel.com/products/processor/manuals/ + * + * As of May 2010 the documentation for this was in the: + * "Intel 64 and IA-32 Architectures Software Developer's + * Manual Volume 3B: System Programming Guide", "Table B-1 + * CPUID Signature Values of DisplayFamily_DisplayModel". + */ + switch (cpu_model) { + case 0 ... 2: + *cpu_type = "i386/ppro"; + break; + case 3 ... 5: + *cpu_type = "i386/pii"; + break; + case 6 ... 8: + case 10 ... 
11: + *cpu_type = "i386/piii"; + break; + case 9: + case 13: + *cpu_type = "i386/p6_mobile"; + break; + case 14: + *cpu_type = "i386/core"; + break; + case 0x0f: + case 0x16: + case 0x17: + case 0x1d: + *cpu_type = "i386/core_2"; + break; + case 0x1a: + case 0x1e: + case 0x2e: + spec = &op_arch_perfmon_spec; + *cpu_type = "i386/core_i7"; + break; + case 0x1c: + *cpu_type = "i386/atom"; + break; + default: + /* Unknown */ + return 0; + } + + model = spec; + return 1; +} + +int __init op_nmi_init(struct oprofile_operations *ops) +{ + __u8 vendor = boot_cpu_data.x86_vendor; + __u8 family = boot_cpu_data.x86; + char *cpu_type = NULL; + int ret = 0; + + if (!cpu_has_apic) + return -ENODEV; + + if (force_cpu_type == timer) + return -ENODEV; + + switch (vendor) { + case X86_VENDOR_AMD: + /* Needs to be at least an Athlon (or hammer in 32bit mode) */ + + switch (family) { + case 6: + cpu_type = "i386/athlon"; + break; + case 0xf: + /* + * Actually it could be i386/hammer too, but + * give user space an consistent name. + */ + cpu_type = "x86-64/hammer"; + break; + case 0x10: + cpu_type = "x86-64/family10"; + break; + case 0x11: + cpu_type = "x86-64/family11h"; + break; + case 0x12: + cpu_type = "x86-64/family12h"; + break; + case 0x14: + cpu_type = "x86-64/family14h"; + break; + case 0x15: + cpu_type = "x86-64/family15h"; + break; + default: + return -ENODEV; + } + model = &op_amd_spec; + break; + + case X86_VENDOR_INTEL: + switch (family) { + /* Pentium IV */ + case 0xf: + p4_init(&cpu_type); + break; + + /* A P6-class processor */ + case 6: + ppro_init(&cpu_type); + break; + + default: + break; + } + + if (cpu_type) + break; + + if (!cpu_has_arch_perfmon) + return -ENODEV; + + /* use arch perfmon as fallback */ + cpu_type = "i386/arch_perfmon"; + model = &op_arch_perfmon_spec; + break; + + default: + return -ENODEV; + } + + /* default values, can be overwritten by model */ + ops->create_files = nmi_create_files; + ops->setup = nmi_setup; + ops->shutdown = nmi_shutdown; + ops->start = nmi_start; + ops->stop = nmi_stop; + ops->cpu_type = cpu_type; + + if (model->init) + ret = model->init(ops); + if (ret) + return ret; + + if (!model->num_virt_counters) + model->num_virt_counters = model->num_counters; + + mux_init(ops); + + init_suspend_resume(); + + printk(KERN_INFO "oprofile: using NMI interrupt.\n"); + return 0; +} + +void op_nmi_exit(void) +{ + exit_suspend_resume(); +} diff --git a/kernel/arch/x86/oprofile/op_counter.h b/kernel/arch/x86/oprofile/op_counter.h new file mode 100644 index 000000000..0b7b7b179 --- /dev/null +++ b/kernel/arch/x86/oprofile/op_counter.h @@ -0,0 +1,30 @@ +/** + * @file op_counter.h + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + */ + +#ifndef OP_COUNTER_H +#define OP_COUNTER_H + +#define OP_MAX_COUNTER 32 + +/* Per-perfctr configuration as set via + * oprofilefs. 
+ */ +struct op_counter_config { + unsigned long count; + unsigned long enabled; + unsigned long event; + unsigned long kernel; + unsigned long user; + unsigned long unit_mask; + unsigned long extra; +}; + +extern struct op_counter_config counter_config[]; + +#endif /* OP_COUNTER_H */ diff --git a/kernel/arch/x86/oprofile/op_model_amd.c b/kernel/arch/x86/oprofile/op_model_amd.c new file mode 100644 index 000000000..50d86c0e9 --- /dev/null +++ b/kernel/arch/x86/oprofile/op_model_amd.c @@ -0,0 +1,543 @@ +/* + * @file op_model_amd.c + * athlon / K7 / K8 / Family 10h model-specific MSR operations + * + * @remark Copyright 2002-2009 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author Philippe Elie + * @author Graydon Hoare + * @author Robert Richter <robert.richter@amd.com> + * @author Barry Kasindorf <barry.kasindorf@amd.com> + * @author Jason Yeh <jason.yeh@amd.com> + * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> + */ + +#include <linux/oprofile.h> +#include <linux/device.h> +#include <linux/pci.h> +#include <linux/percpu.h> + +#include <asm/ptrace.h> +#include <asm/msr.h> +#include <asm/nmi.h> +#include <asm/apic.h> +#include <asm/processor.h> +#include <asm/cpufeature.h> + +#include "op_x86_model.h" +#include "op_counter.h" + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +#define NUM_VIRT_COUNTERS 32 +#else +#define NUM_VIRT_COUNTERS 0 +#endif + +#define OP_EVENT_MASK 0x0FFF +#define OP_CTR_OVERFLOW (1ULL<<31) + +#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) + +static int num_counters; +static unsigned long reset_value[OP_MAX_COUNTER]; + +#define IBS_FETCH_SIZE 6 +#define IBS_OP_SIZE 12 + +static u32 ibs_caps; + +struct ibs_config { + unsigned long op_enabled; + unsigned long fetch_enabled; + unsigned long max_cnt_fetch; + unsigned long max_cnt_op; + unsigned long rand_en; + unsigned long dispatched_ops; + unsigned long branch_target; +}; + +struct ibs_state { + u64 ibs_op_ctl; + int branch_target; + unsigned long sample_size; +}; + +static struct ibs_config ibs_config; +static struct ibs_state ibs_state; + +/* + * IBS randomization macros + */ +#define IBS_RANDOM_BITS 12 +#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1) +#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5)) + +/* + * 16-bit Linear Feedback Shift Register (LFSR) + * + * 16 14 13 11 + * Feedback polynomial = X + X + X + X + 1 + */ +static unsigned int lfsr_random(void) +{ + static unsigned int lfsr_value = 0xF00D; + unsigned int bit; + + /* Compute next bit to shift in */ + bit = ((lfsr_value >> 0) ^ + (lfsr_value >> 2) ^ + (lfsr_value >> 3) ^ + (lfsr_value >> 5)) & 0x0001; + + /* Advance to next register value */ + lfsr_value = (lfsr_value >> 1) | (bit << 15); + + return lfsr_value; +} + +/* + * IBS software randomization + * + * The IBS periodic op counter is randomized in software. The lower 12 + * bits of the 20 bit counter are randomized. IbsOpCurCnt is + * initialized with a 12 bit random value. + */ +static inline u64 op_amd_randomize_ibs_op(u64 val) +{ + unsigned int random = lfsr_random(); + + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) + /* + * Work around if the hw can not write to IbsOpCurCnt + * + * Randomize the lower 8 bits of the 16 bit + * IbsOpMaxCnt [15:0] value in the range of -128 to + * +127 by adding/subtracting an offset to the + * maximum count (IbsOpMaxCnt). 
+ * + * To avoid over or underflows and protect upper bits + * starting at bit 16, the initial value for + * IbsOpMaxCnt must fit in the range from 0x0081 to + * 0xff80. + */ + val += (s8)(random >> 4); + else + val |= (u64)(random & IBS_RANDOM_MASK) << 32; + + return val; +} + +static inline void +op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + u64 val, ctl; + struct op_entry entry; + + if (!ibs_caps) + return; + + if (ibs_config.fetch_enabled) { + rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); + if (ctl & IBS_FETCH_VAL) { + rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); + oprofile_write_reserve(&entry, regs, val, + IBS_FETCH_CODE, IBS_FETCH_SIZE); + oprofile_add_data64(&entry, val); + oprofile_add_data64(&entry, ctl); + rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); + oprofile_add_data64(&entry, val); + oprofile_write_commit(&entry); + + /* reenable the IRQ */ + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT); + ctl |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); + } + } + + if (ibs_config.op_enabled) { + rdmsrl(MSR_AMD64_IBSOPCTL, ctl); + if (ctl & IBS_OP_VAL) { + rdmsrl(MSR_AMD64_IBSOPRIP, val); + oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, + ibs_state.sample_size); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA2, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA3, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSDCLINAD, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); + oprofile_add_data64(&entry, val); + if (ibs_state.branch_target) { + rdmsrl(MSR_AMD64_IBSBRTARGET, val); + oprofile_add_data(&entry, (unsigned long)val); + } + oprofile_write_commit(&entry); + + /* reenable the IRQ */ + ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); + wrmsrl(MSR_AMD64_IBSOPCTL, ctl); + } + } +} + +static inline void op_amd_start_ibs(void) +{ + u64 val; + + if (!ibs_caps) + return; + + memset(&ibs_state, 0, sizeof(ibs_state)); + + /* + * Note: Since the max count settings may out of range we + * write back the actual used values so that userland can read + * it. + */ + + if (ibs_config.fetch_enabled) { + val = ibs_config.max_cnt_fetch >> 4; + val = min(val, IBS_FETCH_MAX_CNT); + ibs_config.max_cnt_fetch = val << 4; + val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; + val |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, val); + } + + if (ibs_config.op_enabled) { + val = ibs_config.max_cnt_op >> 4; + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { + /* + * IbsOpCurCnt not supported. See + * op_amd_randomize_ibs_op() for details. + */ + val = clamp(val, 0x0081ULL, 0xFF80ULL); + ibs_config.max_cnt_op = val << 4; + } else { + /* + * The start value is randomized with a + * positive offset, we need to compensate it + * with the half of the randomized range. Also + * avoid underflows. + */ + val += IBS_RANDOM_MAXCNT_OFFSET; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + val = min(val, IBS_OP_MAX_CNT_EXT); + else + val = min(val, IBS_OP_MAX_CNT); + ibs_config.max_cnt_op = + (val - IBS_RANDOM_MAXCNT_OFFSET) << 4; + } + val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT); + val |= ibs_config.dispatched_ops ? 
IBS_OP_CNT_CTL : 0; + val |= IBS_OP_ENABLE; + ibs_state.ibs_op_ctl = val; + ibs_state.sample_size = IBS_OP_SIZE; + if (ibs_config.branch_target) { + ibs_state.branch_target = 1; + ibs_state.sample_size++; + } + val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); + wrmsrl(MSR_AMD64_IBSOPCTL, val); + } +} + +static void op_amd_stop_ibs(void) +{ + if (!ibs_caps) + return; + + if (ibs_config.fetch_enabled) + /* clear max count and enable */ + wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); + + if (ibs_config.op_enabled) + /* clear max count and enable */ + wrmsrl(MSR_AMD64_IBSOPCTL, 0); +} + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* enable active counters */ + for (i = 0; i < num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } +} + +#endif + +/* functions for op_amd_spec */ + +static void op_amd_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < num_counters; ++i) { + if (!msrs->counters[i].addr) + continue; + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } +} + +static int op_amd_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < num_counters; i++) { + if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + goto fail; + if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + goto fail; + } + /* both registers must be reserved */ + if (num_counters == AMD64_NUM_COUNTERS_CORE) { + msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); + msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); + } else { + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + } + continue; + fail: + if (!counter_config[i].enabled) + continue; + op_x86_warn_reserved(i); + op_amd_shutdown(msrs); + return -EBUSY; + } + + return 0; +} + +static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* setup reset_value */ + for (i = 0; i < OP_MAX_COUNTER; ++i) { + if (counter_config[i].enabled + && msrs->counters[op_x86_virt_to_phys(i)].addr) + reset_value[i] = counter_config[i].count; + else + reset_value[i] = 0; + } + + /* clear all counters */ + for (i = 0; i < num_counters; ++i) { + if (!msrs->controls[i].addr) + continue; + rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + op_x86_warn_in_use(i); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); + /* + * avoid a false detection of ctr overflows in NMI + * handler + */ + wrmsrl(msrs->counters[i].addr, -1LL); + } + + /* enable active counters */ + for (i = 0; i < num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) + continue; + + /* setup counter registers */ + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); + + /* setup control registers */ + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } +} + +static int op_amd_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + for (i = 0; i < num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if 
(!reset_value[virt]) + continue; + rdmsrl(msrs->counters[i].addr, val); + /* bit is clear if overflowed: */ + if (val & OP_CTR_OVERFLOW) + continue; + oprofile_add_sample(regs, virt); + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); + } + + op_amd_handle_ibs(regs, msrs); + + /* See op_model_ppro.c */ + return 1; +} + +static void op_amd_start(struct op_msrs const * const msrs) +{ + u64 val; + int i; + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[op_x86_phys_to_virt(i)]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(msrs->controls[i].addr, val); + } + + op_amd_start_ibs(); +} + +static void op_amd_stop(struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* + * Subtle: stop on all counters to avoid race with setting our + * pm callback + */ + for (i = 0; i < num_counters; ++i) { + if (!reset_value[op_x86_phys_to_virt(i)]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(msrs->controls[i].addr, val); + } + + op_amd_stop_ibs(); +} + +/* + * check and reserve APIC extended interrupt LVT offset for IBS if + * available + */ + +static void init_ibs(void) +{ + ibs_caps = get_ibs_caps(); + + if (!ibs_caps) + return; + + printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); +} + +static int (*create_arch_files)(struct dentry *root); + +static int setup_ibs_files(struct dentry *root) +{ + struct dentry *dir; + int ret = 0; + + /* architecture specific files */ + if (create_arch_files) + ret = create_arch_files(root); + + if (ret) + return ret; + + if (!ibs_caps) + return ret; + + /* model specific files */ + + /* setup some reasonable defaults */ + memset(&ibs_config, 0, sizeof(ibs_config)); + ibs_config.max_cnt_fetch = 250000; + ibs_config.max_cnt_op = 250000; + + if (ibs_caps & IBS_CAPS_FETCHSAM) { + dir = oprofilefs_mkdir(root, "ibs_fetch"); + oprofilefs_create_ulong(dir, "enable", + &ibs_config.fetch_enabled); + oprofilefs_create_ulong(dir, "max_count", + &ibs_config.max_cnt_fetch); + oprofilefs_create_ulong(dir, "rand_enable", + &ibs_config.rand_en); + } + + if (ibs_caps & IBS_CAPS_OPSAM) { + dir = oprofilefs_mkdir(root, "ibs_op"); + oprofilefs_create_ulong(dir, "enable", + &ibs_config.op_enabled); + oprofilefs_create_ulong(dir, "max_count", + &ibs_config.max_cnt_op); + if (ibs_caps & IBS_CAPS_OPCNT) + oprofilefs_create_ulong(dir, "dispatched_ops", + &ibs_config.dispatched_ops); + if (ibs_caps & IBS_CAPS_BRNTRGT) + oprofilefs_create_ulong(dir, "branch_target", + &ibs_config.branch_target); + } + + return 0; +} + +struct op_x86_model_spec op_amd_spec; + +static int op_amd_init(struct oprofile_operations *ops) +{ + init_ibs(); + create_arch_files = ops->create_files; + ops->create_files = setup_ibs_files; + + if (boot_cpu_data.x86 == 0x15) { + num_counters = AMD64_NUM_COUNTERS_CORE; + } else { + num_counters = AMD64_NUM_COUNTERS; + } + + op_amd_spec.num_counters = num_counters; + op_amd_spec.num_controls = num_counters; + op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS); + + return 0; +} + +struct op_x86_model_spec op_amd_spec = { + /* num_counters/num_controls filled in at runtime */ + .reserved = MSR_AMD_EVENTSEL_RESERVED, + .event_mask = OP_EVENT_MASK, + .init = op_amd_init, + .fill_in_addresses = &op_amd_fill_in_addresses, + .setup_ctrs = &op_amd_setup_ctrs, + .check_ctrs = &op_amd_check_ctrs, + .start = &op_amd_start, + .stop = &op_amd_stop, + .shutdown = &op_amd_shutdown, +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + 
.switch_ctrl = &op_mux_switch_ctrl, +#endif +}; diff --git a/kernel/arch/x86/oprofile/op_model_p4.c b/kernel/arch/x86/oprofile/op_model_p4.c new file mode 100644 index 000000000..ad1d91f47 --- /dev/null +++ b/kernel/arch/x86/oprofile/op_model_p4.c @@ -0,0 +1,723 @@ +/** + * @file op_model_p4.c + * P4 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author Graydon Hoare + */ + +#include <linux/oprofile.h> +#include <linux/smp.h> +#include <linux/ptrace.h> +#include <asm/nmi.h> +#include <asm/msr.h> +#include <asm/fixmap.h> +#include <asm/apic.h> + + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_EVENTS 39 + +#define NUM_COUNTERS_NON_HT 8 +#define NUM_ESCRS_NON_HT 45 +#define NUM_CCCRS_NON_HT 18 +#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT) + +#define NUM_COUNTERS_HT2 4 +#define NUM_ESCRS_HT2 23 +#define NUM_CCCRS_HT2 9 +#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) + +#define OP_CTR_OVERFLOW (1ULL<<31) + +static unsigned int num_counters = NUM_COUNTERS_NON_HT; +static unsigned int num_controls = NUM_CONTROLS_NON_HT; + +/* this has to be checked dynamically since the + hyper-threadedness of a chip is discovered at + kernel boot-time. */ +static inline void setup_num_counters(void) +{ +#ifdef CONFIG_SMP + if (smp_num_siblings == 2) { + num_counters = NUM_COUNTERS_HT2; + num_controls = NUM_CONTROLS_HT2; + } +#endif +} + +static inline int addr_increment(void) +{ +#ifdef CONFIG_SMP + return smp_num_siblings == 2 ? 2 : 1; +#else + return 1; +#endif +} + + +/* tables to simulate simplified hardware view of p4 registers */ +struct p4_counter_binding { + int virt_counter; + int counter_address; + int cccr_address; +}; + +struct p4_event_binding { + int escr_select; /* value to put in CCCR */ + int event_select; /* value to put in ESCR */ + struct { + int virt_counter; /* for this counter... */ + int escr_address; /* use this ESCR */ + } bindings[2]; +}; + +/* nb: these CTR_* defines are a duplicate of defines in + event/i386.p4*events. */ + + +#define CTR_BPU_0 (1 << 0) +#define CTR_MS_0 (1 << 1) +#define CTR_FLAME_0 (1 << 2) +#define CTR_IQ_4 (1 << 3) +#define CTR_BPU_2 (1 << 4) +#define CTR_MS_2 (1 << 5) +#define CTR_FLAME_2 (1 << 6) +#define CTR_IQ_5 (1 << 7) + +static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = { + { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, + { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, + { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, + { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 }, + { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 }, + { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 }, + { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 }, + { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } +}; + +#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT) + +/* p4 event codes in libop/op_event.h are indices into this table. 
*/ + +static struct p4_event_binding p4_events[NUM_EVENTS] = { + + { /* BRANCH_RETIRED */ + 0x05, 0x06, + { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, + {CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* MISPRED_BRANCH_RETIRED */ + 0x04, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* TC_DELIVER_MODE */ + 0x01, 0x01, + { { CTR_MS_0, MSR_P4_TC_ESCR0}, + { CTR_MS_2, MSR_P4_TC_ESCR1} } + }, + + { /* BPU_FETCH_REQUEST */ + 0x00, 0x03, + { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, + { CTR_BPU_2, MSR_P4_BPU_ESCR1} } + }, + + { /* ITLB_REFERENCE */ + 0x03, 0x18, + { { CTR_BPU_0, MSR_P4_ITLB_ESCR0}, + { CTR_BPU_2, MSR_P4_ITLB_ESCR1} } + }, + + { /* MEMORY_CANCEL */ + 0x05, 0x02, + { { CTR_FLAME_0, MSR_P4_DAC_ESCR0}, + { CTR_FLAME_2, MSR_P4_DAC_ESCR1} } + }, + + { /* MEMORY_COMPLETE */ + 0x02, 0x08, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* LOAD_PORT_REPLAY */ + 0x02, 0x04, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* STORE_PORT_REPLAY */ + 0x02, 0x05, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* MOB_LOAD_REPLAY */ + 0x02, 0x03, + { { CTR_BPU_0, MSR_P4_MOB_ESCR0}, + { CTR_BPU_2, MSR_P4_MOB_ESCR1} } + }, + + { /* PAGE_WALK_TYPE */ + 0x04, 0x01, + { { CTR_BPU_0, MSR_P4_PMH_ESCR0}, + { CTR_BPU_2, MSR_P4_PMH_ESCR1} } + }, + + { /* BSQ_CACHE_REFERENCE */ + 0x07, 0x0c, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + { CTR_BPU_2, MSR_P4_BSU_ESCR1} } + }, + + { /* IOQ_ALLOCATION */ + 0x06, 0x03, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { 0, 0 } } + }, + + { /* IOQ_ACTIVE_ENTRIES */ + 0x06, 0x1a, + { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, + { 0, 0 } } + }, + + { /* FSB_DATA_ACTIVITY */ + 0x06, 0x17, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* BSQ_ALLOCATION */ + 0x07, 0x05, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + { 0, 0 } } + }, + + { /* BSQ_ACTIVE_ENTRIES */ + 0x07, 0x06, + { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, + { 0, 0 } } + }, + + { /* X87_ASSIST */ + 0x05, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* SSE_INPUT_ASSIST */ + 0x01, 0x34, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_SP_UOP */ + 0x01, 0x08, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_DP_UOP */ + 0x01, 0x0c, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_SP_UOP */ + 0x01, 0x0a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_DP_UOP */ + 0x01, 0x0e, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 64BIT_MMX_UOP */ + 0x01, 0x02, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 128BIT_MMX_UOP */ + 0x01, 0x1a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_FP_UOP */ + 0x01, 0x04, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_SIMD_MOVES_UOP */ + 0x01, 0x2e, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* MACHINE_CLEAR */ + 0x05, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* GLOBAL_POWER_EVENTS */ + 0x06, 0x13 /* older manual says 0x05, newer 0x13 */, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* TC_MS_XFER */ + 0x00, 0x05, + 
{ { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* UOP_QUEUE_WRITES */ + 0x00, 0x09, + { { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* FRONT_END_EVENT */ + 0x05, 0x08, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* EXECUTION_EVENT */ + 0x05, 0x0c, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* REPLAY_EVENT */ + 0x05, 0x09, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* INSTR_RETIRED */ + 0x04, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOPS_RETIRED */ + 0x04, 0x01, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOP_TYPE */ + 0x02, 0x02, + { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, + { CTR_IQ_5, MSR_P4_RAT_ESCR1} } + }, + + { /* RETIRED_MISPRED_BRANCH_TYPE */ + 0x02, 0x05, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + }, + + { /* RETIRED_BRANCH_TYPE */ + 0x02, 0x04, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + } +}; + + +#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7) + +#define ESCR_RESERVED_BITS 0x80000003 +#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS) +#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2)) +#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3)) +#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1))) +#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) +#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) +#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) + +#define CCCR_RESERVED_BITS 0x38030FFF +#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) +#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000) +#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13)) +#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26)) +#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) +#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) +#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) +#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) +#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) + + +/* this assigns a "stagger" to the current CPU, which is used throughout + the code in this module as an extra array offset, to select the "even" + or "odd" part of all the divided resources. */ +static unsigned int get_stagger(void) +{ +#ifdef CONFIG_SMP + int cpu = smp_processor_id(); + return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); +#endif + return 0; +} + + +/* finally, mediate access to a real hardware counter + by passing a "virtual" counter numer to this macro, + along with your stagger setting. */ +#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger))) + +static unsigned long reset_value[NUM_COUNTERS_NON_HT]; + +static void p4_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < num_counters; ++i) { + if (msrs->counters[i].addr) + release_perfctr_nmi(msrs->counters[i].addr); + } + /* + * some of the control registers are specially reserved in + * conjunction with the counter registers (hence the starting offset). + * This saves a few bits. 
+ */ + for (i = num_counters; i < num_controls; ++i) { + if (msrs->controls[i].addr) + release_evntsel_nmi(msrs->controls[i].addr); + } +} + +static int p4_fill_in_addresses(struct op_msrs * const msrs) +{ + unsigned int i; + unsigned int addr, cccraddr, stag; + + setup_num_counters(); + stag = get_stagger(); + + /* the counter & cccr registers we pay attention to */ + for (i = 0; i < num_counters; ++i) { + addr = p4_counters[VIRT_CTR(stag, i)].counter_address; + cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; + if (reserve_perfctr_nmi(addr)) { + msrs->counters[i].addr = addr; + msrs->controls[i].addr = cccraddr; + } + } + + /* 43 ESCR registers in three or four discontiguous group */ + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) { + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + + /* no IQ_ESCR0/1 on some models, we save a seconde time BSU_ESCR0/1 + * to avoid special case in nmi_{save|restore}_registers() */ + if (boot_cpu_data.x86_model >= 0x3) { + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) { + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + } else { + for (addr = MSR_P4_IQ_ESCR0 + stag; + addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) { + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + } + + for (addr = MSR_P4_RAT_ESCR0 + stag; + addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + + for (addr = MSR_P4_MS_ESCR0 + stag; + addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + + for (addr = MSR_P4_IX_ESCR0 + stag; + addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + + /* there are 2 remaining non-contiguously located ESCRs */ + + if (num_counters == NUM_COUNTERS_NON_HT) { + /* standard non-HT CPUs handle both remaining ESCRs*/ + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4)) + msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; + + } else if (stag == 0) { + /* HT CPUs give the first remainder to the even thread, as + the 32nd control register */ + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4)) + msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; + + } else { + /* and two copies of the second to the odd thread, + for the 22st and 23nd control registers */ + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) { + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + } + } + + for (i = 0; i < num_counters; ++i) { + if (!counter_config[i].enabled) + continue; + if (msrs->controls[i].addr) + continue; + op_x86_warn_reserved(i); + p4_shutdown(msrs); + return -EBUSY; + } + + return 0; +} + + +static void pmc_setup_one_p4_counter(unsigned int ctr) +{ + int i; + int const maxbind = 2; + unsigned int cccr = 0; + unsigned int escr = 0; + unsigned int high = 0; + unsigned int counter_bit; + struct p4_event_binding *ev = NULL; + unsigned int stag; + + stag = get_stagger(); + + /* convert from counter *number* to counter *bit* */ + counter_bit = 1 << VIRT_CTR(stag, ctr); + + /* find our event binding structure. 
*/ + if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { + printk(KERN_ERR + "oprofile: P4 event code 0x%lx out of range\n", + counter_config[ctr].event); + return; + } + + ev = &(p4_events[counter_config[ctr].event - 1]); + + for (i = 0; i < maxbind; i++) { + if (ev->bindings[i].virt_counter & counter_bit) { + + /* modify ESCR */ + rdmsr(ev->bindings[i].escr_address, escr, high); + ESCR_CLEAR(escr); + if (stag == 0) { + ESCR_SET_USR_0(escr, counter_config[ctr].user); + ESCR_SET_OS_0(escr, counter_config[ctr].kernel); + } else { + ESCR_SET_USR_1(escr, counter_config[ctr].user); + ESCR_SET_OS_1(escr, counter_config[ctr].kernel); + } + ESCR_SET_EVENT_SELECT(escr, ev->event_select); + ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); + wrmsr(ev->bindings[i].escr_address, escr, high); + + /* modify CCCR */ + rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); + CCCR_CLEAR(cccr); + CCCR_SET_REQUIRED_BITS(cccr); + CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); + if (stag == 0) + CCCR_SET_PMI_OVF_0(cccr); + else + CCCR_SET_PMI_OVF_1(cccr); + wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); + return; + } + } + + printk(KERN_ERR + "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", + counter_config[ctr].event, stag, ctr); +} + + +static void p4_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + unsigned int i; + unsigned int low, high; + unsigned int stag; + + stag = get_stagger(); + + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (!MISC_PMC_ENABLED_P(low)) { + printk(KERN_ERR "oprofile: P4 PMC not available\n"); + return; + } + + /* clear the cccrs we will use */ + for (i = 0; i < num_counters; i++) { + if (unlikely(!msrs->controls[i].addr)) + continue; + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + CCCR_CLEAR(low); + CCCR_SET_REQUIRED_BITS(low); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + } + + /* clear all escrs (including those outside our concern) */ + for (i = num_counters; i < num_controls; i++) { + if (unlikely(!msrs->controls[i].addr)) + continue; + wrmsr(msrs->controls[i].addr, 0, 0); + } + + /* setup all counters */ + for (i = 0; i < num_counters; ++i) { + if (counter_config[i].enabled && msrs->controls[i].addr) { + reset_value[i] = counter_config[i].count; + pmc_setup_one_p4_counter(i); + wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, + -(u64)counter_config[i].count); + } else { + reset_value[i] = 0; + } + } +} + + +static int p4_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned long ctr, low, high, stag, real; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + + if (!reset_value[i]) + continue; + + /* + * there is some eccentricity in the hardware which + * requires that we perform 2 extra corrections: + * + * - check both the CCCR:OVF flag for overflow and the + * counter high bit for un-flagged overflows. + * + * - write the counter back twice to ensure it gets + * updated properly. + * + * the former seems to be related to extra NMIs happening + * during the current NMI; the latter is reported as errata + * N15 in intel doc 249199-029, pentium 4 specification + * update, though their suggested work-around does not + * appear to solve the problem. 
+ */ + + real = VIRT_CTR(stag, i); + + rdmsr(p4_counters[real].cccr_address, low, high); + rdmsr(p4_counters[real].counter_address, ctr, high); + if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { + oprofile_add_sample(regs, i); + wrmsrl(p4_counters[real].counter_address, + -(u64)reset_value[i]); + CCCR_CLEAR_OVF(low); + wrmsr(p4_counters[real].cccr_address, low, high); + wrmsrl(p4_counters[real].counter_address, + -(u64)reset_value[i]); + } + } + + /* P4 quirk: you have to re-unmask the apic vector */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + + /* See op_model_ppro.c */ + return 1; +} + + +static void p4_start(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + CCCR_SET_ENABLE(low); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + } +} + + +static void p4_stop(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + CCCR_SET_DISABLE(low); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + } +} + +#ifdef CONFIG_SMP +struct op_x86_model_spec op_p4_ht2_spec = { + .num_counters = NUM_COUNTERS_HT2, + .num_controls = NUM_CONTROLS_HT2, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop, + .shutdown = &p4_shutdown +}; +#endif + +struct op_x86_model_spec op_p4_spec = { + .num_counters = NUM_COUNTERS_NON_HT, + .num_controls = NUM_CONTROLS_NON_HT, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop, + .shutdown = &p4_shutdown +}; diff --git a/kernel/arch/x86/oprofile/op_model_ppro.c b/kernel/arch/x86/oprofile/op_model_ppro.c new file mode 100644 index 000000000..d90528ea5 --- /dev/null +++ b/kernel/arch/x86/oprofile/op_model_ppro.c @@ -0,0 +1,245 @@ +/* + * @file op_model_ppro.h + * Family 6 perfmon and architectural perfmon MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Copyright 2008 Intel Corporation + * @remark Read the file COPYING + * + * @author John Levon + * @author Philippe Elie + * @author Graydon Hoare + * @author Andi Kleen + * @author Robert Richter <robert.richter@amd.com> + */ + +#include <linux/oprofile.h> +#include <linux/slab.h> +#include <asm/ptrace.h> +#include <asm/msr.h> +#include <asm/apic.h> +#include <asm/nmi.h> + +#include "op_x86_model.h" +#include "op_counter.h" + +static int num_counters = 2; +static int counter_width = 32; + +#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) + +static u64 reset_value[OP_MAX_COUNTER]; + +static void ppro_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < num_counters; ++i) { + if (!msrs->counters[i].addr) + continue; + release_perfctr_nmi(MSR_P6_PERFCTR0 + i); + release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); + } +} + +static int ppro_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < num_counters; i++) { + if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) + goto fail; + if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) { + release_perfctr_nmi(MSR_P6_PERFCTR0 + i); + goto fail; + } + /* both registers must be 
reserved */ + msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; + msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; + continue; + fail: + if (!counter_config[i].enabled) + continue; + op_x86_warn_reserved(i); + ppro_shutdown(msrs); + return -EBUSY; + } + + return 0; +} + + +static void ppro_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + if (cpu_has_arch_perfmon) { + union cpuid10_eax eax; + eax.full = cpuid_eax(0xa); + + /* + * For Core2 (family 6, model 15), don't reset the + * counter width: + */ + if (!(eax.split.version_id == 0 && + __this_cpu_read(cpu_info.x86) == 6 && + __this_cpu_read(cpu_info.x86_model) == 15)) { + + if (counter_width < eax.split.bit_width) + counter_width = eax.split.bit_width; + } + } + + /* clear all counters */ + for (i = 0; i < num_counters; ++i) { + if (!msrs->controls[i].addr) + continue; + rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + op_x86_warn_in_use(i); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); + /* + * avoid a false detection of ctr overflows in NMI * + * handler + */ + wrmsrl(msrs->counters[i].addr, -1LL); + } + + /* enable active counters */ + for (i = 0; i < num_counters; ++i) { + if (counter_config[i].enabled && msrs->counters[i].addr) { + reset_value[i] = counter_config[i].count; + wrmsrl(msrs->counters[i].addr, -reset_value[i]); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[i]); + wrmsrl(msrs->controls[i].addr, val); + } else { + reset_value[i] = 0; + } + } +} + + +static int ppro_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + rdmsrl(msrs->counters[i].addr, val); + if (val & (1ULL << (counter_width - 1))) + continue; + oprofile_add_sample(regs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); + } + + /* Only P6 based Pentium M need to re-unmask the apic vector but it + * doesn't hurt other P6 variant */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + + /* We can't work out if we really handled an interrupt. We + * might have caught a *second* counter just after overflowing + * the interrupt for this counter then arrives + * and we don't find a counter that's overflowed, so we + * would return 0 and get dazed + confused. Instead we always + * assume we found an overflow. This sucks. + */ + return 1; +} + + +static void ppro_start(struct op_msrs const * const msrs) +{ + u64 val; + int i; + + for (i = 0; i < num_counters; ++i) { + if (reset_value[i]) { + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(msrs->controls[i].addr, val); + } + } +} + + +static void ppro_stop(struct op_msrs const * const msrs) +{ + u64 val; + int i; + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(msrs->controls[i].addr, val); + } +} + +struct op_x86_model_spec op_ppro_spec = { + .num_counters = 2, + .num_controls = 2, + .reserved = MSR_PPRO_EVENTSEL_RESERVED, + .fill_in_addresses = &ppro_fill_in_addresses, + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop, + .shutdown = &ppro_shutdown +}; + +/* + * Architectural performance monitoring. 
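+ * The op_arch_perfmon_spec defined below reuses the ppro_* handlers
+ * above; only the number (and width) of counters is discovered at
+ * runtime rather than hard-coded.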
+ * + * Newer Intel CPUs (Core1+) have support for architectural + * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details. + * The advantage of this is that it can be done without knowing about + * the specific CPU. + */ + +static void arch_perfmon_setup_counters(void) +{ + union cpuid10_eax eax; + + eax.full = cpuid_eax(0xa); + + /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ + if (eax.split.version_id == 0 && __this_cpu_read(cpu_info.x86) == 6 && + __this_cpu_read(cpu_info.x86_model) == 15) { + eax.split.version_id = 2; + eax.split.num_counters = 2; + eax.split.bit_width = 40; + } + + num_counters = min((int)eax.split.num_counters, OP_MAX_COUNTER); + + op_arch_perfmon_spec.num_counters = num_counters; + op_arch_perfmon_spec.num_controls = num_counters; +} + +static int arch_perfmon_init(struct oprofile_operations *ignore) +{ + arch_perfmon_setup_counters(); + return 0; +} + +struct op_x86_model_spec op_arch_perfmon_spec = { + .reserved = MSR_PPRO_EVENTSEL_RESERVED, + .init = &arch_perfmon_init, + /* num_counters/num_controls filled in at runtime */ + .fill_in_addresses = &ppro_fill_in_addresses, + /* user space does the cpuid check for available events */ + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop, + .shutdown = &ppro_shutdown +}; diff --git a/kernel/arch/x86/oprofile/op_x86_model.h b/kernel/arch/x86/oprofile/op_x86_model.h new file mode 100644 index 000000000..71e8a6733 --- /dev/null +++ b/kernel/arch/x86/oprofile/op_x86_model.h @@ -0,0 +1,90 @@ +/** + * @file op_x86_model.h + * interface to x86 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author Graydon Hoare + * @author Robert Richter <robert.richter@amd.com> + */ + +#ifndef OP_X86_MODEL_H +#define OP_X86_MODEL_H + +#include <asm/types.h> +#include <asm/perf_event.h> + +struct op_msr { + unsigned long addr; + u64 saved; +}; + +struct op_msrs { + struct op_msr *counters; + struct op_msr *controls; + struct op_msr *multiplex; +}; + +struct pt_regs; + +struct oprofile_operations; + +/* The model vtable abstracts the differences between + * various x86 CPU models' perfctr support. + */ +struct op_x86_model_spec { + unsigned int num_counters; + unsigned int num_controls; + unsigned int num_virt_counters; + u64 reserved; + u16 event_mask; + int (*init)(struct oprofile_operations *ops); + int (*fill_in_addresses)(struct op_msrs * const msrs); + void (*setup_ctrs)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); + int (*check_ctrs)(struct pt_regs * const regs, + struct op_msrs const * const msrs); + void (*start)(struct op_msrs const * const msrs); + void (*stop)(struct op_msrs const * const msrs); + void (*shutdown)(struct op_msrs const * const msrs); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + void (*switch_ctrl)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); +#endif +}; + +struct op_counter_config; + +static inline void op_x86_warn_in_use(int counter) +{ + /* + * The warning indicates an already running counter. If + * oprofile doesn't collect data, then try using a different + * performance counter on your platform to monitor the desired + * event. Delete counter #%d from the desired event by editing + * the /usr/share/oprofile/%s/<cpu>/events file. If the event + * cannot be monitored by any other counter, contact your + * hardware or BIOS vendor. 
+ */
+	pr_warning("oprofile: counter #%d on cpu #%d may already be used\n",
+		   counter, smp_processor_id());
+}
+
+static inline void op_x86_warn_reserved(int counter)
+{
+	pr_warning("oprofile: counter #%d is already reserved\n", counter);
+}
+
+extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
+			   struct op_counter_config *counter_config);
+extern int op_x86_phys_to_virt(int phys);
+extern int op_x86_virt_to_phys(int virt);
+
+extern struct op_x86_model_spec op_ppro_spec;
+extern struct op_x86_model_spec op_p4_spec;
+extern struct op_x86_model_spec op_p4_ht2_spec;
+extern struct op_x86_model_spec op_amd_spec;
+extern struct op_x86_model_spec op_arch_perfmon_spec;
+
+#endif /* OP_X86_MODEL_H */
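The counter programming above (both op_model_p4.c and op_model_ppro.c write -(u64)count into the perfctr MSR) relies on the counter overflowing, and raising the PMI/NMI, after exactly `count` events. The stand-alone user-space sketch below is not part of the commit; it only illustrates that arithmetic for an assumed 40-bit counter and an arbitrary sample interval of 100000 events, including the "high bit reads back clear after the wrap" test that p4_check_ctrs() uses to catch un-flagged overflows.

/* twos_complement_counter_demo.c -- illustrative only, not kernel code */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned int width = 40;               /* assumed P4 counter width */
	const uint64_t mask  = (1ULL << width) - 1;  /* counter wraps modulo 2^40 */
	const uint64_t count = 100000;               /* assumed sample interval */

	/* what wrmsrl(counter_address, -(u64)count) leaves in the counter */
	uint64_t ctr = (uint64_t)-count & mask;

	printf("initial counter value:     0x%010llx\n", (unsigned long long)ctr);

	/* after `count` events the counter has wrapped back to zero ... */
	ctr = (ctr + count) & mask;
	printf("after %llu events:         0x%010llx (overflow -> PMI)\n",
	       (unsigned long long)count, (unsigned long long)ctr);

	/* ... and the overflow bit p4_check_ctrs() tests (OP_CTR_OVERFLOW,
	 * taken as bit 31 here) is clear again, flagging the overflow even
	 * when CCCR:OVF was missed. */
	printf("high bit clear after wrap: %s\n",
	       (ctr & (1ULL << 31)) ? "no" : "yes");

	return 0;
}

Building and running it with a plain `cc twos_complement_counter_demo.c && ./a.out` prints the wrapped value and the cleared high bit; the same reasoning applies to the ppro and architectural-perfmon counters, whose width is taken from CPUID leaf 0xA instead of the 40 bits assumed above.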