From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Tue, 4 Aug 2015 12:17:53 -0700 Subject: Add the rt linux 4.1.3-rt3 as base Import the rt linux 4.1.3-rt3 as OPNFV kvm base. It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and the base is: commit 0917f823c59692d751951bf5ea699a2d1e2f26a2 Author: Sebastian Andrzej Siewior Date: Sat Jul 25 12:13:34 2015 +0200 Prepare v4.1.3-rt3 Signed-off-by: Sebastian Andrzej Siewior We lose all the git history this way and it's not good. We should apply another opnfv project repo in future. Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423 Signed-off-by: Yunhong Jiang --- kernel/arch/x86/oprofile/Makefile | 11 + kernel/arch/x86/oprofile/backtrace.c | 127 +++++ kernel/arch/x86/oprofile/init.c | 38 ++ kernel/arch/x86/oprofile/nmi_int.c | 802 +++++++++++++++++++++++++++++++ kernel/arch/x86/oprofile/op_counter.h | 30 ++ kernel/arch/x86/oprofile/op_model_amd.c | 543 +++++++++++++++++++++ kernel/arch/x86/oprofile/op_model_p4.c | 723 ++++++++++++++++++++++++++++ kernel/arch/x86/oprofile/op_model_ppro.c | 245 ++++++++++ kernel/arch/x86/oprofile/op_x86_model.h | 90 ++++ 9 files changed, 2609 insertions(+) create mode 100644 kernel/arch/x86/oprofile/Makefile create mode 100644 kernel/arch/x86/oprofile/backtrace.c create mode 100644 kernel/arch/x86/oprofile/init.c create mode 100644 kernel/arch/x86/oprofile/nmi_int.c create mode 100644 kernel/arch/x86/oprofile/op_counter.h create mode 100644 kernel/arch/x86/oprofile/op_model_amd.c create mode 100644 kernel/arch/x86/oprofile/op_model_p4.c create mode 100644 kernel/arch/x86/oprofile/op_model_ppro.c create mode 100644 kernel/arch/x86/oprofile/op_x86_model.h (limited to 'kernel/arch/x86/oprofile') diff --git a/kernel/arch/x86/oprofile/Makefile b/kernel/arch/x86/oprofile/Makefile new file mode 100644 index 000000000..1599f568f --- /dev/null +++ b/kernel/arch/x86/oprofile/Makefile @@ -0,0 +1,11 @@ 
+obj-$(CONFIG_OPROFILE) += oprofile.o + +DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ + oprof.o cpu_buffer.o buffer_sync.o \ + event_buffer.o oprofile_files.o \ + oprofilefs.o oprofile_stats.o \ + timer_int.o nmi_timer_int.o ) + +oprofile-y := $(DRIVER_OBJS) init.o backtrace.o +oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ + op_model_ppro.o op_model_p4.o diff --git a/kernel/arch/x86/oprofile/backtrace.c b/kernel/arch/x86/oprofile/backtrace.c new file mode 100644 index 000000000..4e664bdb5 --- /dev/null +++ b/kernel/arch/x86/oprofile/backtrace.c @@ -0,0 +1,127 @@ +/** + * @file backtrace.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author David Smith + */ + +#include +#include +#include +#include +#include + +#include +#include + +static int backtrace_stack(void *data, char *name) +{ + /* Yes, we want all stacks */ + return 0; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + unsigned int *depth = data; + + if ((*depth)--) + oprofile_add_trace(addr); +} + +static struct stacktrace_ops backtrace_ops = { + .stack = backtrace_stack, + .address = backtrace_address, + .walk_stack = print_context_stack, +}; + +#ifdef CONFIG_COMPAT +static struct stack_frame_ia32 * +dump_user_backtrace_32(struct stack_frame_ia32 *head) +{ + /* Also check accessibility of one struct frame_head beyond: */ + struct stack_frame_ia32 bufhead[2]; + struct stack_frame_ia32 *fp; + unsigned long bytes; + + bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); + if (bytes != 0) + return NULL; + + fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); + + oprofile_add_trace(bufhead[0].return_address); + + /* frame pointers should strictly progress back up the stack + * (towards higher addresses) */ + if (head >= fp) + return NULL; + + return fp; +} + +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + struct 
stack_frame_ia32 *head; + + /* User process is IA32 */ + if (!current || !test_thread_flag(TIF_IA32)) + return 0; + + head = (struct stack_frame_ia32 *) regs->bp; + while (depth-- && head) + head = dump_user_backtrace_32(head); + + return 1; +} + +#else +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + return 0; +} +#endif /* CONFIG_COMPAT */ + +static struct stack_frame *dump_user_backtrace(struct stack_frame *head) +{ + /* Also check accessibility of one struct frame_head beyond: */ + struct stack_frame bufhead[2]; + unsigned long bytes; + + bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead)); + if (bytes != 0) + return NULL; + + oprofile_add_trace(bufhead[0].return_address); + + /* frame pointers should strictly progress back up the stack + * (towards higher addresses) */ + if (head >= bufhead[0].next_frame) + return NULL; + + return bufhead[0].next_frame; +} + +void +x86_backtrace(struct pt_regs * const regs, unsigned int depth) +{ + struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); + + if (!user_mode(regs)) { + unsigned long stack = kernel_stack_pointer(regs); + if (depth) + dump_trace(NULL, regs, (unsigned long *)stack, 0, + &backtrace_ops, &depth); + return; + } + + if (x86_backtrace_32(regs, depth)) + return; + + while (depth-- && head) + head = dump_user_backtrace(head); +} diff --git a/kernel/arch/x86/oprofile/init.c b/kernel/arch/x86/oprofile/init.c new file mode 100644 index 000000000..9e138d00a --- /dev/null +++ b/kernel/arch/x86/oprofile/init.c @@ -0,0 +1,38 @@ +/** + * @file init.c + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + */ + +#include +#include +#include + +/* + * We support CPUs that have performance counters like the Pentium Pro + * with the NMI mode driver. 
+ */ + +#ifdef CONFIG_X86_LOCAL_APIC +extern int op_nmi_init(struct oprofile_operations *ops); +extern void op_nmi_exit(void); +#else +static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; } +static void op_nmi_exit(void) { } +#endif + +extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); + +int __init oprofile_arch_init(struct oprofile_operations *ops) +{ + ops->backtrace = x86_backtrace; + return op_nmi_init(ops); +} + +void oprofile_arch_exit(void) +{ + op_nmi_exit(); +} diff --git a/kernel/arch/x86/oprofile/nmi_int.c b/kernel/arch/x86/oprofile/nmi_int.c new file mode 100644 index 000000000..1d2e6392f --- /dev/null +++ b/kernel/arch/x86/oprofile/nmi_int.c @@ -0,0 +1,802 @@ +/** + * @file nmi_int.c + * + * @remark Copyright 2002-2009 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author Robert Richter + * @author Barry Kasindorf + * @author Jason Yeh + * @author Suravee Suthikulpanit + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "op_counter.h" +#include "op_x86_model.h" + +static struct op_x86_model_spec *model; +static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); +static DEFINE_PER_CPU(unsigned long, saved_lvtpc); + +/* must be protected with get_online_cpus()/put_online_cpus(): */ +static int nmi_enabled; +static int ctr_running; + +struct op_counter_config counter_config[OP_MAX_COUNTER]; + +/* common functions */ + +u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config) +{ + u64 val = 0; + u16 event = (u16)counter_config->event; + + val |= ARCH_PERFMON_EVENTSEL_INT; + val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; + val |= counter_config->kernel ? 
ARCH_PERFMON_EVENTSEL_OS : 0; + val |= (counter_config->unit_mask & 0xFF) << 8; + counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV | + ARCH_PERFMON_EVENTSEL_EDGE | + ARCH_PERFMON_EVENTSEL_CMASK); + val |= counter_config->extra; + event &= model->event_mask ? model->event_mask : 0xFF; + val |= event & 0xFF; + val |= (u64)(event & 0x0F00) << 24; + + return val; +} + + +static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs) +{ + if (ctr_running) + model->check_ctrs(regs, this_cpu_ptr(&cpu_msrs)); + else if (!nmi_enabled) + return NMI_DONE; + else + model->stop(this_cpu_ptr(&cpu_msrs)); + return NMI_HANDLED; +} + +static void nmi_cpu_save_registers(struct op_msrs *msrs) +{ + struct op_msr *counters = msrs->counters; + struct op_msr *controls = msrs->controls; + unsigned int i; + + for (i = 0; i < model->num_counters; ++i) { + if (counters[i].addr) + rdmsrl(counters[i].addr, counters[i].saved); + } + + for (i = 0; i < model->num_controls; ++i) { + if (controls[i].addr) + rdmsrl(controls[i].addr, controls[i].saved); + } +} + +static void nmi_cpu_start(void *dummy) +{ + struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs); + if (!msrs->controls) + WARN_ON_ONCE(1); + else + model->start(msrs); +} + +static int nmi_start(void) +{ + get_online_cpus(); + ctr_running = 1; + /* make ctr_running visible to the nmi handler: */ + smp_mb(); + on_each_cpu(nmi_cpu_start, NULL, 1); + put_online_cpus(); + return 0; +} + +static void nmi_cpu_stop(void *dummy) +{ + struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs); + if (!msrs->controls) + WARN_ON_ONCE(1); + else + model->stop(msrs); +} + +static void nmi_stop(void) +{ + get_online_cpus(); + on_each_cpu(nmi_cpu_stop, NULL, 1); + ctr_running = 0; + put_online_cpus(); +} + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static DEFINE_PER_CPU(int, switch_index); + +static inline int has_mux(void) +{ + return !!model->switch_ctrl; +} + +inline int op_x86_phys_to_virt(int phys) +{ + return __this_cpu_read(switch_index) 
+ phys; +} + +inline int op_x86_virt_to_phys(int virt) +{ + return virt % model->num_counters; +} + +static void nmi_shutdown_mux(void) +{ + int i; + + if (!has_mux()) + return; + + for_each_possible_cpu(i) { + kfree(per_cpu(cpu_msrs, i).multiplex); + per_cpu(cpu_msrs, i).multiplex = NULL; + per_cpu(switch_index, i) = 0; + } +} + +static int nmi_setup_mux(void) +{ + size_t multiplex_size = + sizeof(struct op_msr) * model->num_virt_counters; + int i; + + if (!has_mux()) + return 1; + + for_each_possible_cpu(i) { + per_cpu(cpu_msrs, i).multiplex = + kzalloc(multiplex_size, GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).multiplex) + return 0; + } + + return 1; +} + +static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) +{ + int i; + struct op_msr *multiplex = msrs->multiplex; + + if (!has_mux()) + return; + + for (i = 0; i < model->num_virt_counters; ++i) { + if (counter_config[i].enabled) { + multiplex[i].saved = -(u64)counter_config[i].count; + } else { + multiplex[i].saved = 0; + } + } + + per_cpu(switch_index, cpu) = 0; +} + +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *counters = msrs->counters; + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (counters[i].addr) + rdmsrl(counters[i].addr, multiplex[virt].saved); + } +} + +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *counters = msrs->counters; + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (counters[i].addr) + wrmsrl(counters[i].addr, multiplex[virt].saved); + } +} + +static void nmi_cpu_switch(void *dummy) +{ + int cpu = smp_processor_id(); + int si = per_cpu(switch_index, cpu); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + + nmi_cpu_stop(NULL); + nmi_cpu_save_mpx_registers(msrs); + + /* move to next set */ + si += 
model->num_counters; + if ((si >= model->num_virt_counters) || (counter_config[si].count == 0)) + per_cpu(switch_index, cpu) = 0; + else + per_cpu(switch_index, cpu) = si; + + model->switch_ctrl(model, msrs); + nmi_cpu_restore_mpx_registers(msrs); + + nmi_cpu_start(NULL); +} + + +/* + * Quick check to see if multiplexing is necessary. + * The check should be sufficient since counters are used + * in order. + */ +static int nmi_multiplex_on(void) +{ + return counter_config[model->num_counters].count ? 0 : -EINVAL; +} + +static int nmi_switch_event(void) +{ + if (!has_mux()) + return -ENOSYS; /* not implemented */ + if (nmi_multiplex_on() < 0) + return -EINVAL; /* not necessary */ + + get_online_cpus(); + if (ctr_running) + on_each_cpu(nmi_cpu_switch, NULL, 1); + put_online_cpus(); + + return 0; +} + +static inline void mux_init(struct oprofile_operations *ops) +{ + if (has_mux()) + ops->switch_events = nmi_switch_event; +} + +static void mux_clone(int cpu) +{ + if (!has_mux()) + return; + + memcpy(per_cpu(cpu_msrs, cpu).multiplex, + per_cpu(cpu_msrs, 0).multiplex, + sizeof(struct op_msr) * model->num_virt_counters); +} + +#else + +inline int op_x86_phys_to_virt(int phys) { return phys; } +inline int op_x86_virt_to_phys(int virt) { return virt; } +static inline void nmi_shutdown_mux(void) { } +static inline int nmi_setup_mux(void) { return 1; } +static inline void +nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } +static inline void mux_init(struct oprofile_operations *ops) { } +static void mux_clone(int cpu) { } + +#endif + +static void free_msrs(void) +{ + int i; + for_each_possible_cpu(i) { + kfree(per_cpu(cpu_msrs, i).counters); + per_cpu(cpu_msrs, i).counters = NULL; + kfree(per_cpu(cpu_msrs, i).controls); + per_cpu(cpu_msrs, i).controls = NULL; + } + nmi_shutdown_mux(); +} + +static int allocate_msrs(void) +{ + size_t controls_size = sizeof(struct op_msr) * model->num_controls; + size_t counters_size = sizeof(struct op_msr) * 
model->num_counters; + + int i; + for_each_possible_cpu(i) { + per_cpu(cpu_msrs, i).counters = kzalloc(counters_size, + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).counters) + goto fail; + per_cpu(cpu_msrs, i).controls = kzalloc(controls_size, + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).controls) + goto fail; + } + + if (!nmi_setup_mux()) + goto fail; + + return 1; + +fail: + free_msrs(); + return 0; +} + +static void nmi_cpu_setup(void *dummy) +{ + int cpu = smp_processor_id(); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + nmi_cpu_save_registers(msrs); + raw_spin_lock(&oprofilefs_lock); + model->setup_ctrs(model, msrs); + nmi_cpu_setup_mux(cpu, msrs); + raw_spin_unlock(&oprofilefs_lock); + per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); + apic_write(APIC_LVTPC, APIC_DM_NMI); +} + +static void nmi_cpu_restore_registers(struct op_msrs *msrs) +{ + struct op_msr *counters = msrs->counters; + struct op_msr *controls = msrs->controls; + unsigned int i; + + for (i = 0; i < model->num_controls; ++i) { + if (controls[i].addr) + wrmsrl(controls[i].addr, controls[i].saved); + } + + for (i = 0; i < model->num_counters; ++i) { + if (counters[i].addr) + wrmsrl(counters[i].addr, counters[i].saved); + } +} + +static void nmi_cpu_shutdown(void *dummy) +{ + unsigned int v; + int cpu = smp_processor_id(); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + + /* restoring APIC_LVTPC can trigger an apic error because the delivery + * mode and vector nr combination can be illegal. That's by design: on + * power on apic lvt contain a zero vector nr which are legal only for + * NMI delivery mode. 
So inhibit apic err before restoring lvtpc + */ + v = apic_read(APIC_LVTERR); + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); + apic_write(APIC_LVTERR, v); + nmi_cpu_restore_registers(msrs); +} + +static void nmi_cpu_up(void *dummy) +{ + if (nmi_enabled) + nmi_cpu_setup(dummy); + if (ctr_running) + nmi_cpu_start(dummy); +} + +static void nmi_cpu_down(void *dummy) +{ + if (ctr_running) + nmi_cpu_stop(dummy); + if (nmi_enabled) + nmi_cpu_shutdown(dummy); +} + +static int nmi_create_files(struct dentry *root) +{ + unsigned int i; + + for (i = 0; i < model->num_virt_counters; ++i) { + struct dentry *dir; + char buf[4]; + + /* quick little hack to _not_ expose a counter if it is not + * available for use. This should protect userspace app. + * NOTE: assumes 1:1 mapping here (that counters are organized + * sequentially in their struct assignment). + */ + if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i))) + continue; + + snprintf(buf, sizeof(buf), "%d", i); + dir = oprofilefs_mkdir(root, buf); + oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled); + oprofilefs_create_ulong(dir, "event", &counter_config[i].event); + oprofilefs_create_ulong(dir, "count", &counter_config[i].count); + oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask); + oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel); + oprofilefs_create_ulong(dir, "user", &counter_config[i].user); + oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra); + } + + return 0; +} + +static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, + void *data) +{ + int cpu = (unsigned long)data; + switch (action) { + case CPU_DOWN_FAILED: + case CPU_ONLINE: + smp_call_function_single(cpu, nmi_cpu_up, NULL, 0); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, nmi_cpu_down, NULL, 1); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block oprofile_cpu_nb = { 
+ .notifier_call = oprofile_cpu_notifier +}; + +static int nmi_setup(void) +{ + int err = 0; + int cpu; + + if (!allocate_msrs()) + return -ENOMEM; + + /* We need to serialize save and setup for HT because the subset + * of msrs are distinct for save and setup operations + */ + + /* Assume saved/restored counters are the same on all CPUs */ + err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); + if (err) + goto fail; + + for_each_possible_cpu(cpu) { + if (!cpu) + continue; + + memcpy(per_cpu(cpu_msrs, cpu).counters, + per_cpu(cpu_msrs, 0).counters, + sizeof(struct op_msr) * model->num_counters); + + memcpy(per_cpu(cpu_msrs, cpu).controls, + per_cpu(cpu_msrs, 0).controls, + sizeof(struct op_msr) * model->num_controls); + + mux_clone(cpu); + } + + nmi_enabled = 0; + ctr_running = 0; + /* make variables visible to the nmi handler: */ + smp_mb(); + err = register_nmi_handler(NMI_LOCAL, profile_exceptions_notify, + 0, "oprofile"); + if (err) + goto fail; + + cpu_notifier_register_begin(); + + /* Use get/put_online_cpus() to protect 'nmi_enabled' */ + get_online_cpus(); + nmi_enabled = 1; + /* make nmi_enabled visible to the nmi handler: */ + smp_mb(); + on_each_cpu(nmi_cpu_setup, NULL, 1); + __register_cpu_notifier(&oprofile_cpu_nb); + put_online_cpus(); + + cpu_notifier_register_done(); + + return 0; +fail: + free_msrs(); + return err; +} + +static void nmi_shutdown(void) +{ + struct op_msrs *msrs; + + cpu_notifier_register_begin(); + + /* Use get/put_online_cpus() to protect 'nmi_enabled' & 'ctr_running' */ + get_online_cpus(); + on_each_cpu(nmi_cpu_shutdown, NULL, 1); + nmi_enabled = 0; + ctr_running = 0; + __unregister_cpu_notifier(&oprofile_cpu_nb); + put_online_cpus(); + + cpu_notifier_register_done(); + + /* make variables visible to the nmi handler: */ + smp_mb(); + unregister_nmi_handler(NMI_LOCAL, "oprofile"); + msrs = &get_cpu_var(cpu_msrs); + model->shutdown(msrs); + free_msrs(); + put_cpu_var(cpu_msrs); +} + +#ifdef CONFIG_PM + +static int 
nmi_suspend(void) +{ + /* Only one CPU left, just stop that one */ + if (nmi_enabled == 1) + nmi_cpu_stop(NULL); + return 0; +} + +static void nmi_resume(void) +{ + if (nmi_enabled == 1) + nmi_cpu_start(NULL); +} + +static struct syscore_ops oprofile_syscore_ops = { + .resume = nmi_resume, + .suspend = nmi_suspend, +}; + +static void __init init_suspend_resume(void) +{ + register_syscore_ops(&oprofile_syscore_ops); +} + +static void exit_suspend_resume(void) +{ + unregister_syscore_ops(&oprofile_syscore_ops); +} + +#else + +static inline void init_suspend_resume(void) { } +static inline void exit_suspend_resume(void) { } + +#endif /* CONFIG_PM */ + +static int __init p4_init(char **cpu_type) +{ + __u8 cpu_model = boot_cpu_data.x86_model; + + if (cpu_model > 6 || cpu_model == 5) + return 0; + +#ifndef CONFIG_SMP + *cpu_type = "i386/p4"; + model = &op_p4_spec; + return 1; +#else + switch (smp_num_siblings) { + case 1: + *cpu_type = "i386/p4"; + model = &op_p4_spec; + return 1; + + case 2: + *cpu_type = "i386/p4-ht"; + model = &op_p4_ht2_spec; + return 1; + } +#endif + + printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n"); + printk(KERN_INFO "oprofile: Reverting to timer mode.\n"); + return 0; +} + +enum __force_cpu_type { + reserved = 0, /* do not force */ + timer, + arch_perfmon, +}; + +static int force_cpu_type; + +static int set_cpu_type(const char *str, struct kernel_param *kp) +{ + if (!strcmp(str, "timer")) { + force_cpu_type = timer; + printk(KERN_INFO "oprofile: forcing NMI timer mode\n"); + } else if (!strcmp(str, "arch_perfmon")) { + force_cpu_type = arch_perfmon; + printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); + } else { + force_cpu_type = 0; + } + + return 0; +} +module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0); + +static int __init ppro_init(char **cpu_type) +{ + __u8 cpu_model = boot_cpu_data.x86_model; + struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ + + if (force_cpu_type == arch_perfmon 
&& cpu_has_arch_perfmon) + return 0; + + /* + * Documentation on identifying Intel processors by CPU family + * and model can be found in the Intel Software Developer's + * Manuals (SDM): + * + * http://www.intel.com/products/processor/manuals/ + * + * As of May 2010 the documentation for this was in the: + * "Intel 64 and IA-32 Architectures Software Developer's + * Manual Volume 3B: System Programming Guide", "Table B-1 + * CPUID Signature Values of DisplayFamily_DisplayModel". + */ + switch (cpu_model) { + case 0 ... 2: + *cpu_type = "i386/ppro"; + break; + case 3 ... 5: + *cpu_type = "i386/pii"; + break; + case 6 ... 8: + case 10 ... 11: + *cpu_type = "i386/piii"; + break; + case 9: + case 13: + *cpu_type = "i386/p6_mobile"; + break; + case 14: + *cpu_type = "i386/core"; + break; + case 0x0f: + case 0x16: + case 0x17: + case 0x1d: + *cpu_type = "i386/core_2"; + break; + case 0x1a: + case 0x1e: + case 0x2e: + spec = &op_arch_perfmon_spec; + *cpu_type = "i386/core_i7"; + break; + case 0x1c: + *cpu_type = "i386/atom"; + break; + default: + /* Unknown */ + return 0; + } + + model = spec; + return 1; +} + +int __init op_nmi_init(struct oprofile_operations *ops) +{ + __u8 vendor = boot_cpu_data.x86_vendor; + __u8 family = boot_cpu_data.x86; + char *cpu_type = NULL; + int ret = 0; + + if (!cpu_has_apic) + return -ENODEV; + + if (force_cpu_type == timer) + return -ENODEV; + + switch (vendor) { + case X86_VENDOR_AMD: + /* Needs to be at least an Athlon (or hammer in 32bit mode) */ + + switch (family) { + case 6: + cpu_type = "i386/athlon"; + break; + case 0xf: + /* + * Actually it could be i386/hammer too, but + * give user space a consistent name. 
+ */ + cpu_type = "x86-64/hammer"; + break; + case 0x10: + cpu_type = "x86-64/family10"; + break; + case 0x11: + cpu_type = "x86-64/family11h"; + break; + case 0x12: + cpu_type = "x86-64/family12h"; + break; + case 0x14: + cpu_type = "x86-64/family14h"; + break; + case 0x15: + cpu_type = "x86-64/family15h"; + break; + default: + return -ENODEV; + } + model = &op_amd_spec; + break; + + case X86_VENDOR_INTEL: + switch (family) { + /* Pentium IV */ + case 0xf: + p4_init(&cpu_type); + break; + + /* A P6-class processor */ + case 6: + ppro_init(&cpu_type); + break; + + default: + break; + } + + if (cpu_type) + break; + + if (!cpu_has_arch_perfmon) + return -ENODEV; + + /* use arch perfmon as fallback */ + cpu_type = "i386/arch_perfmon"; + model = &op_arch_perfmon_spec; + break; + + default: + return -ENODEV; + } + + /* default values, can be overwritten by model */ + ops->create_files = nmi_create_files; + ops->setup = nmi_setup; + ops->shutdown = nmi_shutdown; + ops->start = nmi_start; + ops->stop = nmi_stop; + ops->cpu_type = cpu_type; + + if (model->init) + ret = model->init(ops); + if (ret) + return ret; + + if (!model->num_virt_counters) + model->num_virt_counters = model->num_counters; + + mux_init(ops); + + init_suspend_resume(); + + printk(KERN_INFO "oprofile: using NMI interrupt.\n"); + return 0; +} + +void op_nmi_exit(void) +{ + exit_suspend_resume(); +} diff --git a/kernel/arch/x86/oprofile/op_counter.h b/kernel/arch/x86/oprofile/op_counter.h new file mode 100644 index 000000000..0b7b7b179 --- /dev/null +++ b/kernel/arch/x86/oprofile/op_counter.h @@ -0,0 +1,30 @@ +/** + * @file op_counter.h + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + */ + +#ifndef OP_COUNTER_H +#define OP_COUNTER_H + +#define OP_MAX_COUNTER 32 + +/* Per-perfctr configuration as set via + * oprofilefs. 
+ */ +struct op_counter_config { + unsigned long count; + unsigned long enabled; + unsigned long event; + unsigned long kernel; + unsigned long user; + unsigned long unit_mask; + unsigned long extra; +}; + +extern struct op_counter_config counter_config[]; + +#endif /* OP_COUNTER_H */ diff --git a/kernel/arch/x86/oprofile/op_model_amd.c b/kernel/arch/x86/oprofile/op_model_amd.c new file mode 100644 index 000000000..50d86c0e9 --- /dev/null +++ b/kernel/arch/x86/oprofile/op_model_amd.c @@ -0,0 +1,543 @@ +/* + * @file op_model_amd.c + * athlon / K7 / K8 / Family 10h model-specific MSR operations + * + * @remark Copyright 2002-2009 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author Philippe Elie + * @author Graydon Hoare + * @author Robert Richter + * @author Barry Kasindorf + * @author Jason Yeh + * @author Suravee Suthikulpanit + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "op_x86_model.h" +#include "op_counter.h" + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +#define NUM_VIRT_COUNTERS 32 +#else +#define NUM_VIRT_COUNTERS 0 +#endif + +#define OP_EVENT_MASK 0x0FFF +#define OP_CTR_OVERFLOW (1ULL<<31) + +#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) + +static int num_counters; +static unsigned long reset_value[OP_MAX_COUNTER]; + +#define IBS_FETCH_SIZE 6 +#define IBS_OP_SIZE 12 + +static u32 ibs_caps; + +struct ibs_config { + unsigned long op_enabled; + unsigned long fetch_enabled; + unsigned long max_cnt_fetch; + unsigned long max_cnt_op; + unsigned long rand_en; + unsigned long dispatched_ops; + unsigned long branch_target; +}; + +struct ibs_state { + u64 ibs_op_ctl; + int branch_target; + unsigned long sample_size; +}; + +static struct ibs_config ibs_config; +static struct ibs_state ibs_state; + +/* + * IBS randomization macros + */ +#define IBS_RANDOM_BITS 12 +#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1) +#define 
IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5)) + +/* + * 16-bit Linear Feedback Shift Register (LFSR) + * + * 16 14 13 11 + * Feedback polynomial = X + X + X + X + 1 + */ +static unsigned int lfsr_random(void) +{ + static unsigned int lfsr_value = 0xF00D; + unsigned int bit; + + /* Compute next bit to shift in */ + bit = ((lfsr_value >> 0) ^ + (lfsr_value >> 2) ^ + (lfsr_value >> 3) ^ + (lfsr_value >> 5)) & 0x0001; + + /* Advance to next register value */ + lfsr_value = (lfsr_value >> 1) | (bit << 15); + + return lfsr_value; +} + +/* + * IBS software randomization + * + * The IBS periodic op counter is randomized in software. The lower 12 + * bits of the 20 bit counter are randomized. IbsOpCurCnt is + * initialized with a 12 bit random value. + */ +static inline u64 op_amd_randomize_ibs_op(u64 val) +{ + unsigned int random = lfsr_random(); + + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) + /* + * Work around if the hw can not write to IbsOpCurCnt + * + * Randomize the lower 8 bits of the 16 bit + * IbsOpMaxCnt [15:0] value in the range of -128 to + * +127 by adding/subtracting an offset to the + * maximum count (IbsOpMaxCnt). + * + * To avoid over or underflows and protect upper bits + * starting at bit 16, the initial value for + * IbsOpMaxCnt must fit in the range from 0x0081 to + * 0xff80. 
+ */ + val += (s8)(random >> 4); + else + val |= (u64)(random & IBS_RANDOM_MASK) << 32; + + return val; +} + +static inline void +op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + u64 val, ctl; + struct op_entry entry; + + if (!ibs_caps) + return; + + if (ibs_config.fetch_enabled) { + rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); + if (ctl & IBS_FETCH_VAL) { + rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); + oprofile_write_reserve(&entry, regs, val, + IBS_FETCH_CODE, IBS_FETCH_SIZE); + oprofile_add_data64(&entry, val); + oprofile_add_data64(&entry, ctl); + rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); + oprofile_add_data64(&entry, val); + oprofile_write_commit(&entry); + + /* reenable the IRQ */ + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT); + ctl |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); + } + } + + if (ibs_config.op_enabled) { + rdmsrl(MSR_AMD64_IBSOPCTL, ctl); + if (ctl & IBS_OP_VAL) { + rdmsrl(MSR_AMD64_IBSOPRIP, val); + oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, + ibs_state.sample_size); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA2, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA3, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSDCLINAD, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); + oprofile_add_data64(&entry, val); + if (ibs_state.branch_target) { + rdmsrl(MSR_AMD64_IBSBRTARGET, val); + oprofile_add_data(&entry, (unsigned long)val); + } + oprofile_write_commit(&entry); + + /* reenable the IRQ */ + ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); + wrmsrl(MSR_AMD64_IBSOPCTL, ctl); + } + } +} + +static inline void op_amd_start_ibs(void) +{ + u64 val; + + if (!ibs_caps) + return; + + memset(&ibs_state, 0, sizeof(ibs_state)); + + /* + * Note: Since the max count settings may be out of range we + * write back the actual used values so that userland can read + * 
it. + */ + + if (ibs_config.fetch_enabled) { + val = ibs_config.max_cnt_fetch >> 4; + val = min(val, IBS_FETCH_MAX_CNT); + ibs_config.max_cnt_fetch = val << 4; + val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; + val |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, val); + } + + if (ibs_config.op_enabled) { + val = ibs_config.max_cnt_op >> 4; + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { + /* + * IbsOpCurCnt not supported. See + * op_amd_randomize_ibs_op() for details. + */ + val = clamp(val, 0x0081ULL, 0xFF80ULL); + ibs_config.max_cnt_op = val << 4; + } else { + /* + * The start value is randomized with a + * positive offset, we need to compensate it + * with the half of the randomized range. Also + * avoid underflows. + */ + val += IBS_RANDOM_MAXCNT_OFFSET; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + val = min(val, IBS_OP_MAX_CNT_EXT); + else + val = min(val, IBS_OP_MAX_CNT); + ibs_config.max_cnt_op = + (val - IBS_RANDOM_MAXCNT_OFFSET) << 4; + } + val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT); + val |= ibs_config.dispatched_ops ? 
IBS_OP_CNT_CTL : 0; + val |= IBS_OP_ENABLE; + ibs_state.ibs_op_ctl = val; + ibs_state.sample_size = IBS_OP_SIZE; + if (ibs_config.branch_target) { + ibs_state.branch_target = 1; + ibs_state.sample_size++; + } + val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); + wrmsrl(MSR_AMD64_IBSOPCTL, val); + } +} + +static void op_amd_stop_ibs(void) +{ + if (!ibs_caps) + return; + + if (ibs_config.fetch_enabled) + /* clear max count and enable */ + wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); + + if (ibs_config.op_enabled) + /* clear max count and enable */ + wrmsrl(MSR_AMD64_IBSOPCTL, 0); +} + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* enable active counters */ + for (i = 0; i < num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } +} + +#endif + +/* functions for op_amd_spec */ + +static void op_amd_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < num_counters; ++i) { + if (!msrs->counters[i].addr) + continue; + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } +} + +static int op_amd_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < num_counters; i++) { + if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + goto fail; + if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + goto fail; + } + /* both registers must be reserved */ + if (num_counters == AMD64_NUM_COUNTERS_CORE) { + msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); + msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); + } else { + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + } + continue; + fail: 
+ if (!counter_config[i].enabled) + continue; + op_x86_warn_reserved(i); + op_amd_shutdown(msrs); + return -EBUSY; + } + + return 0; +} + +static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* setup reset_value */ + for (i = 0; i < OP_MAX_COUNTER; ++i) { + if (counter_config[i].enabled + && msrs->counters[op_x86_virt_to_phys(i)].addr) + reset_value[i] = counter_config[i].count; + else + reset_value[i] = 0; + } + + /* clear all counters */ + for (i = 0; i < num_counters; ++i) { + if (!msrs->controls[i].addr) + continue; + rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + op_x86_warn_in_use(i); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); + /* + * avoid a false detection of ctr overflows in NMI + * handler + */ + wrmsrl(msrs->counters[i].addr, -1LL); + } + + /* enable active counters */ + for (i = 0; i < num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) + continue; + + /* setup counter registers */ + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); + + /* setup control registers */ + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } +} + +static int op_amd_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + for (i = 0; i < num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) + continue; + rdmsrl(msrs->counters[i].addr, val); + /* bit is clear if overflowed: */ + if (val & OP_CTR_OVERFLOW) + continue; + oprofile_add_sample(regs, virt); + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); + } + + op_amd_handle_ibs(regs, msrs); + + /* See op_model_ppro.c */ + return 1; +} + +static void op_amd_start(struct op_msrs const * const msrs) +{ + u64 val; + int i; + + for (i = 0; i < num_counters; ++i) { + if 
(!reset_value[op_x86_phys_to_virt(i)]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(msrs->controls[i].addr, val); + } + + op_amd_start_ibs(); +} + +static void op_amd_stop(struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* + * Subtle: stop on all counters to avoid race with setting our + * pm callback + */ + for (i = 0; i < num_counters; ++i) { + if (!reset_value[op_x86_phys_to_virt(i)]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(msrs->controls[i].addr, val); + } + + op_amd_stop_ibs(); +} + +/* + * check and reserve APIC extended interrupt LVT offset for IBS if + * available + */ + +static void init_ibs(void) +{ + ibs_caps = get_ibs_caps(); + + if (!ibs_caps) + return; + + printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); +} + +static int (*create_arch_files)(struct dentry *root); + +static int setup_ibs_files(struct dentry *root) +{ + struct dentry *dir; + int ret = 0; + + /* architecture specific files */ + if (create_arch_files) + ret = create_arch_files(root); + + if (ret) + return ret; + + if (!ibs_caps) + return ret; + + /* model specific files */ + + /* setup some reasonable defaults */ + memset(&ibs_config, 0, sizeof(ibs_config)); + ibs_config.max_cnt_fetch = 250000; + ibs_config.max_cnt_op = 250000; + + if (ibs_caps & IBS_CAPS_FETCHSAM) { + dir = oprofilefs_mkdir(root, "ibs_fetch"); + oprofilefs_create_ulong(dir, "enable", + &ibs_config.fetch_enabled); + oprofilefs_create_ulong(dir, "max_count", + &ibs_config.max_cnt_fetch); + oprofilefs_create_ulong(dir, "rand_enable", + &ibs_config.rand_en); + } + + if (ibs_caps & IBS_CAPS_OPSAM) { + dir = oprofilefs_mkdir(root, "ibs_op"); + oprofilefs_create_ulong(dir, "enable", + &ibs_config.op_enabled); + oprofilefs_create_ulong(dir, "max_count", + &ibs_config.max_cnt_op); + if (ibs_caps & IBS_CAPS_OPCNT) + oprofilefs_create_ulong(dir, "dispatched_ops", + 
&ibs_config.dispatched_ops); + if (ibs_caps & IBS_CAPS_BRNTRGT) + oprofilefs_create_ulong(dir, "branch_target", + &ibs_config.branch_target); + } + + return 0; +} + +struct op_x86_model_spec op_amd_spec; + +static int op_amd_init(struct oprofile_operations *ops) +{ + init_ibs(); + create_arch_files = ops->create_files; + ops->create_files = setup_ibs_files; + + if (boot_cpu_data.x86 == 0x15) { + num_counters = AMD64_NUM_COUNTERS_CORE; + } else { + num_counters = AMD64_NUM_COUNTERS; + } + + op_amd_spec.num_counters = num_counters; + op_amd_spec.num_controls = num_counters; + op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS); + + return 0; +} + +struct op_x86_model_spec op_amd_spec = { + /* num_counters/num_controls filled in at runtime */ + .reserved = MSR_AMD_EVENTSEL_RESERVED, + .event_mask = OP_EVENT_MASK, + .init = op_amd_init, + .fill_in_addresses = &op_amd_fill_in_addresses, + .setup_ctrs = &op_amd_setup_ctrs, + .check_ctrs = &op_amd_check_ctrs, + .start = &op_amd_start, + .stop = &op_amd_stop, + .shutdown = &op_amd_shutdown, +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + .switch_ctrl = &op_mux_switch_ctrl, +#endif +}; diff --git a/kernel/arch/x86/oprofile/op_model_p4.c b/kernel/arch/x86/oprofile/op_model_p4.c new file mode 100644 index 000000000..ad1d91f47 --- /dev/null +++ b/kernel/arch/x86/oprofile/op_model_p4.c @@ -0,0 +1,723 @@ +/** + * @file op_model_p4.c + * P4 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author Graydon Hoare + */ + +#include +#include +#include +#include +#include +#include +#include + + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_EVENTS 39 + +#define NUM_COUNTERS_NON_HT 8 +#define NUM_ESCRS_NON_HT 45 +#define NUM_CCCRS_NON_HT 18 +#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT) + +#define NUM_COUNTERS_HT2 4 +#define NUM_ESCRS_HT2 23 +#define NUM_CCCRS_HT2 9 +#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + 
NUM_CCCRS_HT2) + +#define OP_CTR_OVERFLOW (1ULL<<31) + +static unsigned int num_counters = NUM_COUNTERS_NON_HT; +static unsigned int num_controls = NUM_CONTROLS_NON_HT; + +/* this has to be checked dynamically since the + hyper-threadedness of a chip is discovered at + kernel boot-time. */ +static inline void setup_num_counters(void) +{ +#ifdef CONFIG_SMP + if (smp_num_siblings == 2) { + num_counters = NUM_COUNTERS_HT2; + num_controls = NUM_CONTROLS_HT2; + } +#endif +} + +static inline int addr_increment(void) +{ +#ifdef CONFIG_SMP + return smp_num_siblings == 2 ? 2 : 1; +#else + return 1; +#endif +} + + +/* tables to simulate simplified hardware view of p4 registers */ +struct p4_counter_binding { + int virt_counter; + int counter_address; + int cccr_address; +}; + +struct p4_event_binding { + int escr_select; /* value to put in CCCR */ + int event_select; /* value to put in ESCR */ + struct { + int virt_counter; /* for this counter... */ + int escr_address; /* use this ESCR */ + } bindings[2]; +}; + +/* nb: these CTR_* defines are a duplicate of defines in + event/i386.p4*events. */ + + +#define CTR_BPU_0 (1 << 0) +#define CTR_MS_0 (1 << 1) +#define CTR_FLAME_0 (1 << 2) +#define CTR_IQ_4 (1 << 3) +#define CTR_BPU_2 (1 << 4) +#define CTR_MS_2 (1 << 5) +#define CTR_FLAME_2 (1 << 6) +#define CTR_IQ_5 (1 << 7) + +static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = { + { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, + { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, + { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, + { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 }, + { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 }, + { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 }, + { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 }, + { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } +}; + +#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT) + +/* p4 event codes in libop/op_event.h are indices into this table. 
*/ + +static struct p4_event_binding p4_events[NUM_EVENTS] = { + + { /* BRANCH_RETIRED */ + 0x05, 0x06, + { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, + {CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* MISPRED_BRANCH_RETIRED */ + 0x04, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* TC_DELIVER_MODE */ + 0x01, 0x01, + { { CTR_MS_0, MSR_P4_TC_ESCR0}, + { CTR_MS_2, MSR_P4_TC_ESCR1} } + }, + + { /* BPU_FETCH_REQUEST */ + 0x00, 0x03, + { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, + { CTR_BPU_2, MSR_P4_BPU_ESCR1} } + }, + + { /* ITLB_REFERENCE */ + 0x03, 0x18, + { { CTR_BPU_0, MSR_P4_ITLB_ESCR0}, + { CTR_BPU_2, MSR_P4_ITLB_ESCR1} } + }, + + { /* MEMORY_CANCEL */ + 0x05, 0x02, + { { CTR_FLAME_0, MSR_P4_DAC_ESCR0}, + { CTR_FLAME_2, MSR_P4_DAC_ESCR1} } + }, + + { /* MEMORY_COMPLETE */ + 0x02, 0x08, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* LOAD_PORT_REPLAY */ + 0x02, 0x04, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* STORE_PORT_REPLAY */ + 0x02, 0x05, + { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, + { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } + }, + + { /* MOB_LOAD_REPLAY */ + 0x02, 0x03, + { { CTR_BPU_0, MSR_P4_MOB_ESCR0}, + { CTR_BPU_2, MSR_P4_MOB_ESCR1} } + }, + + { /* PAGE_WALK_TYPE */ + 0x04, 0x01, + { { CTR_BPU_0, MSR_P4_PMH_ESCR0}, + { CTR_BPU_2, MSR_P4_PMH_ESCR1} } + }, + + { /* BSQ_CACHE_REFERENCE */ + 0x07, 0x0c, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + { CTR_BPU_2, MSR_P4_BSU_ESCR1} } + }, + + { /* IOQ_ALLOCATION */ + 0x06, 0x03, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { 0, 0 } } + }, + + { /* IOQ_ACTIVE_ENTRIES */ + 0x06, 0x1a, + { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, + { 0, 0 } } + }, + + { /* FSB_DATA_ACTIVITY */ + 0x06, 0x17, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* BSQ_ALLOCATION */ + 0x07, 0x05, + { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, + { 0, 0 } } + }, + + { /* BSQ_ACTIVE_ENTRIES */ + 0x07, 0x06, + { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* 
guess */}, + { 0, 0 } } + }, + + { /* X87_ASSIST */ + 0x05, 0x03, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* SSE_INPUT_ASSIST */ + 0x01, 0x34, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_SP_UOP */ + 0x01, 0x08, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* PACKED_DP_UOP */ + 0x01, 0x0c, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_SP_UOP */ + 0x01, 0x0a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* SCALAR_DP_UOP */ + 0x01, 0x0e, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 64BIT_MMX_UOP */ + 0x01, 0x02, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* 128BIT_MMX_UOP */ + 0x01, 0x1a, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_FP_UOP */ + 0x01, 0x04, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* X87_SIMD_MOVES_UOP */ + 0x01, 0x2e, + { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, + { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } + }, + + { /* MACHINE_CLEAR */ + 0x05, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* GLOBAL_POWER_EVENTS */ + 0x06, 0x13 /* older manual says 0x05, newer 0x13 */, + { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, + { CTR_BPU_2, MSR_P4_FSB_ESCR1} } + }, + + { /* TC_MS_XFER */ + 0x00, 0x05, + { { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* UOP_QUEUE_WRITES */ + 0x00, 0x09, + { { CTR_MS_0, MSR_P4_MS_ESCR0}, + { CTR_MS_2, MSR_P4_MS_ESCR1} } + }, + + { /* FRONT_END_EVENT */ + 0x05, 0x08, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* EXECUTION_EVENT */ + 0x05, 0x0c, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* REPLAY_EVENT 
*/ + 0x05, 0x09, + { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, + { CTR_IQ_5, MSR_P4_CRU_ESCR3} } + }, + + { /* INSTR_RETIRED */ + 0x04, 0x02, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOPS_RETIRED */ + 0x04, 0x01, + { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, + { CTR_IQ_5, MSR_P4_CRU_ESCR1} } + }, + + { /* UOP_TYPE */ + 0x02, 0x02, + { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, + { CTR_IQ_5, MSR_P4_RAT_ESCR1} } + }, + + { /* RETIRED_MISPRED_BRANCH_TYPE */ + 0x02, 0x05, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + }, + + { /* RETIRED_BRANCH_TYPE */ + 0x02, 0x04, + { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, + { CTR_MS_2, MSR_P4_TBPU_ESCR1} } + } +}; + + +#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7) + +#define ESCR_RESERVED_BITS 0x80000003 +#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS) +#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2)) +#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3)) +#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1))) +#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) +#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) +#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) + +#define CCCR_RESERVED_BITS 0x38030FFF +#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) +#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000) +#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13)) +#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26)) +#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) +#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) +#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) +#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) +#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) + + +/* this assigns a "stagger" to the current CPU, which is used throughout + the code in this module as an extra array offset, to select the "even" + or "odd" part of all the divided resources. 
*/ +static unsigned int get_stagger(void) +{ +#ifdef CONFIG_SMP + int cpu = smp_processor_id(); + return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); +#endif + return 0; +} + + +/* finally, mediate access to a real hardware counter + by passing a "virtual" counter numer to this macro, + along with your stagger setting. */ +#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger))) + +static unsigned long reset_value[NUM_COUNTERS_NON_HT]; + +static void p4_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < num_counters; ++i) { + if (msrs->counters[i].addr) + release_perfctr_nmi(msrs->counters[i].addr); + } + /* + * some of the control registers are specially reserved in + * conjunction with the counter registers (hence the starting offset). + * This saves a few bits. + */ + for (i = num_counters; i < num_controls; ++i) { + if (msrs->controls[i].addr) + release_evntsel_nmi(msrs->controls[i].addr); + } +} + +static int p4_fill_in_addresses(struct op_msrs * const msrs) +{ + unsigned int i; + unsigned int addr, cccraddr, stag; + + setup_num_counters(); + stag = get_stagger(); + + /* the counter & cccr registers we pay attention to */ + for (i = 0; i < num_counters; ++i) { + addr = p4_counters[VIRT_CTR(stag, i)].counter_address; + cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; + if (reserve_perfctr_nmi(addr)) { + msrs->counters[i].addr = addr; + msrs->controls[i].addr = cccraddr; + } + } + + /* 43 ESCR registers in three or four discontiguous group */ + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) { + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + + /* no IQ_ESCR0/1 on some models, we save a seconde time BSU_ESCR0/1 + * to avoid special case in nmi_{save|restore}_registers() */ + if (boot_cpu_data.x86_model >= 0x3) { + for (addr = MSR_P4_BSU_ESCR0 + stag; + addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) { + if 
(reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + } else { + for (addr = MSR_P4_IQ_ESCR0 + stag; + addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) { + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + } + + for (addr = MSR_P4_RAT_ESCR0 + stag; + addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + + for (addr = MSR_P4_MS_ESCR0 + stag; + addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + + for (addr = MSR_P4_IX_ESCR0 + stag; + addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { + if (reserve_evntsel_nmi(addr)) + msrs->controls[i].addr = addr; + } + + /* there are 2 remaining non-contiguously located ESCRs */ + + if (num_counters == NUM_COUNTERS_NON_HT) { + /* standard non-HT CPUs handle both remaining ESCRs*/ + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4)) + msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; + + } else if (stag == 0) { + /* HT CPUs give the first remainder to the even thread, as + the 32nd control register */ + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4)) + msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; + + } else { + /* and two copies of the second to the odd thread, + for the 22st and 23nd control registers */ + if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) { + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; + } + } + + for (i = 0; i < num_counters; ++i) { + if (!counter_config[i].enabled) + continue; + if (msrs->controls[i].addr) + continue; + op_x86_warn_reserved(i); + p4_shutdown(msrs); + return -EBUSY; + } + + return 0; +} + + +static void pmc_setup_one_p4_counter(unsigned int ctr) +{ + int i; + int const maxbind = 2; + unsigned int cccr = 0; + unsigned int escr = 0; + unsigned int high = 0; + unsigned int counter_bit; + 
struct p4_event_binding *ev = NULL; + unsigned int stag; + + stag = get_stagger(); + + /* convert from counter *number* to counter *bit* */ + counter_bit = 1 << VIRT_CTR(stag, ctr); + + /* find our event binding structure. */ + if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { + printk(KERN_ERR + "oprofile: P4 event code 0x%lx out of range\n", + counter_config[ctr].event); + return; + } + + ev = &(p4_events[counter_config[ctr].event - 1]); + + for (i = 0; i < maxbind; i++) { + if (ev->bindings[i].virt_counter & counter_bit) { + + /* modify ESCR */ + rdmsr(ev->bindings[i].escr_address, escr, high); + ESCR_CLEAR(escr); + if (stag == 0) { + ESCR_SET_USR_0(escr, counter_config[ctr].user); + ESCR_SET_OS_0(escr, counter_config[ctr].kernel); + } else { + ESCR_SET_USR_1(escr, counter_config[ctr].user); + ESCR_SET_OS_1(escr, counter_config[ctr].kernel); + } + ESCR_SET_EVENT_SELECT(escr, ev->event_select); + ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); + wrmsr(ev->bindings[i].escr_address, escr, high); + + /* modify CCCR */ + rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); + CCCR_CLEAR(cccr); + CCCR_SET_REQUIRED_BITS(cccr); + CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); + if (stag == 0) + CCCR_SET_PMI_OVF_0(cccr); + else + CCCR_SET_PMI_OVF_1(cccr); + wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); + return; + } + } + + printk(KERN_ERR + "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", + counter_config[ctr].event, stag, ctr); +} + + +static void p4_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + unsigned int i; + unsigned int low, high; + unsigned int stag; + + stag = get_stagger(); + + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (!MISC_PMC_ENABLED_P(low)) { + printk(KERN_ERR "oprofile: P4 PMC not available\n"); + return; + } + + /* clear the cccrs we will use */ + for (i = 0; i < num_counters; i++) { + if 
(unlikely(!msrs->controls[i].addr)) + continue; + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + CCCR_CLEAR(low); + CCCR_SET_REQUIRED_BITS(low); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + } + + /* clear all escrs (including those outside our concern) */ + for (i = num_counters; i < num_controls; i++) { + if (unlikely(!msrs->controls[i].addr)) + continue; + wrmsr(msrs->controls[i].addr, 0, 0); + } + + /* setup all counters */ + for (i = 0; i < num_counters; ++i) { + if (counter_config[i].enabled && msrs->controls[i].addr) { + reset_value[i] = counter_config[i].count; + pmc_setup_one_p4_counter(i); + wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, + -(u64)counter_config[i].count); + } else { + reset_value[i] = 0; + } + } +} + + +static int p4_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned long ctr, low, high, stag, real; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + + if (!reset_value[i]) + continue; + + /* + * there is some eccentricity in the hardware which + * requires that we perform 2 extra corrections: + * + * - check both the CCCR:OVF flag for overflow and the + * counter high bit for un-flagged overflows. + * + * - write the counter back twice to ensure it gets + * updated properly. + * + * the former seems to be related to extra NMIs happening + * during the current NMI; the latter is reported as errata + * N15 in intel doc 249199-029, pentium 4 specification + * update, though their suggested work-around does not + * appear to solve the problem. 
+ */ + + real = VIRT_CTR(stag, i); + + rdmsr(p4_counters[real].cccr_address, low, high); + rdmsr(p4_counters[real].counter_address, ctr, high); + if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { + oprofile_add_sample(regs, i); + wrmsrl(p4_counters[real].counter_address, + -(u64)reset_value[i]); + CCCR_CLEAR_OVF(low); + wrmsr(p4_counters[real].cccr_address, low, high); + wrmsrl(p4_counters[real].counter_address, + -(u64)reset_value[i]); + } + } + + /* P4 quirk: you have to re-unmask the apic vector */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + + /* See op_model_ppro.c */ + return 1; +} + + +static void p4_start(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + CCCR_SET_ENABLE(low); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + } +} + + +static void p4_stop(struct op_msrs const * const msrs) +{ + unsigned int low, high, stag; + int i; + + stag = get_stagger(); + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + CCCR_SET_DISABLE(low); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); + } +} + +#ifdef CONFIG_SMP +struct op_x86_model_spec op_p4_ht2_spec = { + .num_counters = NUM_COUNTERS_HT2, + .num_controls = NUM_CONTROLS_HT2, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop, + .shutdown = &p4_shutdown +}; +#endif + +struct op_x86_model_spec op_p4_spec = { + .num_counters = NUM_COUNTERS_NON_HT, + .num_controls = NUM_CONTROLS_NON_HT, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop, + .shutdown = &p4_shutdown +}; diff 
--git a/kernel/arch/x86/oprofile/op_model_ppro.c b/kernel/arch/x86/oprofile/op_model_ppro.c new file mode 100644 index 000000000..d90528ea5 --- /dev/null +++ b/kernel/arch/x86/oprofile/op_model_ppro.c @@ -0,0 +1,245 @@ +/* + * @file op_model_ppro.h + * Family 6 perfmon and architectural perfmon MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Copyright 2008 Intel Corporation + * @remark Read the file COPYING + * + * @author John Levon + * @author Philippe Elie + * @author Graydon Hoare + * @author Andi Kleen + * @author Robert Richter + */ + +#include +#include +#include +#include +#include +#include + +#include "op_x86_model.h" +#include "op_counter.h" + +static int num_counters = 2; +static int counter_width = 32; + +#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) + +static u64 reset_value[OP_MAX_COUNTER]; + +static void ppro_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < num_counters; ++i) { + if (!msrs->counters[i].addr) + continue; + release_perfctr_nmi(MSR_P6_PERFCTR0 + i); + release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); + } +} + +static int ppro_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < num_counters; i++) { + if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) + goto fail; + if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) { + release_perfctr_nmi(MSR_P6_PERFCTR0 + i); + goto fail; + } + /* both registers must be reserved */ + msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; + msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; + continue; + fail: + if (!counter_config[i].enabled) + continue; + op_x86_warn_reserved(i); + ppro_shutdown(msrs); + return -EBUSY; + } + + return 0; +} + + +static void ppro_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + if (cpu_has_arch_perfmon) { + union cpuid10_eax eax; + eax.full = cpuid_eax(0xa); + + /* + * For Core2 (family 6, model 15), don't reset the + * counter width: 
+ */ + if (!(eax.split.version_id == 0 && + __this_cpu_read(cpu_info.x86) == 6 && + __this_cpu_read(cpu_info.x86_model) == 15)) { + + if (counter_width < eax.split.bit_width) + counter_width = eax.split.bit_width; + } + } + + /* clear all counters */ + for (i = 0; i < num_counters; ++i) { + if (!msrs->controls[i].addr) + continue; + rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + op_x86_warn_in_use(i); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); + /* + * avoid a false detection of ctr overflows in NMI * + * handler + */ + wrmsrl(msrs->counters[i].addr, -1LL); + } + + /* enable active counters */ + for (i = 0; i < num_counters; ++i) { + if (counter_config[i].enabled && msrs->counters[i].addr) { + reset_value[i] = counter_config[i].count; + wrmsrl(msrs->counters[i].addr, -reset_value[i]); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[i]); + wrmsrl(msrs->controls[i].addr, val); + } else { + reset_value[i] = 0; + } + } +} + + +static int ppro_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + rdmsrl(msrs->counters[i].addr, val); + if (val & (1ULL << (counter_width - 1))) + continue; + oprofile_add_sample(regs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); + } + + /* Only P6 based Pentium M need to re-unmask the apic vector but it + * doesn't hurt other P6 variant */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + + /* We can't work out if we really handled an interrupt. We + * might have caught a *second* counter just after overflowing + * the interrupt for this counter then arrives + * and we don't find a counter that's overflowed, so we + * would return 0 and get dazed + confused. Instead we always + * assume we found an overflow. This sucks. 
+ */ + return 1; +} + + +static void ppro_start(struct op_msrs const * const msrs) +{ + u64 val; + int i; + + for (i = 0; i < num_counters; ++i) { + if (reset_value[i]) { + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(msrs->controls[i].addr, val); + } + } +} + + +static void ppro_stop(struct op_msrs const * const msrs) +{ + u64 val; + int i; + + for (i = 0; i < num_counters; ++i) { + if (!reset_value[i]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(msrs->controls[i].addr, val); + } +} + +struct op_x86_model_spec op_ppro_spec = { + .num_counters = 2, + .num_controls = 2, + .reserved = MSR_PPRO_EVENTSEL_RESERVED, + .fill_in_addresses = &ppro_fill_in_addresses, + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop, + .shutdown = &ppro_shutdown +}; + +/* + * Architectural performance monitoring. + * + * Newer Intel CPUs (Core1+) have support for architectural + * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details. + * The advantage of this is that it can be done without knowing about + * the specific CPU. + */ + +static void arch_perfmon_setup_counters(void) +{ + union cpuid10_eax eax; + + eax.full = cpuid_eax(0xa); + + /* Workaround for BIOS bugs in 6/15. 
Taken from perfmon2 */ + if (eax.split.version_id == 0 && __this_cpu_read(cpu_info.x86) == 6 && + __this_cpu_read(cpu_info.x86_model) == 15) { + eax.split.version_id = 2; + eax.split.num_counters = 2; + eax.split.bit_width = 40; + } + + num_counters = min((int)eax.split.num_counters, OP_MAX_COUNTER); + + op_arch_perfmon_spec.num_counters = num_counters; + op_arch_perfmon_spec.num_controls = num_counters; +} + +static int arch_perfmon_init(struct oprofile_operations *ignore) +{ + arch_perfmon_setup_counters(); + return 0; +} + +struct op_x86_model_spec op_arch_perfmon_spec = { + .reserved = MSR_PPRO_EVENTSEL_RESERVED, + .init = &arch_perfmon_init, + /* num_counters/num_controls filled in at runtime */ + .fill_in_addresses = &ppro_fill_in_addresses, + /* user space does the cpuid check for available events */ + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop, + .shutdown = &ppro_shutdown +}; diff --git a/kernel/arch/x86/oprofile/op_x86_model.h b/kernel/arch/x86/oprofile/op_x86_model.h new file mode 100644 index 000000000..71e8a6733 --- /dev/null +++ b/kernel/arch/x86/oprofile/op_x86_model.h @@ -0,0 +1,90 @@ +/** + * @file op_x86_model.h + * interface to x86 model-specific MSR operations + * + * @remark Copyright 2002 OProfile authors + * @remark Read the file COPYING + * + * @author Graydon Hoare + * @author Robert Richter + */ + +#ifndef OP_X86_MODEL_H +#define OP_X86_MODEL_H + +#include +#include + +struct op_msr { + unsigned long addr; + u64 saved; +}; + +struct op_msrs { + struct op_msr *counters; + struct op_msr *controls; + struct op_msr *multiplex; +}; + +struct pt_regs; + +struct oprofile_operations; + +/* The model vtable abstracts the differences between + * various x86 CPU models' perfctr support. 
+ */ +struct op_x86_model_spec { + unsigned int num_counters; + unsigned int num_controls; + unsigned int num_virt_counters; + u64 reserved; + u16 event_mask; + int (*init)(struct oprofile_operations *ops); + int (*fill_in_addresses)(struct op_msrs * const msrs); + void (*setup_ctrs)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); + int (*check_ctrs)(struct pt_regs * const regs, + struct op_msrs const * const msrs); + void (*start)(struct op_msrs const * const msrs); + void (*stop)(struct op_msrs const * const msrs); + void (*shutdown)(struct op_msrs const * const msrs); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + void (*switch_ctrl)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); +#endif +}; + +struct op_counter_config; + +static inline void op_x86_warn_in_use(int counter) +{ + /* + * The warning indicates an already running counter. If + * oprofile doesn't collect data, then try using a different + * performance counter on your platform to monitor the desired + * event. Delete counter #%d from the desired event by editing + * the /usr/share/oprofile/%s//events file. If the event + * cannot be monitored by any other counter, contact your + * hardware or BIOS vendor. + */ + pr_warning("oprofile: counter #%d on cpu #%d may already be used\n", + counter, smp_processor_id()); +} + +static inline void op_x86_warn_reserved(int counter) +{ + pr_warning("oprofile: counter #%d is already reserved\n", counter); +} + +extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config); +extern int op_x86_phys_to_virt(int phys); +extern int op_x86_virt_to_phys(int virt); + +extern struct op_x86_model_spec op_ppro_spec; +extern struct op_x86_model_spec op_p4_spec; +extern struct op_x86_model_spec op_p4_ht2_spec; +extern struct op_x86_model_spec op_amd_spec; +extern struct op_x86_model_spec op_arch_perfmon_spec; + +#endif /* OP_X86_MODEL_H */ -- cgit 1.2.3-korg