From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Tue, 4 Aug 2015 12:17:53 -0700 Subject: Add the rt linux 4.1.3-rt3 as base Import the rt linux 4.1.3-rt3 as OPNFV kvm base. It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and the base is: commit 0917f823c59692d751951bf5ea699a2d1e2f26a2 Author: Sebastian Andrzej Siewior Date: Sat Jul 25 12:13:34 2015 +0200 Prepare v4.1.3-rt3 Signed-off-by: Sebastian Andrzej Siewior We lose all the git history this way and it's not good. We should apply another opnfv project repo in future. Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423 Signed-off-by: Yunhong Jiang --- kernel/drivers/idle/Kconfig | 27 + kernel/drivers/idle/Makefile | 3 + kernel/drivers/idle/i7300_idle.c | 612 +++++++++++++++++++++ kernel/drivers/idle/intel_idle.c | 1107 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 1749 insertions(+) create mode 100644 kernel/drivers/idle/Kconfig create mode 100644 kernel/drivers/idle/Makefile create mode 100644 kernel/drivers/idle/i7300_idle.c create mode 100644 kernel/drivers/idle/intel_idle.c (limited to 'kernel/drivers/idle') diff --git a/kernel/drivers/idle/Kconfig b/kernel/drivers/idle/Kconfig new file mode 100644 index 000000000..4732dfc15 --- /dev/null +++ b/kernel/drivers/idle/Kconfig @@ -0,0 +1,27 @@ +config INTEL_IDLE + bool "Cpuidle Driver for Intel Processors" + depends on CPU_IDLE + depends on X86 + depends on CPU_SUP_INTEL + help + Enable intel_idle, a cpuidle driver that includes knowledge of + native Intel hardware idle features. The acpi_idle driver + can be configured at the same time, in order to handle + processors intel_idle does not support. 
+ +menu "Memory power savings" +depends on X86_64 + +config I7300_IDLE_IOAT_CHANNEL + bool + +config I7300_IDLE + tristate "Intel chipset idle memory power saving driver" + select I7300_IDLE_IOAT_CHANNEL + help + Enable memory power savings when idle with certain Intel server + chipsets. The chipset must have I/O AT support, such as the + Intel 7300. The power savings depends on the type and quantity of + DRAM devices. + +endmenu diff --git a/kernel/drivers/idle/Makefile b/kernel/drivers/idle/Makefile new file mode 100644 index 000000000..23d295cf1 --- /dev/null +++ b/kernel/drivers/idle/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_I7300_IDLE) += i7300_idle.o +obj-$(CONFIG_INTEL_IDLE) += intel_idle.o + diff --git a/kernel/drivers/idle/i7300_idle.c b/kernel/drivers/idle/i7300_idle.c new file mode 100644 index 000000000..ffeebc7e9 --- /dev/null +++ b/kernel/drivers/idle/i7300_idle.c @@ -0,0 +1,612 @@ +/* + * (C) Copyright 2008 Intel Corporation + * Authors: + * Andy Henroid + * Venkatesh Pallipadi + */ + +/* + * Save DIMM power on Intel 7300-based platforms when all CPUs/cores + * are idle, using the DIMM thermal throttling capability. + * + * This driver depends on the Intel integrated DMA controller (I/O AT). + * If the driver for I/O AT (drivers/dma/ioatdma*) is also enabled, + * this driver should work cooperatively. + */ + +/* #define DEBUG */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../dma/ioat/hw.h" +#include "../dma/ioat/registers.h" + +#define I7300_IDLE_DRIVER_VERSION "1.55" +#define I7300_PRINT "i7300_idle:" + +#define MAX_STOP_RETRIES 10 + +static int debug; +module_param_named(debug, debug, uint, 0644); +MODULE_PARM_DESC(debug, "Enable debug printks in this driver"); + +static int forceload; +module_param_named(forceload, forceload, uint, 0644); +MODULE_PARM_DESC(debug, "Enable driver testing on unvalidated i5000"); + +#define dprintk(fmt, arg...) 
\ + do { if (debug) printk(KERN_INFO I7300_PRINT fmt, ##arg); } while (0) + +/* + * Value to set THRTLOW to when initiating throttling + * 0 = No throttling + * 1 = Throttle when > 4 activations per eval window (Maximum throttling) + * 2 = Throttle when > 8 activations + * 168 = Throttle when > 672 activations (Minimum throttling) + */ +#define MAX_THROTTLE_LOW_LIMIT 168 +static uint throttle_low_limit = 1; +module_param_named(throttle_low_limit, throttle_low_limit, uint, 0644); +MODULE_PARM_DESC(throttle_low_limit, + "Value for THRTLOWLM activation field " + "(0 = disable throttle, 1 = Max throttle, 168 = Min throttle)"); + +/* + * simple invocation and duration statistics + */ +static unsigned long total_starts; +static unsigned long total_us; + +#ifdef DEBUG +static unsigned long past_skip; +#endif + +static struct pci_dev *fbd_dev; + +static raw_spinlock_t i7300_idle_lock; +static int i7300_idle_active; + +static u8 i7300_idle_thrtctl_saved; +static u8 i7300_idle_thrtlow_saved; +static u32 i7300_idle_mc_saved; + +static cpumask_var_t idle_cpumask; +static ktime_t start_ktime; +static unsigned long avg_idle_us; + +static struct dentry *debugfs_dir; + +/* Begin: I/O AT Helper routines */ + +#define IOAT_CHANBASE(ioat_ctl, chan) (ioat_ctl + 0x80 + 0x80 * chan) +/* Snoop control (disable snoops when coherency is not important) */ +#define IOAT_DESC_SADDR_SNP_CTL (1UL << 1) +#define IOAT_DESC_DADDR_SNP_CTL (1UL << 2) + +static struct pci_dev *ioat_dev; +static struct ioat_dma_descriptor *ioat_desc; /* I/O AT desc & data (1 page) */ +static unsigned long ioat_desc_phys; +static u8 *ioat_iomap; /* I/O AT memory-mapped control regs (aka CB_BAR) */ +static u8 *ioat_chanbase; + +/* Start I/O AT memory copy */ +static int i7300_idle_ioat_start(void) +{ + u32 err; + /* Clear error (due to circular descriptor pointer) */ + err = readl(ioat_chanbase + IOAT_CHANERR_OFFSET); + if (err) + writel(err, ioat_chanbase + IOAT_CHANERR_OFFSET); + + writeb(IOAT_CHANCMD_START, 
ioat_chanbase + IOAT1_CHANCMD_OFFSET); + return 0; +} + +/* Stop I/O AT memory copy */ +static void i7300_idle_ioat_stop(void) +{ + int i; + u64 sts; + + for (i = 0; i < MAX_STOP_RETRIES; i++) { + writeb(IOAT_CHANCMD_RESET, + ioat_chanbase + IOAT1_CHANCMD_OFFSET); + + udelay(10); + + sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_STATUS; + + if (sts != IOAT_CHANSTS_ACTIVE) + break; + + } + + if (i == MAX_STOP_RETRIES) { + dprintk("failed to stop I/O AT after %d retries\n", + MAX_STOP_RETRIES); + } +} + +/* Test I/O AT by copying 1024 byte from 2k to 1k */ +static int __init i7300_idle_ioat_selftest(u8 *ctl, + struct ioat_dma_descriptor *desc, unsigned long desc_phys) +{ + u64 chan_sts; + + memset(desc, 0, 2048); + memset((u8 *) desc + 2048, 0xab, 1024); + + desc[0].size = 1024; + desc[0].ctl = 0; + desc[0].src_addr = desc_phys + 2048; + desc[0].dst_addr = desc_phys + 1024; + desc[0].next = 0; + + writeb(IOAT_CHANCMD_RESET, ioat_chanbase + IOAT1_CHANCMD_OFFSET); + writeb(IOAT_CHANCMD_START, ioat_chanbase + IOAT1_CHANCMD_OFFSET); + + udelay(1000); + + chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_STATUS; + + if (chan_sts != IOAT_CHANSTS_DONE) { + /* Not complete, reset the channel */ + writeb(IOAT_CHANCMD_RESET, + ioat_chanbase + IOAT1_CHANCMD_OFFSET); + return -1; + } + + if (*(u32 *) ((u8 *) desc + 3068) != 0xabababab || + *(u32 *) ((u8 *) desc + 2044) != 0xabababab) { + dprintk("Data values src 0x%x, dest 0x%x, memset 0x%x\n", + *(u32 *) ((u8 *) desc + 2048), + *(u32 *) ((u8 *) desc + 1024), + *(u32 *) ((u8 *) desc + 3072)); + return -1; + } + return 0; +} + +static struct device dummy_dma_dev = { + .init_name = "fallback device", + .coherent_dma_mask = DMA_BIT_MASK(64), + .dma_mask = &dummy_dma_dev.coherent_dma_mask, +}; + +/* Setup and initialize I/O AT */ +/* This driver needs I/O AT as the throttling takes effect only when there is + * some memory activity. 
We use I/O AT to set up a dummy copy, while all CPUs + * go idle and memory is throttled. + */ +static int __init i7300_idle_ioat_init(void) +{ + u8 ver, chan_count, ioat_chan; + u16 chan_ctl; + + ioat_iomap = (u8 *) ioremap_nocache(pci_resource_start(ioat_dev, 0), + pci_resource_len(ioat_dev, 0)); + + if (!ioat_iomap) { + printk(KERN_ERR I7300_PRINT "failed to map I/O AT registers\n"); + goto err_ret; + } + + ver = readb(ioat_iomap + IOAT_VER_OFFSET); + if (ver != IOAT_VER_1_2) { + printk(KERN_ERR I7300_PRINT "unknown I/O AT version (%u.%u)\n", + ver >> 4, ver & 0xf); + goto err_unmap; + } + + chan_count = readb(ioat_iomap + IOAT_CHANCNT_OFFSET); + if (!chan_count) { + printk(KERN_ERR I7300_PRINT "unexpected # of I/O AT channels " + "(%u)\n", + chan_count); + goto err_unmap; + } + + ioat_chan = chan_count - 1; + ioat_chanbase = IOAT_CHANBASE(ioat_iomap, ioat_chan); + + chan_ctl = readw(ioat_chanbase + IOAT_CHANCTRL_OFFSET); + if (chan_ctl & IOAT_CHANCTRL_CHANNEL_IN_USE) { + printk(KERN_ERR I7300_PRINT "channel %d in use\n", ioat_chan); + goto err_unmap; + } + + writew(IOAT_CHANCTRL_CHANNEL_IN_USE, + ioat_chanbase + IOAT_CHANCTRL_OFFSET); + + ioat_desc = (struct ioat_dma_descriptor *)dma_alloc_coherent( + &dummy_dma_dev, 4096, + (dma_addr_t *)&ioat_desc_phys, GFP_KERNEL); + if (!ioat_desc) { + printk(KERN_ERR I7300_PRINT "failed to allocate I/O AT desc\n"); + goto err_mark_unused; + } + + writel(ioat_desc_phys & 0xffffffffUL, + ioat_chanbase + IOAT1_CHAINADDR_OFFSET_LOW); + writel(ioat_desc_phys >> 32, + ioat_chanbase + IOAT1_CHAINADDR_OFFSET_HIGH); + + if (i7300_idle_ioat_selftest(ioat_iomap, ioat_desc, ioat_desc_phys)) { + printk(KERN_ERR I7300_PRINT "I/O AT self-test failed\n"); + goto err_free; + } + + /* Setup circular I/O AT descriptor chain */ + ioat_desc[0].ctl = IOAT_DESC_SADDR_SNP_CTL | IOAT_DESC_DADDR_SNP_CTL; + ioat_desc[0].src_addr = ioat_desc_phys + 2048; + ioat_desc[0].dst_addr = ioat_desc_phys + 3072; + ioat_desc[0].size = 128; + ioat_desc[0].next = 
ioat_desc_phys + sizeof(struct ioat_dma_descriptor); + + ioat_desc[1].ctl = ioat_desc[0].ctl; + ioat_desc[1].src_addr = ioat_desc[0].src_addr; + ioat_desc[1].dst_addr = ioat_desc[0].dst_addr; + ioat_desc[1].size = ioat_desc[0].size; + ioat_desc[1].next = ioat_desc_phys; + + return 0; + +err_free: + dma_free_coherent(&dummy_dma_dev, 4096, (void *)ioat_desc, 0); +err_mark_unused: + writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET); +err_unmap: + iounmap(ioat_iomap); +err_ret: + return -ENODEV; +} + +/* Cleanup I/O AT */ +static void __exit i7300_idle_ioat_exit(void) +{ + int i; + u64 chan_sts; + + i7300_idle_ioat_stop(); + + /* Wait for a while for the channel to halt before releasing */ + for (i = 0; i < MAX_STOP_RETRIES; i++) { + writeb(IOAT_CHANCMD_RESET, + ioat_chanbase + IOAT1_CHANCMD_OFFSET); + + chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_STATUS; + + if (chan_sts != IOAT_CHANSTS_ACTIVE) { + writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET); + break; + } + udelay(1000); + } + + chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_STATUS; + + /* + * We tried to reset multiple times. If IO A/T channel is still active + * flag an error and return without cleanup. Memory leak is better + * than random corruption in that extreme error situation. + */ + if (chan_sts == IOAT_CHANSTS_ACTIVE) { + printk(KERN_ERR I7300_PRINT "Unable to stop IO A/T channels." + " Not freeing resources\n"); + return; + } + + dma_free_coherent(&dummy_dma_dev, 4096, (void *)ioat_desc, 0); + iounmap(ioat_iomap); +} + +/* End: I/O AT Helper routines */ + +#define DIMM_THRTLOW 0x64 +#define DIMM_THRTCTL 0x67 +#define DIMM_THRTCTL_THRMHUNT (1UL << 0) +#define DIMM_MC 0x40 +#define DIMM_GTW_MODE (1UL << 17) +#define DIMM_GBLACT 0x60 + +/* + * Keep track of an exponential-decaying average of recent idle durations. + * The latest duration gets DURATION_WEIGHT_PCT percentage weight + * in this average, with the old average getting the remaining weight. 
+ * + * High weights emphasize recent history, low weights include long history. + */ +#define DURATION_WEIGHT_PCT 55 + +/* + * When the decaying average of recent durations or the predicted duration + * of the next timer interrupt is shorter than duration_threshold, the + * driver will decline to throttle. + */ +#define DURATION_THRESHOLD_US 100 + + +/* Store DIMM thermal throttle configuration */ +static int i7300_idle_thrt_save(void) +{ + u32 new_mc_val; + u8 gblactlm; + + pci_read_config_byte(fbd_dev, DIMM_THRTCTL, &i7300_idle_thrtctl_saved); + pci_read_config_byte(fbd_dev, DIMM_THRTLOW, &i7300_idle_thrtlow_saved); + pci_read_config_dword(fbd_dev, DIMM_MC, &i7300_idle_mc_saved); + /* + * Make sure we have Global Throttling Window Mode set to have a + * "short" window. This (mostly) works around an issue where + * throttling persists until the end of the global throttling window + * size. On the tested system, this was resulting in a maximum of + * 64 ms to exit throttling (average 32 ms). The actual numbers + * depends on system frequencies. Setting the short window reduces + * this by a factor of 4096. + * + * We will only do this only if the system is set for + * unlimited-activations while in open-loop throttling (i.e., when + * Global Activation Throttle Limit is zero). 
+ */ + pci_read_config_byte(fbd_dev, DIMM_GBLACT, &gblactlm); + dprintk("thrtctl_saved = 0x%02x, thrtlow_saved = 0x%02x\n", + i7300_idle_thrtctl_saved, + i7300_idle_thrtlow_saved); + dprintk("mc_saved = 0x%08x, gblactlm = 0x%02x\n", + i7300_idle_mc_saved, + gblactlm); + if (gblactlm == 0) { + new_mc_val = i7300_idle_mc_saved | DIMM_GTW_MODE; + pci_write_config_dword(fbd_dev, DIMM_MC, new_mc_val); + return 0; + } else { + dprintk("could not set GTW_MODE = 1 (OLTT enabled)\n"); + return -ENODEV; + } +} + +/* Restore DIMM thermal throttle configuration */ +static void i7300_idle_thrt_restore(void) +{ + pci_write_config_dword(fbd_dev, DIMM_MC, i7300_idle_mc_saved); + pci_write_config_byte(fbd_dev, DIMM_THRTLOW, i7300_idle_thrtlow_saved); + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, i7300_idle_thrtctl_saved); +} + +/* Enable DIMM thermal throttling */ +static void i7300_idle_start(void) +{ + u8 new_ctl; + u8 limit; + + new_ctl = i7300_idle_thrtctl_saved & ~DIMM_THRTCTL_THRMHUNT; + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl); + + limit = throttle_low_limit; + if (unlikely(limit > MAX_THROTTLE_LOW_LIMIT)) + limit = MAX_THROTTLE_LOW_LIMIT; + + pci_write_config_byte(fbd_dev, DIMM_THRTLOW, limit); + + new_ctl = i7300_idle_thrtctl_saved | DIMM_THRTCTL_THRMHUNT; + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl); +} + +/* Disable DIMM thermal throttling */ +static void i7300_idle_stop(void) +{ + u8 new_ctl; + u8 got_ctl; + + new_ctl = i7300_idle_thrtctl_saved & ~DIMM_THRTCTL_THRMHUNT; + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl); + + pci_write_config_byte(fbd_dev, DIMM_THRTLOW, i7300_idle_thrtlow_saved); + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, i7300_idle_thrtctl_saved); + pci_read_config_byte(fbd_dev, DIMM_THRTCTL, &got_ctl); + WARN_ON_ONCE(got_ctl != i7300_idle_thrtctl_saved); +} + + +/* + * i7300_avg_duration_check() + * return 0 if the decaying average of recent idle durations is + * more than DURATION_THRESHOLD_US + */ +static int 
i7300_avg_duration_check(void) +{ + if (avg_idle_us >= DURATION_THRESHOLD_US) + return 0; + +#ifdef DEBUG + past_skip++; +#endif + return 1; +} + +/* Idle notifier to look at idle CPUs */ +static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + unsigned long flags; + ktime_t now_ktime; + static ktime_t idle_begin_time; + static int time_init = 1; + + if (!throttle_low_limit) + return 0; + + if (unlikely(time_init)) { + time_init = 0; + idle_begin_time = ktime_get(); + } + + raw_spin_lock_irqsave(&i7300_idle_lock, flags); + if (val == IDLE_START) { + + cpumask_set_cpu(smp_processor_id(), idle_cpumask); + + if (cpumask_weight(idle_cpumask) != num_online_cpus()) + goto end; + + now_ktime = ktime_get(); + idle_begin_time = now_ktime; + + if (i7300_avg_duration_check()) + goto end; + + i7300_idle_active = 1; + total_starts++; + start_ktime = now_ktime; + + i7300_idle_start(); + i7300_idle_ioat_start(); + + } else if (val == IDLE_END) { + cpumask_clear_cpu(smp_processor_id(), idle_cpumask); + if (cpumask_weight(idle_cpumask) == (num_online_cpus() - 1)) { + /* First CPU coming out of idle */ + u64 idle_duration_us; + + now_ktime = ktime_get(); + + idle_duration_us = ktime_to_us(ktime_sub + (now_ktime, idle_begin_time)); + + avg_idle_us = + ((100 - DURATION_WEIGHT_PCT) * avg_idle_us + + DURATION_WEIGHT_PCT * idle_duration_us) / 100; + + if (i7300_idle_active) { + ktime_t idle_ktime; + + idle_ktime = ktime_sub(now_ktime, start_ktime); + total_us += ktime_to_us(idle_ktime); + + i7300_idle_ioat_stop(); + i7300_idle_stop(); + i7300_idle_active = 0; + } + } + } +end: + raw_spin_unlock_irqrestore(&i7300_idle_lock, flags); + return 0; +} + +static struct notifier_block i7300_idle_nb = { + .notifier_call = i7300_idle_notifier, +}; + +MODULE_DEVICE_TABLE(pci, pci_tbl); + +static ssize_t stats_read_ul(struct file *fp, char __user *ubuf, size_t count, + loff_t *off) +{ + unsigned long *p = fp->private_data; + char buf[32]; + int len; + + len = 
snprintf(buf, 32, "%lu\n", *p); + return simple_read_from_buffer(ubuf, count, off, buf, len); +} + +static const struct file_operations idle_fops = { + .open = simple_open, + .read = stats_read_ul, + .llseek = default_llseek, +}; + +struct debugfs_file_info { + void *ptr; + char name[32]; + struct dentry *file; +} debugfs_file_list[] = { + {&total_starts, "total_starts", NULL}, + {&total_us, "total_us", NULL}, +#ifdef DEBUG + {&past_skip, "past_skip", NULL}, +#endif + {NULL, "", NULL} + }; + +static int __init i7300_idle_init(void) +{ + raw_spin_lock_init(&i7300_idle_lock); + total_us = 0; + + if (i7300_idle_platform_probe(&fbd_dev, &ioat_dev, forceload)) + return -ENODEV; + + if (i7300_idle_thrt_save()) + return -ENODEV; + + if (i7300_idle_ioat_init()) + return -ENODEV; + + if (!zalloc_cpumask_var(&idle_cpumask, GFP_KERNEL)) + return -ENOMEM; + + debugfs_dir = debugfs_create_dir("i7300_idle", NULL); + if (debugfs_dir) { + int i = 0; + + while (debugfs_file_list[i].ptr != NULL) { + debugfs_file_list[i].file = debugfs_create_file( + debugfs_file_list[i].name, + S_IRUSR, + debugfs_dir, + debugfs_file_list[i].ptr, + &idle_fops); + i++; + } + } + + idle_notifier_register(&i7300_idle_nb); + + printk(KERN_INFO "i7300_idle: loaded v%s\n", I7300_IDLE_DRIVER_VERSION); + return 0; +} + +static void __exit i7300_idle_exit(void) +{ + idle_notifier_unregister(&i7300_idle_nb); + free_cpumask_var(idle_cpumask); + + if (debugfs_dir) { + int i = 0; + + while (debugfs_file_list[i].file != NULL) { + debugfs_remove(debugfs_file_list[i].file); + i++; + } + + debugfs_remove(debugfs_dir); + } + i7300_idle_thrt_restore(); + i7300_idle_ioat_exit(); +} + +module_init(i7300_idle_init); +module_exit(i7300_idle_exit); + +MODULE_AUTHOR("Andy Henroid "); +MODULE_DESCRIPTION("Intel Chipset DIMM Idle Power Saving Driver v" + I7300_IDLE_DRIVER_VERSION); +MODULE_LICENSE("GPL"); diff --git a/kernel/drivers/idle/intel_idle.c b/kernel/drivers/idle/intel_idle.c new file mode 100644 index 
000000000..2a36a95d9 --- /dev/null +++ b/kernel/drivers/idle/intel_idle.c @@ -0,0 +1,1107 @@ +/* + * intel_idle.c - native hardware idle loop for modern Intel processors + * + * Copyright (c) 2013, Intel Corporation. + * Len Brown + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * intel_idle is a cpuidle driver that loads on specific Intel processors + * in lieu of the legacy ACPI processor_idle driver. The intent is to + * make Linux more efficient on these processors, as intel_idle knows + * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs. + */ + +/* + * Design Assumptions + * + * All CPUs have same idle states as boot CPU + * + * Chipset BM_STS (bus master status) bit is a NOP + * for preventing entry into deep C-states + */ + +/* + * Known limitations + * + * The driver currently initializes for_each_online_cpu() upon modprobe. + * It is unaware of subsequent processors hot-added to the system. + * This means that if you boot with maxcpus=n and later online + * processors above n, those processors will use C1 only. + * + * ACPI has a .suspend hack to turn off deep c-states during suspend + * to avoid complications with the lapic timer workaround. + * Have not seen issues with suspend, but may need same workaround here. 
+ * + * There is currently no kernel-based automatic probing/loading mechanism + * if the driver is built as a module. + */ + +/* un-comment DEBUG to enable pr_debug() statements */ +#define DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define INTEL_IDLE_VERSION "0.4" +#define PREFIX "intel_idle: " + +static struct cpuidle_driver intel_idle_driver = { + .name = "intel_idle", + .owner = THIS_MODULE, +}; +/* intel_idle.max_cstate=0 disables driver */ +static int max_cstate = CPUIDLE_STATE_MAX - 1; + +static unsigned int mwait_substates; + +#define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF +/* Reliable LAPIC Timer States, bit 1 for C1 etc. */ +static unsigned int lapic_timer_reliable_states = (1 << 1); /* Default to only C1 */ + +struct idle_cpu { + struct cpuidle_state *state_table; + + /* + * Hardware C-state auto-demotion may not always be optimal. + * Indicate which enable bits to clear here. + */ + unsigned long auto_demotion_disable_flags; + bool byt_auto_demotion_disable_flag; + bool disable_promotion_to_c1e; +}; + +static const struct idle_cpu *icpu; +static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; +static int intel_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index); +static void intel_idle_freeze(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index); +static int intel_idle_cpu_init(int cpu); + +static struct cpuidle_state *cpuidle_state_table; + +/* + * Set this flag for states where the HW flushes the TLB for us + * and so we don't need cross-calls to keep it consistent. + * If this flag is set, SW flushes the TLB, so even if the + * HW doesn't do the flushing, this flag is safe to use. + */ +#define CPUIDLE_FLAG_TLB_FLUSHED 0x10000 + +/* + * MWAIT takes an 8-bit "hint" in EAX "suggesting" + * the C-state (top nibble) and sub-state (bottom nibble) + * 0x00 means "MWAIT(C1)", 0x10 means "MWAIT(C2)" etc. 
+ * + * We store the hint at the top of our "flags" for each state. + */ +#define flg2MWAIT(flags) (((flags) >> 24) & 0xFF) +#define MWAIT2flg(eax) ((eax & 0xFF) << 24) + +/* + * States are indexed by the cstate number, + * which is also the index into the MWAIT hint array. + * Thus C0 is a dummy. + */ +static struct cpuidle_state nehalem_cstates[] = { + { + .name = "C1-NHM", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 3, + .target_residency = 6, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C1E-NHM", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C3-NHM", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 20, + .target_residency = 80, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-NHM", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 200, + .target_residency = 800, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + +static struct cpuidle_state snb_cstates[] = { + { + .name = "C1-SNB", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C1E-SNB", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C3-SNB", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 80, + .target_residency = 211, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-SNB", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 104, + .target_residency = 345, + .enter = 
&intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7-SNB", + .desc = "MWAIT 0x30", + .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 109, + .target_residency = 345, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + +static struct cpuidle_state byt_cstates[] = { + { + .name = "C1-BYT", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6N-BYT", + .desc = "MWAIT 0x58", + .flags = MWAIT2flg(0x58) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, + .target_residency = 275, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6S-BYT", + .desc = "MWAIT 0x52", + .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 500, + .target_residency = 560, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7-BYT", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 1200, + .target_residency = 4000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7S-BYT", + .desc = "MWAIT 0x64", + .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 10000, + .target_residency = 20000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + +static struct cpuidle_state cht_cstates[] = { + { + .name = "C1-CHT", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6N-CHT", + .desc = "MWAIT 0x58", + .flags = MWAIT2flg(0x58) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 80, + .target_residency = 275, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6S-CHT", + .desc = "MWAIT 0x52", + .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED, + 
.exit_latency = 200, + .target_residency = 560, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7-CHT", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 1200, + .target_residency = 4000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7S-CHT", + .desc = "MWAIT 0x64", + .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 10000, + .target_residency = 20000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + +static struct cpuidle_state ivb_cstates[] = { + { + .name = "C1-IVB", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C1E-IVB", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C3-IVB", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 59, + .target_residency = 156, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-IVB", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 80, + .target_residency = 300, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7-IVB", + .desc = "MWAIT 0x30", + .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 87, + .target_residency = 300, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + +static struct cpuidle_state ivt_cstates[] = { + { + .name = "C1-IVT", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C1E-IVT", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + 
.exit_latency = 10, + .target_residency = 80, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C3-IVT", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 59, + .target_residency = 156, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-IVT", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 82, + .target_residency = 300, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + +static struct cpuidle_state ivt_cstates_4s[] = { + { + .name = "C1-IVT-4S", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C1E-IVT-4S", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 250, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C3-IVT-4S", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 59, + .target_residency = 300, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-IVT-4S", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 84, + .target_residency = 400, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + +static struct cpuidle_state ivt_cstates_8s[] = { + { + .name = "C1-IVT-8S", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 1, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C1E-IVT-8S", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 500, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C3-IVT-8S", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | 
CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 59, + .target_residency = 600, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-IVT-8S", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 88, + .target_residency = 700, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + +static struct cpuidle_state hsw_cstates[] = { + { + .name = "C1-HSW", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C1E-HSW", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C3-HSW", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, + .target_residency = 100, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-HSW", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, + .target_residency = 400, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7s-HSW", + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, + .target_residency = 500, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C8-HSW", + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, + .target_residency = 900, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C9-HSW", + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, + .target_residency = 1800, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C10-HSW", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + 
.exit_latency = 2600, + .target_residency = 7700, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; +static struct cpuidle_state bdw_cstates[] = { + { + .name = "C1-BDW", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C1E-BDW", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C3-BDW", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 40, + .target_residency = 100, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-BDW", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, + .target_residency = 400, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7s-BDW", + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, + .target_residency = 500, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C8-BDW", + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, + .target_residency = 900, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C9-BDW", + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, + .target_residency = 1800, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C10-BDW", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, + .target_residency = 7700, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + +static struct cpuidle_state atom_cstates[] = { + { + .name = "C1E-ATM", + .desc = "MWAIT 0x00", + 
.flags = MWAIT2flg(0x00), + .exit_latency = 10, + .target_residency = 20, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C2-ATM", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10), + .exit_latency = 20, + .target_residency = 80, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C4-ATM", + .desc = "MWAIT 0x30", + .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 100, + .target_residency = 400, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-ATM", + .desc = "MWAIT 0x52", + .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 140, + .target_residency = 560, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; +static struct cpuidle_state avn_cstates[] = { + { + .name = "C1-AVN", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-AVN", + .desc = "MWAIT 0x51", + .flags = MWAIT2flg(0x51) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 15, + .target_residency = 45, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; + +/** + * intel_idle + * @dev: cpuidle_device + * @drv: cpuidle driver + * @index: index of cpuidle state + * + * Must be called under local_irq_disable(). + */ +static int intel_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + unsigned long ecx = 1; /* break on interrupt flag */ + struct cpuidle_state *state = &drv->states[index]; + unsigned long eax = flg2MWAIT(state->flags); + unsigned int cstate; + int cpu = smp_processor_id(); + + cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; + + /* + * leave_mm() to avoid costly and often unnecessary wakeups + * for flushing the user TLB's associated with the active mm. 
+ */ + if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) + leave_mm(cpu); + + if (!(lapic_timer_reliable_states & (1 << (cstate)))) + tick_broadcast_enter(); + + mwait_idle_with_hints(eax, ecx); + + if (!(lapic_timer_reliable_states & (1 << (cstate)))) + tick_broadcast_exit(); + + return index; +} + +/** + * intel_idle_freeze - simplified "enter" callback routine for suspend-to-idle + * @dev: cpuidle_device + * @drv: cpuidle driver + * @index: state index + */ +static void intel_idle_freeze(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + unsigned long ecx = 1; /* break on interrupt flag */ + unsigned long eax = flg2MWAIT(drv->states[index].flags); + + mwait_idle_with_hints(eax, ecx); +} + +static void __setup_broadcast_timer(void *arg) +{ + unsigned long on = (unsigned long)arg; + + if (on) + tick_broadcast_enable(); + else + tick_broadcast_disable(); +} + +static int cpu_hotplug_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct cpuidle_device *dev; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + + if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) + smp_call_function_single(hotcpu, __setup_broadcast_timer, + (void *)true, 1); + + /* + * Some systems can hotplug a cpu at runtime after + * the kernel has booted, we have to initialize the + * driver in this case + */ + dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu); + if (!dev->registered) + intel_idle_cpu_init(hotcpu); + + break; + } + return NOTIFY_OK; +} + +static struct notifier_block cpu_hotplug_notifier = { + .notifier_call = cpu_hotplug_notify, +}; + +static void auto_demotion_disable(void *dummy) +{ + unsigned long long msr_bits; + + rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); + msr_bits &= ~(icpu->auto_demotion_disable_flags); + wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); +} +static void c1e_promotion_disable(void *dummy) +{ + unsigned long long msr_bits; + + 
rdmsrl(MSR_IA32_POWER_CTL, msr_bits); + msr_bits &= ~0x2; + wrmsrl(MSR_IA32_POWER_CTL, msr_bits); +} + +static const struct idle_cpu idle_cpu_nehalem = { + .state_table = nehalem_cstates, + .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, + .disable_promotion_to_c1e = true, +}; + +static const struct idle_cpu idle_cpu_atom = { + .state_table = atom_cstates, +}; + +static const struct idle_cpu idle_cpu_lincroft = { + .state_table = atom_cstates, + .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE, +}; + +static const struct idle_cpu idle_cpu_snb = { + .state_table = snb_cstates, + .disable_promotion_to_c1e = true, +}; + +static const struct idle_cpu idle_cpu_byt = { + .state_table = byt_cstates, + .disable_promotion_to_c1e = true, + .byt_auto_demotion_disable_flag = true, +}; + +static const struct idle_cpu idle_cpu_cht = { + .state_table = cht_cstates, + .disable_promotion_to_c1e = true, + .byt_auto_demotion_disable_flag = true, +}; + +static const struct idle_cpu idle_cpu_ivb = { + .state_table = ivb_cstates, + .disable_promotion_to_c1e = true, +}; + +static const struct idle_cpu idle_cpu_ivt = { + .state_table = ivt_cstates, + .disable_promotion_to_c1e = true, +}; + +static const struct idle_cpu idle_cpu_hsw = { + .state_table = hsw_cstates, + .disable_promotion_to_c1e = true, +}; + +static const struct idle_cpu idle_cpu_bdw = { + .state_table = bdw_cstates, + .disable_promotion_to_c1e = true, +}; + +static const struct idle_cpu idle_cpu_avn = { + .state_table = avn_cstates, + .disable_promotion_to_c1e = true, +}; + +#define ICPU(model, cpu) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu } + +static const struct x86_cpu_id intel_idle_ids[] __initconst = { + ICPU(0x1a, idle_cpu_nehalem), + ICPU(0x1e, idle_cpu_nehalem), + ICPU(0x1f, idle_cpu_nehalem), + ICPU(0x25, idle_cpu_nehalem), + ICPU(0x2c, idle_cpu_nehalem), + ICPU(0x2e, idle_cpu_nehalem), + ICPU(0x1c, idle_cpu_atom), + ICPU(0x26, idle_cpu_lincroft), + 
ICPU(0x2f, idle_cpu_nehalem), + ICPU(0x2a, idle_cpu_snb), + ICPU(0x2d, idle_cpu_snb), + ICPU(0x36, idle_cpu_atom), + ICPU(0x37, idle_cpu_byt), + ICPU(0x4c, idle_cpu_cht), + ICPU(0x3a, idle_cpu_ivb), + ICPU(0x3e, idle_cpu_ivt), + ICPU(0x3c, idle_cpu_hsw), + ICPU(0x3f, idle_cpu_hsw), + ICPU(0x45, idle_cpu_hsw), + ICPU(0x46, idle_cpu_hsw), + ICPU(0x4d, idle_cpu_avn), + ICPU(0x3d, idle_cpu_bdw), + ICPU(0x47, idle_cpu_bdw), + ICPU(0x4f, idle_cpu_bdw), + ICPU(0x56, idle_cpu_bdw), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids); + +/* + * intel_idle_probe() + */ +static int __init intel_idle_probe(void) +{ + unsigned int eax, ebx, ecx; + const struct x86_cpu_id *id; + + if (max_cstate == 0) { + pr_debug(PREFIX "disabled\n"); + return -EPERM; + } + + id = x86_match_cpu(intel_idle_ids); + if (!id) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6) + pr_debug(PREFIX "does not run on family %d model %d\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + return -ENODEV; + } + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return -ENODEV; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || + !(ecx & CPUID5_ECX_INTERRUPT_BREAK) || + !mwait_substates) + return -ENODEV; + + pr_debug(PREFIX "MWAIT substates: 0x%x\n", mwait_substates); + + icpu = (const struct idle_cpu *)id->driver_data; + cpuidle_state_table = icpu->state_table; + + if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ + lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; + else + on_each_cpu(__setup_broadcast_timer, (void *)true, 1); + + pr_debug(PREFIX "v" INTEL_IDLE_VERSION + " model 0x%X\n", boot_cpu_data.x86_model); + + pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", + lapic_timer_reliable_states); + return 0; +} + +/* + * intel_idle_cpuidle_devices_uninit() + * unregister, free cpuidle_devices + */ +static void intel_idle_cpuidle_devices_uninit(void) +{ + int i; + struct 
cpuidle_device *dev; + + for_each_online_cpu(i) { + dev = per_cpu_ptr(intel_idle_cpuidle_devices, i); + cpuidle_unregister_device(dev); + } + + free_percpu(intel_idle_cpuidle_devices); + return; +} + +/* + * intel_idle_state_table_update() + * + * Update the default state_table for this CPU-id + * + * Currently used to access tuned IVT multi-socket targets + * Assumption: num_sockets == (max_package_num + 1) + */ +void intel_idle_state_table_update(void) +{ + /* IVT uses a different table for 1-2, 3-4, and > 4 sockets */ + if (boot_cpu_data.x86_model == 0x3e) { /* IVT */ + int cpu, package_num, num_sockets = 1; + + for_each_online_cpu(cpu) { + package_num = topology_physical_package_id(cpu); + if (package_num + 1 > num_sockets) { + num_sockets = package_num + 1; + + if (num_sockets > 4) { + cpuidle_state_table = ivt_cstates_8s; + return; + } + } + } + + if (num_sockets > 2) + cpuidle_state_table = ivt_cstates_4s; + /* else, 1 and 2 socket systems use default ivt_cstates */ + } + return; +} + +/* + * intel_idle_cpuidle_driver_init() + * allocate, initialize cpuidle_states + */ +static int __init intel_idle_cpuidle_driver_init(void) +{ + int cstate; + struct cpuidle_driver *drv = &intel_idle_driver; + + intel_idle_state_table_update(); + + drv->state_count = 1; + + for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { + int num_substates, mwait_hint, mwait_cstate; + + if (cpuidle_state_table[cstate].enter == NULL) + break; + + if (cstate + 1 > max_cstate) { + printk(PREFIX "max_cstate %d reached\n", + max_cstate); + break; + } + + mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags); + mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint); + + /* number of sub-states for this state in CPUID.MWAIT */ + num_substates = (mwait_substates >> ((mwait_cstate + 1) * 4)) + & MWAIT_SUBSTATE_MASK; + + /* if NO sub-states for this state in CPUID, skip it */ + if (num_substates == 0) + continue; + + if (((mwait_cstate + 1) > 2) && + !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + 
mark_tsc_unstable("TSC halts in idle" + " states deeper than C2"); + + drv->states[drv->state_count] = /* structure copy */ + cpuidle_state_table[cstate]; + + drv->state_count += 1; + } + + if (icpu->auto_demotion_disable_flags) + on_each_cpu(auto_demotion_disable, NULL, 1); + + if (icpu->byt_auto_demotion_disable_flag) { + wrmsrl(MSR_CC6_DEMOTION_POLICY_CONFIG, 0); + wrmsrl(MSR_MC6_DEMOTION_POLICY_CONFIG, 0); + } + + if (icpu->disable_promotion_to_c1e) /* each-cpu is redundant */ + on_each_cpu(c1e_promotion_disable, NULL, 1); + + return 0; +} + + +/* + * intel_idle_cpu_init() + * allocate, initialize, register cpuidle_devices + * @cpu: cpu/core to initialize + */ +static int intel_idle_cpu_init(int cpu) +{ + struct cpuidle_device *dev; + + dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); + + dev->cpu = cpu; + + if (cpuidle_register_device(dev)) { + pr_debug(PREFIX "cpuidle_register_device %d failed!\n", cpu); + intel_idle_cpuidle_devices_uninit(); + return -EIO; + } + + if (icpu->auto_demotion_disable_flags) + smp_call_function_single(cpu, auto_demotion_disable, NULL, 1); + + if (icpu->disable_promotion_to_c1e) + smp_call_function_single(cpu, c1e_promotion_disable, NULL, 1); + + return 0; +} + +static int __init intel_idle_init(void) +{ + int retval, i; + + /* Do not load intel_idle at all for now if idle= is passed */ + if (boot_option_idle_override != IDLE_NO_OVERRIDE) + return -ENODEV; + + retval = intel_idle_probe(); + if (retval) + return retval; + + intel_idle_cpuidle_driver_init(); + retval = cpuidle_register_driver(&intel_idle_driver); + if (retval) { + struct cpuidle_driver *drv = cpuidle_get_driver(); + printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", + drv ? 
drv->name : "none"); + return retval; + } + + intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device); + if (intel_idle_cpuidle_devices == NULL) + return -ENOMEM; + + cpu_notifier_register_begin(); + + for_each_online_cpu(i) { + retval = intel_idle_cpu_init(i); + if (retval) { + cpu_notifier_register_done(); + cpuidle_unregister_driver(&intel_idle_driver); + return retval; + } + } + __register_cpu_notifier(&cpu_hotplug_notifier); + + cpu_notifier_register_done(); + + return 0; +} + +static void __exit intel_idle_exit(void) +{ + intel_idle_cpuidle_devices_uninit(); + cpuidle_unregister_driver(&intel_idle_driver); + + cpu_notifier_register_begin(); + + if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) + on_each_cpu(__setup_broadcast_timer, (void *)false, 1); + __unregister_cpu_notifier(&cpu_hotplug_notifier); + + cpu_notifier_register_done(); + + return; +} + +module_init(intel_idle_init); +module_exit(intel_idle_exit); + +module_param(max_cstate, int, 0444); + +MODULE_AUTHOR("Len Brown "); +MODULE_DESCRIPTION("Cpuidle driver for Intel Hardware v" INTEL_IDLE_VERSION); +MODULE_LICENSE("GPL"); -- cgit 1.2.3-korg