author    | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-11 10:41:07 +0300
----------|---------------------------------------------|--------------------------
committer | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-13 08:17:18 +0300
commit    | e09b41010ba33a20a87472ee821fa407a5b8da36 (patch) |
tree      | d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/arch/x86 |
parent    | f93b97fd65072de626c074dbe099a1fff05ce060 (diff) |
These changes are the raw update to linux-4.4.6-rt14. The kernel sources
are taken from kernel.org, and the RT patch from the RT wiki download page.

During the rebase, the following patch collided:
Force tick interrupt and get rid of softirq magic (I70131fb85).
Its conflicting hunks were dropped because the patch's logic is already
present in the updated source.
Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
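As context, here is a minimal sketch of the workflow the message describes: fetch the vanilla 4.4.6 sources and the rt14 patch, apply the patch, and then rebase the project tree on top (which is where the collision noted above was resolved). The URLs and file names are assumptions based on the usual kernel.org layout, not values taken from this commit:

```sh
# Assumed locations; the vanilla tarball and the PREEMPT_RT patch are
# conventionally published under these kernel.org paths.
wget https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.4.6.tar.xz
wget https://www.kernel.org/pub/linux/kernel/projects/rt/4.4/patch-4.4.6-rt14.patch.xz

# Unpack the vanilla kernel tree.
tar -xf linux-4.4.6.tar.xz
cd linux-4.4.6

# Apply the RT patch on top of the vanilla tree; project-specific patches
# are then rebased onto the result.
xzcat ../patch-4.4.6-rt14.patch.xz | patch -p1
```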
Diffstat (limited to 'kernel/arch/x86')
550 files changed, 32238 insertions, 18876 deletions
diff --git a/kernel/arch/x86/Kbuild b/kernel/arch/x86/Kbuild index 3942f74c9..1538562cc 100644 --- a/kernel/arch/x86/Kbuild +++ b/kernel/arch/x86/Kbuild @@ -1,3 +1,6 @@ + +obj-y += entry/ + obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support @@ -11,7 +14,7 @@ obj-y += kernel/ obj-y += mm/ obj-y += crypto/ -obj-y += vdso/ + obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ diff --git a/kernel/arch/x86/Kconfig b/kernel/arch/x86/Kconfig index aac357a4c..d8a1650a0 100644 --- a/kernel/arch/x86/Kconfig +++ b/kernel/arch/x86/Kconfig @@ -9,141 +9,150 @@ config 64BIT config X86_32 def_bool y depends on !64BIT - select CLKSRC_I8253 - select HAVE_UID16 config X86_64 def_bool y depends on 64BIT - select X86_DEV_DMA_OPS - select ARCH_USE_CMPXCHG_LOCKREF - select HAVE_LIVEPATCH ### Arch settings config X86 def_bool y select HAVE_PREEMPT_LAZY - select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI + select ACPI_LEGACY_TABLES_LOOKUP if ACPI + select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI + select ANON_INODES + select ARCH_CLOCKSOURCE_DATA + select ARCH_DISCARD_MEMBLOCK + select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_MMIO_FLUSH + select ARCH_HAS_SG_CHAIN + select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO - select HAVE_AOUT if X86_32 - select HAVE_UNSTABLE_SCHED_CLOCK - select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 - select ARCH_SUPPORTS_INT128 if X86_64 - select HAVE_IDE - select HAVE_OPROFILE - select HAVE_PCSPKR_PLATFORM - select HAVE_PERF_EVENTS - select HAVE_IOREMAP_PROT - select HAVE_KPROBES - select HAVE_MEMBLOCK - select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK - select ARCH_WANT_OPTIONAL_GPIOLIB + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + select ARCH_SUPPORTS_INT128 if X86_64 + select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if X86_64 + select ARCH_USE_QUEUED_RWLOCKS + select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP + select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS - select HAVE_DMA_ATTRS - select HAVE_DMA_CONTIGUOUS - select HAVE_KRETPROBES + select ARCH_WANT_IPC_PARSE_VERSION if X86_32 + select ARCH_WANT_OPTIONAL_GPIOLIB + select BUILDTIME_EXTABLE_SORT + select CLKEVT_I8253 + select CLKSRC_I8253 if X86_32 + select CLOCKSOURCE_VALIDATE_LAST_CYCLE + select CLOCKSOURCE_WATCHDOG + select CLONE_BACKWARDS if X86_32 + select COMPAT_OLD_SIGACTION if IA32_EMULATION + select DCACHE_WORD_ACCESS + select EDAC_ATOMIC_SCRUB + select EDAC_SUPPORT + select GENERIC_CLOCKEVENTS + select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) + select GENERIC_CLOCKEVENTS_MIN_ADJUST + select GENERIC_CMOS_UPDATE + select GENERIC_CPU_AUTOPROBE select GENERIC_EARLY_IOREMAP - select HAVE_OPTPROBES - select HAVE_KPROBES_ON_FTRACE - select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FENTRY if X86_64 + select GENERIC_FIND_FIRST_BIT + select GENERIC_IOMAP + select GENERIC_IRQ_PROBE + select GENERIC_IRQ_SHOW + select GENERIC_PENDING_IRQ if SMP + select GENERIC_SMP_IDLE_THREAD + select GENERIC_STRNCPY_FROM_USER + select GENERIC_STRNLEN_USER + select GENERIC_TIME_VSYSCALL + select HAVE_ACPI_APEI if ACPI + select 
HAVE_ACPI_APEI_NMI if ACPI + select HAVE_ALIGNED_STRUCT_PAGE if SLUB + select HAVE_AOUT if X86_32 + select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE + select HAVE_ARCH_JUMP_LABEL + select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_ARCH_KGDB + select HAVE_ARCH_KMEMCHECK + select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_SOFT_DIRTY if X86_64 + select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_BPF_JIT if X86_64 + select HAVE_CC_STACKPROTECTOR + select HAVE_CMPXCHG_DOUBLE + select HAVE_CMPXCHG_LOCAL + select HAVE_CONTEXT_TRACKING if X86_64 + select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT + select HAVE_DEBUG_KMEMLEAK + select HAVE_DEBUG_STACKOVERFLOW + select HAVE_DMA_API_DEBUG + select HAVE_DMA_ATTRS + select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS - select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_GRAPH_FP_TEST - select HAVE_SYSCALL_TRACEPOINTS - select SYSCTL_EXCEPTION_TRACE - select HAVE_KVM - select HAVE_ARCH_KGDB - select HAVE_ARCH_TRACEHOOK - select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS - select USER_STACKTRACE_SUPPORT - select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_DMA_API_DEBUG - select HAVE_KERNEL_GZIP + select HAVE_FENTRY if X86_64 + select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_GRAPH_FP_TEST + select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_TRACER + select HAVE_GENERIC_DMA_COHERENT if X86_32 + select HAVE_HW_BREAKPOINT + select HAVE_IDE + select HAVE_IOREMAP_PROT + select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 + select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_BZIP2 + select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZ4 select HAVE_KERNEL_LZMA - select HAVE_KERNEL_XZ select HAVE_KERNEL_LZO - select HAVE_KERNEL_LZ4 - select HAVE_HW_BREAKPOINT + select HAVE_KERNEL_XZ + select HAVE_KPROBES + select HAVE_KPROBES_ON_FTRACE + select HAVE_KRETPROBES + select HAVE_KVM + select HAVE_LIVEPATCH if X86_64 + select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS - select PERF_EVENTS + select HAVE_OPROFILE + select HAVE_OPTPROBES + select HAVE_PCSPKR_PLATFORM + select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select HAVE_DEBUG_KMEMLEAK - select ANON_INODES - select HAVE_ALIGNED_STRUCT_PAGE if SLUB - select HAVE_CMPXCHG_LOCAL - select HAVE_CMPXCHG_DOUBLE - select HAVE_ARCH_KMEMCHECK - select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_UID16 if X86_32 || IA32_EMULATION + select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER - select ARCH_HAS_ELF_RANDOMIZE - select HAVE_ARCH_JUMP_LABEL - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select SPARSE_IRQ - select GENERIC_FIND_FIRST_BIT - select GENERIC_IRQ_PROBE - select GENERIC_PENDING_IRQ if SMP - select GENERIC_IRQ_SHOW - select GENERIC_CLOCKEVENTS_MIN_ADJUST select IRQ_FORCED_THREADING - select HAVE_BPF_JIT if X86_64 - select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE) - select ARCH_HAS_SG_CHAIN - select CLKEVT_I8253 - select ARCH_HAVE_NMI_SAFE_CMPXCHG - select GENERIC_IOMAP - select DCACHE_WORD_ACCESS - select GENERIC_SMP_IDLE_THREAD - select ARCH_WANT_IPC_PARSE_VERSION if X86_32 - select HAVE_ARCH_SECCOMP_FILTER - select BUILDTIME_EXTABLE_SORT - select GENERIC_CMOS_UPDATE - select 
HAVE_ARCH_SOFT_DIRTY if X86_64 - select CLOCKSOURCE_WATCHDOG - select GENERIC_CLOCKEVENTS - select ARCH_CLOCKSOURCE_DATA - select CLOCKSOURCE_VALIDATE_LAST_CYCLE - select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) - select GENERIC_TIME_VSYSCALL - select GENERIC_STRNCPY_FROM_USER - select GENERIC_STRNLEN_USER - select HAVE_CONTEXT_TRACKING if X86_64 - select HAVE_IRQ_TIME_ACCOUNTING - select VIRT_TO_BUS - select MODULES_USE_ELF_REL if X86_32 - select MODULES_USE_ELF_RELA if X86_64 - select CLONE_BACKWARDS if X86_32 - select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_QUEUE_RWLOCK - select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION - select OLD_SIGACTION if X86_32 - select COMPAT_OLD_SIGACTION if IA32_EMULATION + select MODULES_USE_ELF_RELA if X86_64 + select MODULES_USE_ELF_REL if X86_32 + select OLD_SIGACTION if X86_32 + select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION + select PERF_EVENTS select RTC_LIB - select HAVE_DEBUG_STACKOVERFLOW - select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 - select HAVE_CC_STACKPROTECTOR - select GENERIC_CPU_AUTOPROBE - select HAVE_ARCH_AUDITSYSCALL - select ARCH_SUPPORTS_ATOMIC_RMW - select HAVE_ACPI_APEI if ACPI - select HAVE_ACPI_APEI_NMI if ACPI - select ACPI_LEGACY_TABLES_LOOKUP if ACPI - select X86_FEATURE_NAMES if PROC_FS + select SPARSE_IRQ select SRCU + select SYSCTL_EXCEPTION_TRACE + select USER_STACKTRACE_SUPPORT + select VIRT_TO_BUS + select X86_DEV_DMA_OPS if X86_64 + select X86_FEATURE_NAMES if PROC_FS config INSTRUCTION_DECODER def_bool y @@ -252,6 +261,11 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config KASAN_SHADOW_OFFSET + hex + depends on KASAN + default 0xdffffc0000000000 + config HAVE_INTEL_TXT def_bool y depends on INTEL_IOMMU && ACPI @@ -264,10 +278,6 @@ config X86_64_SMP def_bool y depends on X86_64 && SMP -config X86_HT - def_bool y - depends on SMP - config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR @@ -345,7 +355,7 @@ config X86_FEATURE_NAMES config X86_X2APIC bool "Support x2apic" - depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP + depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST) ---help--- This enables x2apic support on CPUs that have this feature. @@ -445,6 +455,7 @@ config X86_UV depends on X86_EXTENDED_PLATFORM depends on NUMA depends on X86_X2APIC + depends on PCI ---help--- This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. @@ -470,7 +481,6 @@ config X86_INTEL_CE select X86_REBOOTFIXUPS select OF select OF_EARLY_FLATTREE - select IRQ_DOMAIN ---help--- Select for the Intel CE media processor (CE4100) SOC. This option compiles in support for the CE4100 SOC for settop @@ -670,7 +680,7 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP - select UNINLINE_SPIN_UNLOCK + select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly @@ -855,11 +865,12 @@ config NR_CPUS default "1" if !SMP default "8192" if MAXSMP default "32" if SMP && X86_BIGSMP - default "8" if SMP + default "8" if SMP && X86_32 + default "64" if SMP ---help--- This allows you to specify the maximum number of CPUs which this kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum - supported value is 4096, otherwise the maximum value is 512. 
The + supported value is 8192, otherwise the maximum value is 512. The minimum value which makes sense is 2. This is purely to save memory - each supported CPU adds @@ -867,7 +878,7 @@ config NR_CPUS config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" - depends on X86_HT + depends on SMP ---help--- SMT scheduler support improves the CPU scheduler's decision making when dealing with Intel Pentium 4 chips with HyperThreading at a @@ -877,7 +888,7 @@ config SCHED_SMT config SCHED_MC def_bool y prompt "Multi-core scheduler support" - depends on X86_HT + depends on SMP ---help--- Multi-core scheduler support improves the CPU scheduler's decision making when dealing with multi-core CPU chips at a cost of slightly @@ -918,12 +929,12 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI - select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ + select IRQ_DOMAIN_HIERARCHY + select PCI_MSI_IRQ_DOMAIN if PCI_MSI config X86_IO_APIC def_bool y depends on X86_LOCAL_APIC || X86_UP_IOAPIC - select IRQ_DOMAIN config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" @@ -950,6 +961,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS config X86_MCE bool "Machine Check / overheating reporting" + select GENERIC_ALLOCATOR default y ---help--- Machine Check support allows the processor to notify the @@ -997,19 +1009,42 @@ config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL -config VM86 - bool "Enable VM86 support" if EXPERT - default y +config X86_LEGACY_VM86 + bool "Legacy VM86 support" + default n depends on X86_32 ---help--- - This option is required by programs like DOSEMU to run - 16-bit real mode legacy code on x86 processors. It also may - be needed by software like XFree86 to initialize some video - cards via BIOS. Disabling this option saves about 6K. + This option allows user programs to put the CPU into V8086 + mode, which is an 80286-era approximation of 16-bit real mode. + + Some very old versions of X and/or vbetool require this option + for user mode setting. Similarly, DOSEMU will use it if + available to accelerate real mode DOS programs. However, any + recent version of DOSEMU, X, or vbetool should be fully + functional even without kernel VM86 support, as they will all + fall back to software emulation. Nevertheless, if you are using + a 16-bit DOS program where 16-bit performance matters, vm86 + mode might be faster than emulation and you might want to + enable this option. + + Note that any app that works on a 64-bit kernel is unlikely to + need this option, as 64-bit kernels don't, and can't, support + V8086 mode. This option is also unrelated to 16-bit protected + mode and is not needed to run most 16-bit programs under Wine. + + Enabling this option increases the complexity of the kernel + and slows down exception handling a tiny bit. + + If unsure, say N here. + +config VM86 + bool + default X86_LEGACY_VM86 config X86_16BIT bool "Enable support for 16-bit segments" if EXPERT default y + depends on MODIFY_LDT_SYSCALL ---help--- This option is required by programs like Wine to run 16-bit protected mode legacy code on x86 processors. Disabling @@ -1059,24 +1094,19 @@ config TOSHIBA Say N otherwise. config I8K - tristate "Dell laptop support" + tristate "Dell i8k legacy laptop support" select HWMON + select SENSORS_DELL_SMM ---help--- - This adds a driver to safely access the System Management Mode - of the CPU on the Dell Inspiron 8000. 
The System Management Mode - is used to read cpu temperature and cooling fan status and to - control the fans on the I8K portables. - - This driver has been tested only on the Inspiron 8000 but it may - also work with other Dell laptops. You can force loading on other - models by passing the parameter `force=1' to the module. Use at - your own risk. - - For information on utilities to make use of this driver see the - I8K Linux utilities web site at: - <http://people.debian.org/~dz/i8k/> + This option enables legacy /proc/i8k userspace interface in hwmon + dell-smm-hwmon driver. Character file /proc/i8k reports bios version, + temperature and allows controlling fan speeds of Dell laptops via + System Management Mode. For old Dell laptops (like Dell Inspiron 8000) + it reports also power and hotkey status. For fan speed control is + needed userspace package i8kutils. - Say Y if you intend to run this kernel on a Dell Inspiron 8000. + Say Y if you intend to run this kernel on old Dell laptops or want to + use userspace package i8kutils. Say N otherwise. config X86_REBOOTFIXUPS @@ -1097,8 +1127,10 @@ config X86_REBOOTFIXUPS Say N otherwise. config MICROCODE - tristate "CPU microcode loading support" + bool "CPU microcode loading support" + default y depends on CPU_SUP_AMD || CPU_SUP_INTEL + depends on BLK_DEV_INITRD select FW_LOADER ---help--- @@ -1140,24 +1172,6 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE -config MICROCODE_INTEL_EARLY - bool - -config MICROCODE_AMD_EARLY - bool - -config MICROCODE_EARLY - bool "Early load microcode" - depends on MICROCODE=y && BLK_DEV_INITRD - select MICROCODE_INTEL_EARLY if MICROCODE_INTEL - select MICROCODE_AMD_EARLY if MICROCODE_AMD - default y - help - This option provides functionality to read additional microcode data - at the beginning of initrd image. The data tells kernel to load - microcode to CPU's as early as possible. No functional change if no - microcode data is glued to the initrd, therefore it's safe to say Y. - config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- @@ -1282,6 +1296,7 @@ config HIGHMEM config X86_PAE bool "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G + select SWIOTLB ---help--- PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. It @@ -1426,8 +1441,15 @@ config ILLEGAL_POINTER_VALUE source "mm/Kconfig" +config X86_PMEM_LEGACY_DEVICE + bool + config X86_PMEM_LEGACY - bool "Support non-standard NVDIMMs and ADR protected memory" + tristate "Support non-standard NVDIMMs and ADR protected memory" + depends on PHYS_ADDR_T_64BIT + depends on BLK_DEV + select X86_PMEM_LEGACY_DEVICE + select LIBNVDIMM help Treat memory marked using the non-standard e820 type of 12 as used by the Intel Sandy Bridge-EP reference BIOS as protected memory. @@ -1506,6 +1528,7 @@ config X86_RESERVE_LOW config MATH_EMULATION bool + depends on MODIFY_LDT_SYSCALL prompt "Math emulation" if X86_32 ---help--- Linux can emulate a math coprocessor (used for floating point @@ -1721,6 +1744,7 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" + select KEXEC_CORE ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. 
It is like a reboot @@ -1737,8 +1761,8 @@ config KEXEC config KEXEC_FILE bool "kexec file based system call" + select KEXEC_CORE select BUILD_BIN2C - depends on KEXEC depends on X86_64 depends on CRYPTO=y depends on CRYPTO_SHA256=y @@ -2007,6 +2031,55 @@ config COMPAT_VDSO If unsure, say N: if you are compiling your own kernel, you are unlikely to be using a buggy version of glibc. +choice + prompt "vsyscall table for legacy applications" + depends on X86_64 + default LEGACY_VSYSCALL_EMULATE + help + Legacy user code that does not know how to find the vDSO expects + to be able to issue three syscalls by calling fixed addresses in + kernel space. Since this location is not randomized with ASLR, + it can be used to assist security vulnerability exploitation. + + This setting can be changed at boot time via the kernel command + line parameter vsyscall=[native|emulate|none]. + + On a system with recent enough glibc (2.14 or newer) and no + static binaries, you can say None without a performance penalty + to improve security. + + If unsure, select "Emulate". + + config LEGACY_VSYSCALL_NATIVE + bool "Native" + help + Actual executable code is located in the fixed vsyscall + address mapping, implementing time() efficiently. Since + this makes the mapping executable, it can be used during + security vulnerability exploitation (traditionally as + ROP gadgets). This configuration is not recommended. + + config LEGACY_VSYSCALL_EMULATE + bool "Emulate" + help + The kernel traps and emulates calls into the fixed + vsyscall address mapping. This makes the mapping + non-executable, but it still contains known contents, + which could be used in certain rare security vulnerability + exploits. This configuration is recommended when userspace + still uses the vsyscall area. + + config LEGACY_VSYSCALL_NONE + bool "None" + help + There will be no vsyscall mapping at all. This will + eliminate any risk of ASLR bypass due to the vsyscall + fixed address mapping. Attempts to use the vsyscalls + will be reported to dmesg, so that either old or + malicious userspace programs can be identified. + +endchoice + config CMDLINE_BOOL bool "Built-in kernel command line" ---help--- @@ -2018,7 +2091,7 @@ config CMDLINE_BOOL To compile command line arguments into the kernel, set this option to 'Y', then fill in the - the boot arguments in CONFIG_CMDLINE. + boot arguments in CONFIG_CMDLINE. Systems with fully functional boot loaders (i.e. non-embedded) should leave this option set to 'N'. @@ -2050,6 +2123,22 @@ config CMDLINE_OVERRIDE This is used to work around broken boot loaders. This should be set to 'N' under normal conditions. +config MODIFY_LDT_SYSCALL + bool "Enable the LDT (local descriptor table)" if EXPERT + default y + ---help--- + Linux can allow user programs to install a per-process x86 + Local Descriptor Table (LDT) using the modify_ldt(2) system + call. This is required to run 16-bit or segmented code such as + DOSEMU or some Wine programs. It is also used by some very old + threading libraries. + + Enabling this feature adds a small amount of overhead to + context switches and increases the low-level kernel attack + surface. Disabling it removes the modify_ldt(2) system call. + + Saying 'N' here may make sense for embedded or server kernels. 
+ source "kernel/livepatch/Kconfig" endmenu @@ -2519,7 +2608,7 @@ config IA32_EMULATION depends on X86_64 select BINFMT_ELF select COMPAT_BINFMT_ELF - select HAVE_UID16 + select ARCH_WANT_OLD_COMPAT_IPC ---help--- Include code to run legacy 32-bit programs under a 64-bit kernel. You should likely turn this on, unless you're @@ -2533,7 +2622,7 @@ config IA32_AOUT config X86_X32 bool "x32 ABI for 64-bit mode" - depends on X86_64 && IA32_EMULATION + depends on X86_64 ---help--- Include code to run binaries for the x32 native 32-bit ABI for 64-bit processors. An x32 process gets access to the @@ -2547,7 +2636,6 @@ config X86_X32 config COMPAT def_bool y depends on IA32_EMULATION || X86_X32 - select ARCH_WANT_OLD_COMPAT_IPC if COMPAT config COMPAT_FOR_U64_ALIGNMENT diff --git a/kernel/arch/x86/Kconfig.cpu b/kernel/arch/x86/Kconfig.cpu index 6983314c8..3ba5ff2f2 100644 --- a/kernel/arch/x86/Kconfig.cpu +++ b/kernel/arch/x86/Kconfig.cpu @@ -3,10 +3,6 @@ choice prompt "Processor family" default M686 if X86_32 default GENERIC_CPU if X86_64 - -config M486 - bool "486" - depends on X86_32 ---help--- This is the processor type of your CPU. This information is used for optimizing purposes. In order to compile a kernel @@ -23,9 +19,9 @@ config M486 Here are the settings recommended for greatest speed: - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or - SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. + SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. - "586" for generic Pentium CPUs lacking the TSC - (time stamp counter) register. + (time stamp counter) register. - "Pentium-Classic" for the Intel Pentium. - "Pentium-MMX" for the Intel Pentium MMX. - "Pentium-Pro" for the Intel Pentium Pro. @@ -34,17 +30,31 @@ config M486 - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron. - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D). - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird). + - "Opteron/Athlon64/Hammer/K8" for all K8 and newer AMD CPUs. - "Crusoe" for the Transmeta Crusoe series. - "Efficeon" for the Transmeta Efficeon series. - "Winchip-C6" for original IDT Winchip. - "Winchip-2" for IDT Winchips with 3dNow! capabilities. + - "AMD Elan" for the 32-bit AMD Elan embedded CPU. - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). - "Geode GX/LX" For AMD Geode GX and LX processors. - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). - "VIA C7" for VIA C7. + - "Intel P4" for the Pentium 4/Netburst microarchitecture. + - "Core 2/newer Xeon" for all core2 and newer Intel CPUs. + - "Intel Atom" for the Atom-microarchitecture CPUs. + - "Generic-x86-64" for a kernel which runs on any x86-64 CPU. + + See each option's help text for additional details. If you don't know + what to do, choose "486". - If you don't know what to do, choose "486". +config M486 + bool "486" + depends on X86_32 + ---help--- + Select this for an 486-class CPU such as AMD/Cyrix/IBM/Intel + 486DX/DX2/DX4 or SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. config M586 bool "586/K5/5x86/6x86/6x86MX" diff --git a/kernel/arch/x86/Kconfig.debug b/kernel/arch/x86/Kconfig.debug index 72484a645..137dfa96a 100644 --- a/kernel/arch/x86/Kconfig.debug +++ b/kernel/arch/x86/Kconfig.debug @@ -65,10 +65,14 @@ config EARLY_PRINTK_EFI This is useful for kernel debugging when your machine crashes very early before the console code is initialized. 
+config X86_PTDUMP_CORE + def_bool n + config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL select DEBUG_FS + select X86_PTDUMP_CORE ---help--- Say Y here if you want to show the kernel pagetable layout in a debugfs file. This information is only useful for kernel developers @@ -79,7 +83,8 @@ config X86_PTDUMP config EFI_PGT_DUMP bool "Dump the EFI pagetable" - depends on EFI && X86_PTDUMP + depends on EFI + select X86_PTDUMP_CORE ---help--- Enable this if you want to dump the EFI page table before enabling virtual mode. This can be used to debug miscellaneous @@ -105,6 +110,34 @@ config DEBUG_RODATA_TEST feature as well as for the change_page_attr() infrastructure. If in doubt, say "N" +config DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on DEBUG_RODATA + select X86_PTDUMP_CORE + ---help--- + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving + W+X mappings after applying NX, as such mappings are a security risk. + + Look for a message in dmesg output like this: + + x86/mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + x86/mm: Checked W+X mappings: FAILED, <N> W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config DEBUG_SET_MODULE_RONX bool "Set loadable kernel module data as NX and text as RO" depends on MODULES @@ -297,6 +330,18 @@ config OPTIMIZE_INLINING If unsure, say N. +config DEBUG_ENTRY + bool "Debug low-level entry code" + depends on DEBUG_KERNEL + ---help--- + This option enables sanity checks in x86's low-level entry code. + Some of these sanity checks may slow down kernel entries and + exits or otherwise impact performance. + + This is currently used to help test NMI code. + + If unsure, say N. + config DEBUG_NMI_SELFTEST bool "NMI Selftest" depends on DEBUG_KERNEL && X86_LOCAL_APIC @@ -332,4 +377,27 @@ config X86_DEBUG_STATIC_CPU_HAS If unsure, say N. +config X86_DEBUG_FPU + bool "Debug the x86 FPU code" + depends on DEBUG_KERNEL + default y + ---help--- + If this option is enabled then there will be extra sanity + checks and (boot time) debug printouts added to the kernel. + This debugging adds some small amount of runtime overhead + to the kernel. + + If unsure, say N. + +config PUNIT_ATOM_DEBUG + tristate "ATOM Punit debug driver" + select DEBUG_FS + select IOSF_MBI + ---help--- + This is a debug driver, which gets the power states + of all Punit North Complex devices. The power states of + each device is exposed as part of the debugfs interface. + The current power state can be read from + /sys/kernel/debug/punit_atom/dev_power_state + endmenu diff --git a/kernel/arch/x86/Makefile b/kernel/arch/x86/Makefile index 2fda005bb..4086abca0 100644 --- a/kernel/arch/x86/Makefile +++ b/kernel/arch/x86/Makefile @@ -39,6 +39,16 @@ ifdef CONFIG_X86_NEED_RELOCS LDFLAGS_vmlinux := --emit-relocs endif +# +# Prevent GCC from generating any FP code by mistake. 
+# +# This must happen before we try the -mpreferred-stack-boundary, see: +# +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 +# +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow +KBUILD_CFLAGS += $(call cc-option,-mno-avx,) + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 @@ -77,6 +87,12 @@ else KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 + # Align jump targets to 1 byte, not the default 16 bytes: + KBUILD_CFLAGS += -falign-jumps=1 + + # Pack loops tightly as well: + KBUILD_CFLAGS += -falign-loops=1 + # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) @@ -84,6 +100,9 @@ else # Use -mpreferred-stack-boundary=3 if supported. KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) + # Use -mskip-rax-setup if supported. + KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) @@ -152,9 +171,11 @@ asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) +sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1) +sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr) LDFLAGS := -m elf_$(UTS_MACHINE) @@ -164,9 +185,6 @@ KBUILD_CFLAGS += -pipe KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -# prevent gcc from generating any FP code by mistake -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) @@ -178,7 +196,7 @@ archscripts: scripts_basic # Syscall table generation archheaders: - $(Q)$(MAKE) $(build)=arch/x86/syscalls all + $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all archprepare: ifeq ($(CONFIG_KEXEC_FILE),y) @@ -209,6 +227,8 @@ drivers-$(CONFIG_PM) += arch/x86/power/ drivers-$(CONFIG_FB) += arch/x86/video/ +drivers-$(CONFIG_RAS) += arch/x86/ras/ + #### # boot loader support. Several targets are kept for legacy purposes @@ -241,7 +261,7 @@ install: PHONY += vdso_install vdso_install: - $(Q)$(MAKE) $(build)=arch/x86/vdso $@ + $(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@ archclean: $(Q)rm -rf $(objtree)/arch/i386 diff --git a/kernel/arch/x86/boot/Makefile b/kernel/arch/x86/boot/Makefile index 57bbf2fb2..2ee62dba0 100644 --- a/kernel/arch/x86/boot/Makefile +++ b/kernel/arch/x86/boot/Makefile @@ -9,13 +9,13 @@ # Changed by many, many contributors over the years. # +KASAN_SANITIZE := n + # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. 
# The number is the same as you would ordinarily press at bootup. -KASAN_SANITIZE := n - SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage @@ -23,7 +23,7 @@ targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o -setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o +setup-y += early_serial_console.o edd.o header.o main.o memory.o setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o setup-y += video-mode.o version.o setup-$(CONFIG_X86_APM_BOOT) += apm.o diff --git a/kernel/arch/x86/boot/boot.h b/kernel/arch/x86/boot/boot.h index bd49ec612..9011a8835 100644 --- a/kernel/arch/x86/boot/boot.h +++ b/kernel/arch/x86/boot/boot.h @@ -23,7 +23,6 @@ #include <stdarg.h> #include <linux/types.h> #include <linux/edd.h> -#include <asm/boot.h> #include <asm/setup.h> #include "bitops.h" #include "ctype.h" @@ -307,9 +306,6 @@ void query_edd(void); /* header.S */ void __attribute__((noreturn)) die(void); -/* mca.c */ -int query_mca(void); - /* memory.c */ int detect_memory(void); diff --git a/kernel/arch/x86/boot/compressed/aslr.c b/kernel/arch/x86/boot/compressed/aslr.c index d7b1f655b..6a9b96b46 100644 --- a/kernel/arch/x86/boot/compressed/aslr.c +++ b/kernel/arch/x86/boot/compressed/aslr.c @@ -82,7 +82,7 @@ static unsigned long get_random_long(void) if (has_cpuflag(X86_FEATURE_TSC)) { debug_putstr(" RDTSC"); - rdtscll(raw); + raw = rdtsc(); random ^= raw; use_i8254 = false; diff --git a/kernel/arch/x86/boot/compressed/eboot.c b/kernel/arch/x86/boot/compressed/eboot.c index 0cdc154a2..583d539a4 100644 --- a/kernel/arch/x86/boot/compressed/eboot.c +++ b/kernel/arch/x86/boot/compressed/eboot.c @@ -624,7 +624,7 @@ setup_pixel_info(struct screen_info *si, u32 pixels_per_scan_line, static efi_status_t __gop_query32(struct efi_graphics_output_protocol_32 *gop32, struct efi_graphics_output_mode_info **info, - unsigned long *size, u32 *fb_base) + unsigned long *size, u64 *fb_base) { struct efi_graphics_output_protocol_mode_32 *mode; efi_status_t status; @@ -650,7 +650,8 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, unsigned long nr_gops; u16 width, height; u32 pixels_per_scan_line; - u32 fb_base; + u32 ext_lfb_base; + u64 fb_base; struct efi_pixel_bitmask pixel_info; int pixel_format; efi_status_t status; @@ -667,6 +668,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, bool conout_found = false; void *dummy = NULL; u32 h = handles[i]; + u64 current_fb_base; status = efi_call_early(handle_protocol, h, proto, (void **)&gop32); @@ -678,7 +680,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, if (status == EFI_SUCCESS) conout_found = true; - status = __gop_query32(gop32, &info, &size, &fb_base); + status = __gop_query32(gop32, &info, &size, ¤t_fb_base); if (status == EFI_SUCCESS && (!first_gop || conout_found)) { /* * Systems that use the UEFI Console Splitter may @@ -692,6 +694,7 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, pixel_format = info->pixel_format; pixel_info = info->pixel_information; pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; /* * Once we've found a GOP supporting ConOut, @@ -713,6 +716,13 @@ setup_gop32(struct screen_info *si, efi_guid_t *proto, si->lfb_width = width; si->lfb_height = height; si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + 
si->ext_lfb_base = ext_lfb_base; + } + si->pages = 1; setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); @@ -727,7 +737,7 @@ out: static efi_status_t __gop_query64(struct efi_graphics_output_protocol_64 *gop64, struct efi_graphics_output_mode_info **info, - unsigned long *size, u32 *fb_base) + unsigned long *size, u64 *fb_base) { struct efi_graphics_output_protocol_mode_64 *mode; efi_status_t status; @@ -753,7 +763,8 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, unsigned long nr_gops; u16 width, height; u32 pixels_per_scan_line; - u32 fb_base; + u32 ext_lfb_base; + u64 fb_base; struct efi_pixel_bitmask pixel_info; int pixel_format; efi_status_t status; @@ -770,6 +781,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, bool conout_found = false; void *dummy = NULL; u64 h = handles[i]; + u64 current_fb_base; status = efi_call_early(handle_protocol, h, proto, (void **)&gop64); @@ -781,7 +793,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, if (status == EFI_SUCCESS) conout_found = true; - status = __gop_query64(gop64, &info, &size, &fb_base); + status = __gop_query64(gop64, &info, &size, ¤t_fb_base); if (status == EFI_SUCCESS && (!first_gop || conout_found)) { /* * Systems that use the UEFI Console Splitter may @@ -795,6 +807,7 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, pixel_format = info->pixel_format; pixel_info = info->pixel_information; pixels_per_scan_line = info->pixels_per_scan_line; + fb_base = current_fb_base; /* * Once we've found a GOP supporting ConOut, @@ -816,6 +829,13 @@ setup_gop64(struct screen_info *si, efi_guid_t *proto, si->lfb_width = width; si->lfb_height = height; si->lfb_base = fb_base; + + ext_lfb_base = (u64)(unsigned long)fb_base >> 32; + if (ext_lfb_base) { + si->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + si->ext_lfb_base = ext_lfb_base; + } + si->pages = 1; setup_pixel_info(si, pixels_per_scan_line, pixel_info, pixel_format); @@ -1041,7 +1061,6 @@ void setup_graphics(struct boot_params *boot_params) struct boot_params *make_boot_params(struct efi_config *c) { struct boot_params *boot_params; - struct sys_desc_table *sdt; struct apm_bios_info *bi; struct setup_header *hdr; struct efi_info *efi; @@ -1089,7 +1108,6 @@ struct boot_params *make_boot_params(struct efi_config *c) hdr = &boot_params->hdr; efi = &boot_params->efi_info; bi = &boot_params->apm_bios_info; - sdt = &boot_params->sys_desc_table; /* Copy the second sector to boot_params */ memcpy(&hdr->jump, image->image_base + 512, 512); @@ -1118,8 +1136,6 @@ struct boot_params *make_boot_params(struct efi_config *c) /* Clear APM BIOS info */ memset(bi, 0, sizeof(*bi)); - memset(sdt, 0, sizeof(*sdt)); - status = efi_parse_options(cmdline_ptr); if (status != EFI_SUCCESS) goto fail2; @@ -1228,6 +1244,10 @@ static efi_status_t setup_e820(struct boot_params *params, e820_type = E820_NVS; break; + case EFI_PERSISTENT_MEMORY: + e820_type = E820_PMEM; + break; + default: continue; } diff --git a/kernel/arch/x86/boot/compressed/misc.c b/kernel/arch/x86/boot/compressed/misc.c index e28437e0f..79dac1758 100644 --- a/kernel/arch/x86/boot/compressed/misc.c +++ b/kernel/arch/x86/boot/compressed/misc.c @@ -220,6 +220,23 @@ void __putstr(const char *s) outb(0xff & (pos >> 1), vidport+1); } +void __puthex(unsigned long value) +{ + char alpha[2] = "0"; + int bits; + + for (bits = sizeof(value) * 8 - 4; bits >= 0; bits -= 4) { + unsigned long digit = (value >> bits) & 0xf; + + if (digit < 0xA) + alpha[0] = '0' + digit; + else + alpha[0] = 'a' + (digit - 0xA); + 
+ __putstr(alpha); + } +} + static void error(char *x) { error_putstr("\n\n"); @@ -399,6 +416,13 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; + /* Report initial kernel position details. */ + debug_putaddr(input_data); + debug_putaddr(input_len); + debug_putaddr(output); + debug_putaddr(output_len); + debug_putaddr(run_size); + /* * The memory hole needed for the kernel is the larger of either * the entire decompressed kernel plus relocation table, or the diff --git a/kernel/arch/x86/boot/compressed/misc.h b/kernel/arch/x86/boot/compressed/misc.h index 805d25ca5..3783dc3e1 100644 --- a/kernel/arch/x86/boot/compressed/misc.h +++ b/kernel/arch/x86/boot/compressed/misc.h @@ -34,16 +34,27 @@ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; extern struct boot_params *real_mode; /* Pointer to real-mode data */ void __putstr(const char *s); +void __puthex(unsigned long value); #define error_putstr(__x) __putstr(__x) +#define error_puthex(__x) __puthex(__x) #ifdef CONFIG_X86_VERBOSE_BOOTUP #define debug_putstr(__x) __putstr(__x) +#define debug_puthex(__x) __puthex(__x) +#define debug_putaddr(__x) { \ + debug_putstr(#__x ": 0x"); \ + debug_puthex((unsigned long)(__x)); \ + debug_putstr("\n"); \ + } #else static inline void debug_putstr(const char *s) { } +static inline void debug_puthex(const char *s) +{ } +#define debug_putaddr(x) /* */ #endif diff --git a/kernel/arch/x86/boot/header.S b/kernel/arch/x86/boot/header.S index 16ef02596..6236b9ec4 100644 --- a/kernel/arch/x86/boot/header.S +++ b/kernel/arch/x86/boot/header.S @@ -154,7 +154,7 @@ extra_header_fields: #else .quad 0 # ImageBase #endif - .long CONFIG_PHYSICAL_ALIGN # SectionAlignment + .long 0x20 # SectionAlignment .long 0x20 # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion @@ -414,7 +414,7 @@ xloadflags: # define XLF23 0 #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC) +#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE) # define XLF4 XLF_EFI_KEXEC #else # define XLF4 0 diff --git a/kernel/arch/x86/boot/main.c b/kernel/arch/x86/boot/main.c index fd6c9f236..9bcea386d 100644 --- a/kernel/arch/x86/boot/main.c +++ b/kernel/arch/x86/boot/main.c @@ -161,9 +161,6 @@ void main(void) /* Set keyboard repeat rate (why?) and query the lock flags */ keyboard_init(); - /* Query MCA information */ - query_mca(); - /* Query Intel SpeedStep (IST) information */ query_ist(); diff --git a/kernel/arch/x86/boot/mca.c b/kernel/arch/x86/boot/mca.c deleted file mode 100644 index a95a53114..000000000 --- a/kernel/arch/x86/boot/mca.c +++ /dev/null @@ -1,38 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright 2007 rPath, Inc. - All Rights Reserved - * Copyright 2009 Intel Corporation; author H. Peter Anvin - * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. 
- * - * ----------------------------------------------------------------------- */ - -/* - * Get the MCA system description table - */ - -#include "boot.h" - -int query_mca(void) -{ - struct biosregs ireg, oreg; - u16 len; - - initregs(&ireg); - ireg.ah = 0xc0; - intcall(0x15, &ireg, &oreg); - - if (oreg.eflags & X86_EFLAGS_CF) - return -1; /* No MCA present */ - - set_fs(oreg.es); - len = rdfs16(oreg.bx); - - if (len > sizeof(boot_params.sys_desc_table)) - len = sizeof(boot_params.sys_desc_table); - - copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len); - return 0; -} diff --git a/kernel/arch/x86/boot/video-mode.c b/kernel/arch/x86/boot/video-mode.c index aa8a96b05..95c7a818c 100644 --- a/kernel/arch/x86/boot/video-mode.c +++ b/kernel/arch/x86/boot/video-mode.c @@ -19,6 +19,8 @@ #include "video.h" #include "vesa.h" +#include <uapi/asm/boot.h> + /* * Common variables */ diff --git a/kernel/arch/x86/boot/video.c b/kernel/arch/x86/boot/video.c index 05111bb8d..77780e386 100644 --- a/kernel/arch/x86/boot/video.c +++ b/kernel/arch/x86/boot/video.c @@ -13,6 +13,8 @@ * Select video mode */ +#include <uapi/asm/boot.h> + #include "boot.h" #include "video.h" #include "vesa.h" diff --git a/kernel/arch/x86/configs/i386_defconfig b/kernel/arch/x86/configs/i386_defconfig index aaa1118bf..028be48c8 100644 --- a/kernel/arch/x86/configs/i386_defconfig +++ b/kernel/arch/x86/configs/i386_defconfig @@ -23,6 +23,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y diff --git a/kernel/arch/x86/configs/x86_64_defconfig b/kernel/arch/x86/configs/x86_64_defconfig index 315b86106..cb5b3ab5b 100644 --- a/kernel/arch/x86/configs/x86_64_defconfig +++ b/kernel/arch/x86/configs/x86_64_defconfig @@ -22,6 +22,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y @@ -207,7 +208,6 @@ CONFIG_AGP_AMD64=y CONFIG_AGP_INTEL=y CONFIG_DRM=y CONFIG_DRM_I915=y -CONFIG_DRM_I915_KMS=y CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y diff --git a/kernel/arch/x86/configs/xen.config b/kernel/arch/x86/configs/xen.config new file mode 100644 index 000000000..d9fc7139f --- /dev/null +++ b/kernel/arch/x86/configs/xen.config @@ -0,0 +1,28 @@ +# global x86 required specific stuff +# On 32-bit HIGHMEM4G is not allowed +CONFIG_HIGHMEM64G=y +CONFIG_64BIT=y + +# These enable us to allow some of the +# not so generic stuff below +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_X86_MCE=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_CPU_FREQ=y + +# x86 xen specific config options +CONFIG_XEN_PVH=y +CONFIG_XEN_MAX_DOMAIN_MEMORY=500 +CONFIG_XEN_SAVE_RESTORE=y +# CONFIG_XEN_DEBUG_FS is not set +CONFIG_XEN_MCE_LOG=y +CONFIG_XEN_ACPI_PROCESSOR=m +# x86 specific backend drivers +CONFIG_XEN_PCIDEV_BACKEND=m +# x86 specific frontend drivers +CONFIG_XEN_PCIDEV_FRONTEND=m +# depends on MEMORY_HOTPLUG, arm64 doesn't enable this yet, +# move to generic config if it ever does. 
+CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y diff --git a/kernel/arch/x86/crypto/Makefile b/kernel/arch/x86/crypto/Makefile index 5a4a089e8..b9b912a44 100644 --- a/kernel/arch/x86/crypto/Makefile +++ b/kernel/arch/x86/crypto/Makefile @@ -5,6 +5,8 @@ avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) +sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no) +sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o @@ -20,6 +22,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o +obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -30,6 +33,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o +obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -60,6 +64,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o +chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o ifeq ($(avx_supported),yes) @@ -75,6 +80,7 @@ endif ifeq ($(avx2_supported),yes) camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o + chacha20-x86_64-y += chacha20-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o endif @@ -82,12 +88,20 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o +poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o ifeq ($(avx2_supported),yes) sha1-ssse3-y += sha1_avx2_x86_64_asm.o +poly1305-x86_64-y += poly1305-avx2-x86_64.o +endif +ifeq ($(sha1_ni_supported),yes) +sha1-ssse3-y += sha1_ni_asm.o endif crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o +ifeq ($(sha256_ni_supported),yes) +sha256-ssse3-y += sha256_ni_asm.o +endif sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o diff --git a/kernel/arch/x86/crypto/aesni-intel_glue.c b/kernel/arch/x86/crypto/aesni-intel_glue.c index 3fd3b1634..c6d5458ee 100644 --- a/kernel/arch/x86/crypto/aesni-intel_glue.c +++ b/kernel/arch/x86/crypto/aesni-intel_glue.c @@ -32,7 +32,7 @@ #include <crypto/lrw.h> #include <crypto/xts.h> #include <asm/cpu_device_id.h> -#include 
<asm/i387.h> +#include <asm/fpu/api.h> #include <asm/crypto/aes.h> #include <crypto/ablk_helper.h> #include <crypto/scatterwalk.h> @@ -44,15 +44,19 @@ #endif +#define AESNI_ALIGN 16 +#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1)) +#define RFC4106_HASH_SUBKEY_SIZE 16 + /* This data is stored at the end of the crypto_tfm struct. * It's a type of per "session" data storage location. * This needs to be 16 byte aligned. */ struct aesni_rfc4106_gcm_ctx { - u8 hash_subkey[16]; - struct crypto_aes_ctx aes_key_expanded; + u8 hash_subkey[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + struct crypto_aes_ctx aes_key_expanded + __attribute__ ((__aligned__(AESNI_ALIGN))); u8 nonce[4]; - struct cryptd_aead *cryptd_tfm; }; struct aesni_gcm_set_hash_subkey_result { @@ -66,10 +70,6 @@ struct aesni_hash_subkey_req_data { struct scatterlist sg; }; -#define AESNI_ALIGN (16) -#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) -#define RFC4106_HASH_SUBKEY_SIZE 16 - struct aesni_lrw_ctx { struct lrw_table_ctx lrw_table; u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; @@ -283,10 +283,11 @@ static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out, static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) { - return - (struct aesni_rfc4106_gcm_ctx *) - PTR_ALIGN((u8 *) - crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN); + unsigned long align = AESNI_ALIGN; + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + return PTR_ALIGN(crypto_aead_ctx(tfm), align); } #endif @@ -792,36 +793,27 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, #endif #ifdef CONFIG_X86_64 -static int rfc4106_init(struct crypto_tfm *tfm) +static int rfc4106_init(struct crypto_aead *aead) { struct cryptd_aead *cryptd_tfm; - struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *) - PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); - struct crypto_aead *cryptd_child; - struct aesni_rfc4106_gcm_ctx *child_ctx; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", CRYPTO_ALG_INTERNAL, CRYPTO_ALG_INTERNAL); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); - cryptd_child = cryptd_aead_child(cryptd_tfm); - child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child); - memcpy(child_ctx, ctx, sizeof(*ctx)); - ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_aead.reqsize = sizeof(struct aead_request) - + crypto_aead_reqsize(&cryptd_tfm->base); + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); return 0; } -static void rfc4106_exit(struct crypto_tfm *tfm) +static void rfc4106_exit(struct crypto_aead *aead) { - struct aesni_rfc4106_gcm_ctx *ctx = - (struct aesni_rfc4106_gcm_ctx *) - PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); - if (!IS_ERR(ctx->cryptd_tfm)) - cryptd_free_aead(ctx->cryptd_tfm); - return; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); } static void @@ -847,8 +839,6 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) if (IS_ERR(ctr_tfm)) return PTR_ERR(ctr_tfm); - crypto_ablkcipher_clear_flags(ctr_tfm, ~0); - ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); if (ret) goto out_free_ablkcipher; @@ -897,73 +887,29 @@ out_free_ablkcipher: static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, unsigned int key_len) { - int ret = 0; - struct crypto_tfm *tfm = crypto_aead_tfm(aead); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); - u8 *new_key_align, 
*new_key_mem = NULL; if (key_len < 4) { - crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } /*Account for 4 byte nonce at the end.*/ key_len -= 4; - if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256) { - crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); - /*This must be on a 16 byte boundary!*/ - if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN) - return -EINVAL; - - if ((unsigned long)key % AESNI_ALIGN) { - /*key is not aligned: use an auxuliar aligned pointer*/ - new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL); - if (!new_key_mem) - return -ENOMEM; - - new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN); - memcpy(new_key_align, key, key_len); - key = new_key_align; - } - if (!irq_fpu_usable()) - ret = crypto_aes_expand_key(&(ctx->aes_key_expanded), - key, key_len); - else { - kernel_fpu_begin(); - ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len); - kernel_fpu_end(); - } - /*This must be on a 16 byte boundary!*/ - if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) { - ret = -EINVAL; - goto exit; - } - ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); -exit: - kfree(new_key_mem); - return ret; + return aes_set_key_common(crypto_aead_tfm(aead), + &ctx->aes_key_expanded, key, key_len) ?: + rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); } static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, unsigned int key_len) { - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); - struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); - struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child); - struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm; - int ret; + struct cryptd_aead **ctx = crypto_aead_ctx(parent); + struct cryptd_aead *cryptd_tfm = *ctx; - ret = crypto_aead_setkey(child, key, key_len); - if (!ret) { - memcpy(ctx, c_ctx, sizeof(*ctx)); - ctx->cryptd_tfm = cryptd_tfm; - } - return ret; + return crypto_aead_setkey(&cryptd_tfm->base, key, key_len); } static int common_rfc4106_set_authsize(struct crypto_aead *aead, @@ -977,7 +923,7 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead, default: return -EINVAL; } - crypto_aead_crt(aead)->authsize = authsize; + return 0; } @@ -986,44 +932,31 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead, static int rfc4106_set_authsize(struct crypto_aead *parent, unsigned int authsize) { - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); - struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); - int ret; + struct cryptd_aead **ctx = crypto_aead_ctx(parent); + struct cryptd_aead *cryptd_tfm = *ctx; - ret = crypto_aead_setauthsize(child, authsize); - if (!ret) - crypto_aead_crt(parent)->authsize = authsize; - return ret; + return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); } -static int __driver_rfc4106_encrypt(struct aead_request *req) +static int helper_rfc4106_encrypt(struct aead_request *req) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - u32 key_len = ctx->aes_key_expanded.key_length; void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 
iv_tab[16+AESNI_ALIGN]; - u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); struct scatter_walk src_sg_walk; - struct scatter_walk assoc_sg_walk; struct scatter_walk dst_sg_walk; unsigned int i; /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length equal */ - /* to 8 or 12 bytes */ - if (unlikely(req->assoclen != 8 && req->assoclen != 12)) + /* to 16 or 20 bytes */ + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) return -EINVAL; - if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) - return -EINVAL; - if (unlikely(key_len != AES_KEYSIZE_128 && - key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256)) - return -EINVAL; /* IV below built */ for (i = 0; i < 4; i++) @@ -1032,55 +965,57 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) *(iv+4+i) = req->iv[i]; *((__be32 *)(iv+12)) = counter; - if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { + if (sg_is_last(req->src) && + req->src->offset + req->src->length <= PAGE_SIZE && + sg_is_last(req->dst) && + req->dst->offset + req->dst->length <= PAGE_SIZE) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); - scatterwalk_start(&assoc_sg_walk, req->assoc); - src = scatterwalk_map(&src_sg_walk); - assoc = scatterwalk_map(&assoc_sg_walk); + assoc = scatterwalk_map(&src_sg_walk); + src = assoc + req->assoclen; dst = src; if (unlikely(req->src != req->dst)) { scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk); + dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; } - } else { /* Allocate memory for src, dst, assoc */ - src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, + assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, GFP_ATOMIC); - if (unlikely(!src)) + if (unlikely(!assoc)) return -ENOMEM; - assoc = (src + req->cryptlen + auth_tag_len); - scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); - scatterwalk_map_and_copy(assoc, req->assoc, 0, - req->assoclen, 0); + scatterwalk_map_and_copy(assoc, req->src, 0, + req->assoclen + req->cryptlen, 0); + src = assoc + req->assoclen; dst = src; } - aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, - ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst - + ((unsigned long)req->cryptlen), auth_tag_len); + kernel_fpu_begin(); + aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv, + ctx->hash_subkey, assoc, req->assoclen - 8, + dst + req->cryptlen, auth_tag_len); + kernel_fpu_end(); /* The authTag (aka the Integrity Check Value) needs to be written * back to the packet. 
*/ if (one_entry_in_sg) { if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst); - scatterwalk_done(&dst_sg_walk, 0, 0); + scatterwalk_unmap(dst - req->assoclen); + scatterwalk_advance(&dst_sg_walk, req->dst->length); + scatterwalk_done(&dst_sg_walk, 1, 0); } - scatterwalk_unmap(src); scatterwalk_unmap(assoc); - scatterwalk_done(&src_sg_walk, 0, 0); - scatterwalk_done(&assoc_sg_walk, 0, 0); + scatterwalk_advance(&src_sg_walk, req->src->length); + scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); } else { - scatterwalk_map_and_copy(dst, req->dst, 0, - req->cryptlen + auth_tag_len, 1); - kfree(src); + scatterwalk_map_and_copy(dst, req->dst, req->assoclen, + req->cryptlen + auth_tag_len, 1); + kfree(assoc); } return 0; } -static int __driver_rfc4106_decrypt(struct aead_request *req) +static int helper_rfc4106_decrypt(struct aead_request *req) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; @@ -1089,30 +1024,20 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) int retval = 0; struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - u32 key_len = ctx->aes_key_expanded.key_length; void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv_and_authTag[32+AESNI_ALIGN]; - u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN); - u8 *authTag = iv + 16; + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + u8 authTag[16]; struct scatter_walk src_sg_walk; - struct scatter_walk assoc_sg_walk; struct scatter_walk dst_sg_walk; unsigned int i; - if (unlikely((req->cryptlen < auth_tag_len) || - (req->assoclen != 8 && req->assoclen != 12))) + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) return -EINVAL; - if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) - return -EINVAL; - if (unlikely(key_len != AES_KEYSIZE_128 && - key_len != AES_KEYSIZE_192 && - key_len != AES_KEYSIZE_256)) - return -EINVAL; /* Assuming we are supporting rfc4106 64-bit extended */ /* sequence numbers We need to have the AAD length */ - /* equal to 8 or 12 bytes */ + /* equal to 16 or 20 bytes */ tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); /* IV below built */ @@ -1122,33 +1047,36 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) *(iv+4+i) = req->iv[i]; *((__be32 *)(iv+12)) = counter; - if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { + if (sg_is_last(req->src) && + req->src->offset + req->src->length <= PAGE_SIZE && + sg_is_last(req->dst) && + req->dst->offset + req->dst->length <= PAGE_SIZE) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); - scatterwalk_start(&assoc_sg_walk, req->assoc); - src = scatterwalk_map(&src_sg_walk); - assoc = scatterwalk_map(&assoc_sg_walk); + assoc = scatterwalk_map(&src_sg_walk); + src = assoc + req->assoclen; dst = src; if (unlikely(req->src != req->dst)) { scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk); + dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; } } else { /* Allocate memory for src, dst, assoc */ - src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); - if (!src) + assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); + if (!assoc) return -ENOMEM; - assoc = (src + req->cryptlen); - scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); - scatterwalk_map_and_copy(assoc, req->assoc, 0, - req->assoclen, 0); + scatterwalk_map_and_copy(assoc, req->src, 0, + req->assoclen + req->cryptlen, 0); + src = 
assoc + req->assoclen; dst = src; } + kernel_fpu_begin(); aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, - ctx->hash_subkey, assoc, (unsigned long)req->assoclen, - authTag, auth_tag_len); + ctx->hash_subkey, assoc, req->assoclen - 8, + authTag, auth_tag_len); + kernel_fpu_end(); /* Compare generated tag with passed in tag. */ retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ? @@ -1156,90 +1084,45 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) if (one_entry_in_sg) { if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst); - scatterwalk_done(&dst_sg_walk, 0, 0); + scatterwalk_unmap(dst - req->assoclen); + scatterwalk_advance(&dst_sg_walk, req->dst->length); + scatterwalk_done(&dst_sg_walk, 1, 0); } - scatterwalk_unmap(src); scatterwalk_unmap(assoc); - scatterwalk_done(&src_sg_walk, 0, 0); - scatterwalk_done(&assoc_sg_walk, 0, 0); + scatterwalk_advance(&src_sg_walk, req->src->length); + scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); } else { - scatterwalk_map_and_copy(dst, req->dst, 0, tempCipherLen, 1); - kfree(src); + scatterwalk_map_and_copy(dst, req->dst, req->assoclen, + tempCipherLen, 1); + kfree(assoc); } return retval; } static int rfc4106_encrypt(struct aead_request *req) { - int ret; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + struct cryptd_aead **ctx = crypto_aead_ctx(tfm); + struct cryptd_aead *cryptd_tfm = *ctx; - if (!irq_fpu_usable()) { - struct aead_request *cryptd_req = - (struct aead_request *) aead_request_ctx(req); + aead_request_set_tfm(req, irq_fpu_usable() ? + cryptd_aead_child(cryptd_tfm) : + &cryptd_tfm->base); - memcpy(cryptd_req, req, sizeof(*req)); - aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - ret = crypto_aead_encrypt(cryptd_req); - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_encrypt(req); - kernel_fpu_end(); - } - return ret; + return crypto_aead_encrypt(req); } static int rfc4106_decrypt(struct aead_request *req) { - int ret; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - - if (!irq_fpu_usable()) { - struct aead_request *cryptd_req = - (struct aead_request *) aead_request_ctx(req); - - memcpy(cryptd_req, req, sizeof(*req)); - aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - ret = crypto_aead_decrypt(cryptd_req); - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_decrypt(req); - kernel_fpu_end(); - } - return ret; -} - -static int helper_rfc4106_encrypt(struct aead_request *req) -{ - int ret; - - if (unlikely(!irq_fpu_usable())) { - WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); - ret = -EINVAL; - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_encrypt(req); - kernel_fpu_end(); - } - return ret; -} + struct cryptd_aead **ctx = crypto_aead_ctx(tfm); + struct cryptd_aead *cryptd_tfm = *ctx; -static int helper_rfc4106_decrypt(struct aead_request *req) -{ - int ret; + aead_request_set_tfm(req, irq_fpu_usable() ? 
+ cryptd_aead_child(cryptd_tfm) : + &cryptd_tfm->base); - if (unlikely(!irq_fpu_usable())) { - WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); - ret = -EINVAL; - } else { - kernel_fpu_begin(); - ret = __driver_rfc4106_decrypt(req); - kernel_fpu_end(); - } - return ret; + return crypto_aead_decrypt(req); } #endif @@ -1412,51 +1295,6 @@ static struct crypto_alg aesni_algs[] = { { .geniv = "chainiv", }, }, -}, { - .cra_name = "__gcm-aes-aesni", - .cra_driver_name = "__driver-gcm-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + - AESNI_ALIGN, - .cra_alignmask = 0, - .cra_type = &crypto_aead_type, - .cra_module = THIS_MODULE, - .cra_u = { - .aead = { - .setkey = common_rfc4106_set_key, - .setauthsize = common_rfc4106_set_authsize, - .encrypt = helper_rfc4106_encrypt, - .decrypt = helper_rfc4106_decrypt, - .ivsize = 8, - .maxauthsize = 16, - }, - }, -}, { - .cra_name = "rfc4106(gcm(aes))", - .cra_driver_name = "rfc4106-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + - AESNI_ALIGN, - .cra_alignmask = 0, - .cra_type = &crypto_nivaead_type, - .cra_module = THIS_MODULE, - .cra_init = rfc4106_init, - .cra_exit = rfc4106_exit, - .cra_u = { - .aead = { - .setkey = rfc4106_set_key, - .setauthsize = rfc4106_set_authsize, - .encrypt = rfc4106_encrypt, - .decrypt = rfc4106_decrypt, - .geniv = "seqiv", - .ivsize = 8, - .maxauthsize = 16, - }, - }, #endif #if IS_ENABLED(CONFIG_CRYPTO_PCBC) }, { @@ -1571,6 +1409,46 @@ static struct crypto_alg aesni_algs[] = { { }, } }; +#ifdef CONFIG_X86_64 +static struct aead_alg aesni_aead_algs[] = { { + .setkey = common_rfc4106_set_key, + .setauthsize = common_rfc4106_set_authsize, + .encrypt = helper_rfc4106_encrypt, + .decrypt = helper_rfc4106_decrypt, + .ivsize = 8, + .maxauthsize = 16, + .base = { + .cra_name = "__gcm-aes-aesni", + .cra_driver_name = "__driver-gcm-aes-aesni", + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), + .cra_alignmask = AESNI_ALIGN - 1, + .cra_module = THIS_MODULE, + }, +}, { + .init = rfc4106_init, + .exit = rfc4106_exit, + .setkey = rfc4106_set_key, + .setauthsize = rfc4106_set_authsize, + .encrypt = rfc4106_encrypt, + .decrypt = rfc4106_decrypt, + .ivsize = 8, + .maxauthsize = 16, + .base = { + .cra_name = "rfc4106(gcm(aes))", + .cra_driver_name = "rfc4106-gcm-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct cryptd_aead *), + .cra_module = THIS_MODULE, + }, +} }; +#else +static struct aead_alg aesni_aead_algs[0]; +#endif + static const struct x86_cpu_id aesni_cpu_id[] = { X86_FEATURE_MATCH(X86_FEATURE_AES), @@ -1618,17 +1496,33 @@ static int __init aesni_init(void) if (err) return err; - return crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); + err = crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); + if (err) + goto fpu_exit; + + err = crypto_register_aeads(aesni_aead_algs, + ARRAY_SIZE(aesni_aead_algs)); + if (err) + goto unregister_algs; + + return err; + +unregister_algs: + crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); +fpu_exit: + crypto_fpu_exit(); + return err; } static void __exit aesni_exit(void) { + crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); crypto_unregister_algs(aesni_algs, 
ARRAY_SIZE(aesni_algs)); crypto_fpu_exit(); } -module_init(aesni_init); +late_initcall(aesni_init); module_exit(aesni_exit); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized"); diff --git a/kernel/arch/x86/crypto/camellia_aesni_avx2_glue.c b/kernel/arch/x86/crypto/camellia_aesni_avx2_glue.c index baf0ac21a..d84456924 100644 --- a/kernel/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/kernel/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -19,8 +19,7 @@ #include <crypto/ctr.h> #include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/camellia.h> #include <asm/crypto/glue_helper.h> @@ -561,16 +560,16 @@ static struct crypto_alg cmll_algs[10] = { { static int __init camellia_aesni_init(void) { - u64 xcr0; + const char *feature_name; if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { pr_info("AVX2 or AES-NI instructions are not detected.\n"); return -ENODEV; } - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX2 detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/camellia_aesni_avx_glue.c b/kernel/arch/x86/crypto/camellia_aesni_avx_glue.c index 78818a1e7..93d8f2957 100644 --- a/kernel/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/kernel/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -19,8 +19,7 @@ #include <crypto/ctr.h> #include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/camellia.h> #include <asm/crypto/glue_helper.h> @@ -553,16 +552,16 @@ static struct crypto_alg cmll_algs[10] = { { static int __init camellia_aesni_init(void) { - u64 xcr0; + const char *feature_name; if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/cast5_avx_glue.c b/kernel/arch/x86/crypto/cast5_avx_glue.c index f799ec36b..d7699130e 100644 --- a/kernel/arch/x86/crypto/cast5_avx_glue.c +++ b/kernel/arch/x86/crypto/cast5_avx_glue.c @@ -31,8 +31,7 @@ #include <crypto/cast5.h> #include <crypto/cryptd.h> #include <crypto/ctr.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/glue_helper.h> #define CAST5_PARALLEL_BLOCKS 16 @@ -465,16 +464,11 @@ static struct crypto_alg cast5_algs[6] = { { static int __init cast5_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - pr_info("AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/cast6_avx_glue.c 
b/kernel/arch/x86/crypto/cast6_avx_glue.c index f448810ca..fca459578 100644 --- a/kernel/arch/x86/crypto/cast6_avx_glue.c +++ b/kernel/arch/x86/crypto/cast6_avx_glue.c @@ -36,8 +36,7 @@ #include <crypto/ctr.h> #include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/glue_helper.h> #define CAST6_PARALLEL_BLOCKS 8 @@ -590,16 +589,11 @@ static struct crypto_alg cast6_algs[10] = { { static int __init cast6_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - pr_info("AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/chacha20-avx2-x86_64.S b/kernel/arch/x86/crypto/chacha20-avx2-x86_64.S new file mode 100644 index 000000000..16694e625 --- /dev/null +++ b/kernel/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -0,0 +1,443 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.data +.align 32 + +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 + .octa 0x0e0d0c0f0a09080b0605040702010003 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 + .octa 0x0d0c0f0e09080b0a0504070601000302 +CTRINC: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + +ENTRY(chacha20_8block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: 8 data blocks output, o + # %rdx: 8 data blocks input, i + + # This function encrypts eight consecutive ChaCha20 blocks by loading + # the state matrix in AVX registers eight times. As we need some + # scratch registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32-, 64- and then 128-bit + # words, which allows us to do XOR in AVX registers. 8/16-bit word + # rotation is done with the slightly better performing byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. 
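For orientation while reading the assembly, this is the scalar ChaCha20 double round (RFC 7539) that each .Ldoubleround8 iteration performs on eight blocks at once; every x_n in the code below is a %ymm register holding word n of all eight blocks. A minimal C sketch only, not kernel code; rotl32, quarter_round and double_round are illustrative names:

        #include <stdint.h>

        static uint32_t rotl32(uint32_t v, int n)
        {
                return (v << n) | (v >> (32 - n));
        }

        /* a += b; d = rotl(d ^ a, 16); c += d; b = rotl(b ^ c, 12); then 8 and 7 */
        static void quarter_round(uint32_t x[16], int a, int b, int c, int d)
        {
                x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
                x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
                x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
                x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
        }

        /* one double round = column round + diagonal round; ChaCha20 runs ten */
        static void double_round(uint32_t x[16])
        {
                quarter_round(x, 0, 4,  8, 12);   /* columns */
                quarter_round(x, 1, 5,  9, 13);
                quarter_round(x, 2, 6, 10, 14);
                quarter_round(x, 3, 7, 11, 15);
                quarter_round(x, 0, 5, 10, 15);   /* diagonals */
                quarter_round(x, 1, 6, 11, 12);
                quarter_round(x, 2, 7,  8, 13);
                quarter_round(x, 3, 4,  9, 14);
        }

In the vector code the 16- and 8-bit rotations become the ROT16/ROT8 vpshufb byte shuffles, the 12- and 7-bit rotations become the vpslld/vpsrld/vpor triples, and the loop runs ten times for the 20-round cipher.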
+ + vzeroupper + # 4 * 32 byte stack, 32-byte aligned + mov %rsp, %r8 + and $~31, %rsp + sub $0x80, %rsp + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + # x0..3 on stack + vmovdqa %ymm0,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm3,0x60(%rsp) + + vmovdqa CTRINC(%rip),%ymm1 + vmovdqa ROT8(%rip),%ymm2 + vmovdqa ROT16(%rip),%ymm3 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + mov $10,%ecx + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld 
$25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + dec %ecx + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpaddd 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpbroadcastd 0x04(%rdi),%ymm0 + vpaddd 0x20(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpbroadcastd 0x08(%rdi),%ymm0 + vpaddd 0x40(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpbroadcastd 0x0c(%rdi),%ymm0 + vpaddd 0x60(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpbroadcastd 0x10(%rdi),%ymm0 + vpaddd %ymm0,%ymm4,%ymm4 + vpbroadcastd 0x14(%rdi),%ymm0 + vpaddd %ymm0,%ymm5,%ymm5 + vpbroadcastd 0x18(%rdi),%ymm0 + vpaddd %ymm0,%ymm6,%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm0 + vpaddd %ymm0,%ymm7,%ymm7 + vpbroadcastd 0x20(%rdi),%ymm0 + vpaddd %ymm0,%ymm8,%ymm8 + vpbroadcastd 0x24(%rdi),%ymm0 + vpaddd %ymm0,%ymm9,%ymm9 + vpbroadcastd 0x28(%rdi),%ymm0 + vpaddd %ymm0,%ymm10,%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm0 + 
vpaddd %ymm0,%ymm11,%ymm11 + vpbroadcastd 0x30(%rdi),%ymm0 + vpaddd %ymm0,%ymm12,%ymm12 + vpbroadcastd 0x34(%rdi),%ymm0 + vpaddd %ymm0,%ymm13,%ymm13 + vpbroadcastd 0x38(%rdi),%ymm0 + vpaddd %ymm0,%ymm14,%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm0 + vpaddd %ymm0,%ymm15,%ymm15 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + # interleave 32-bit words in state n, n+1 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm1,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpckldq %ymm5,%ymm0,%ymm4 + vpunpckhdq %ymm5,%ymm0,%ymm5 + vmovdqa %ymm6,%ymm0 + vpunpckldq %ymm7,%ymm0,%ymm6 + vpunpckhdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpckldq %ymm9,%ymm0,%ymm8 + vpunpckhdq %ymm9,%ymm0,%ymm9 + vmovdqa %ymm10,%ymm0 + vpunpckldq %ymm11,%ymm0,%ymm10 + vpunpckhdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpckldq %ymm13,%ymm0,%ymm12 + vpunpckhdq %ymm13,%ymm0,%ymm13 + vmovdqa %ymm14,%ymm0 + vpunpckldq %ymm15,%ymm0,%ymm14 + vpunpckhdq %ymm15,%ymm0,%ymm15 + + # interleave 64-bit words in state n, n+2 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x40(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpcklqdq %ymm6,%ymm0,%ymm4 + vpunpckhqdq %ymm6,%ymm0,%ymm6 + vmovdqa %ymm5,%ymm0 + vpunpcklqdq %ymm7,%ymm0,%ymm5 + vpunpckhqdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpcklqdq %ymm10,%ymm0,%ymm8 + vpunpckhqdq %ymm10,%ymm0,%ymm10 + vmovdqa %ymm9,%ymm0 + vpunpcklqdq %ymm11,%ymm0,%ymm9 + vpunpckhqdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpcklqdq %ymm14,%ymm0,%ymm12 + vpunpckhqdq %ymm14,%ymm0,%ymm14 + vmovdqa %ymm13,%ymm0 + vpunpcklqdq %ymm15,%ymm0,%ymm13 + vpunpckhqdq %ymm15,%ymm0,%ymm15 + + # interleave 128-bit words in state n, n+4 + vmovdqa 0x00(%rsp),%ymm0 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vperm2i128 $0x20,%ymm5,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm5,%ymm0,%ymm5 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vperm2i128 $0x20,%ymm6,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm6,%ymm0,%ymm6 + vmovdqa %ymm1,0x40(%rsp) + vmovdqa 0x60(%rsp),%ymm0 + vperm2i128 $0x20,%ymm7,%ymm0,%ymm1 + vperm2i128 $0x31,%ymm7,%ymm0,%ymm7 + vmovdqa %ymm1,0x60(%rsp) + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + vmovdqa %ymm0,%ymm8 + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + vmovdqa %ymm0,%ymm9 + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + vmovdqa %ymm0,%ymm10 + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + vmovdqa %ymm0,%ymm11 + + # xor with corresponding input, write to output + vmovdqa 0x00(%rsp),%ymm0 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + vmovdqa 0x20(%rsp),%ymm0 + vpxor 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0080(%rsi) + vmovdqa 0x40(%rsp),%ymm0 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vmovdqa 0x60(%rsp),%ymm0 + vpxor 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00c0(%rsi) + vpxor 
0x0100(%rdx),%ymm4,%ymm4 + vmovdqu %ymm4,0x0100(%rsi) + vpxor 0x0180(%rdx),%ymm5,%ymm5 + vmovdqu %ymm5,0x00180(%rsi) + vpxor 0x0140(%rdx),%ymm6,%ymm6 + vmovdqu %ymm6,0x0140(%rsi) + vpxor 0x01c0(%rdx),%ymm7,%ymm7 + vmovdqu %ymm7,0x01c0(%rsi) + vpxor 0x0020(%rdx),%ymm8,%ymm8 + vmovdqu %ymm8,0x0020(%rsi) + vpxor 0x00a0(%rdx),%ymm9,%ymm9 + vmovdqu %ymm9,0x00a0(%rsi) + vpxor 0x0060(%rdx),%ymm10,%ymm10 + vmovdqu %ymm10,0x0060(%rsi) + vpxor 0x00e0(%rdx),%ymm11,%ymm11 + vmovdqu %ymm11,0x00e0(%rsi) + vpxor 0x0120(%rdx),%ymm12,%ymm12 + vmovdqu %ymm12,0x0120(%rsi) + vpxor 0x01a0(%rdx),%ymm13,%ymm13 + vmovdqu %ymm13,0x01a0(%rsi) + vpxor 0x0160(%rdx),%ymm14,%ymm14 + vmovdqu %ymm14,0x0160(%rsi) + vpxor 0x01e0(%rdx),%ymm15,%ymm15 + vmovdqu %ymm15,0x01e0(%rsi) + + vzeroupper + mov %r8,%rsp + ret +ENDPROC(chacha20_8block_xor_avx2) diff --git a/kernel/arch/x86/crypto/chacha20-ssse3-x86_64.S b/kernel/arch/x86/crypto/chacha20-ssse3-x86_64.S new file mode 100644 index 000000000..3a33124e9 --- /dev/null +++ b/kernel/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -0,0 +1,627 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.data +.align 16 + +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +CTRINC: .octa 0x00000003000000020000000100000000 + +.text + +ENTRY(chacha20_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: 1 data block output, o + # %rdx: 1 data block input, i + + # This function encrypts one ChaCha20 block by loading the state matrix + # in four SSE registers. It performs matrix operation on four words in + # parallel, but requireds shuffling to rearrange the words after each + # round. 8/16-bit word rotation is done with the slightly better + # performing SSSE3 byte shuffling, 7/12-bit word rotation uses + # traditional shift+OR. 
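The word rearrangement the comment above mentions can be pictured in C with four-lane arrays standing in for %xmm0..%xmm3: each register holds one row of the 4x4 state, a round updates all four columns at once, and the pshufd $0x39/$0x4e/$0x93 immediates rotate the lanes so the diagonals line up as columns for the second half of the double round. A sketch under that reading only; the helper names are not kernel symbols:

        #include <stdint.h>

        typedef struct { uint32_t w[4]; } row_t;        /* stands in for one xmm */

        static uint32_t rotl32(uint32_t v, int n)
        {
                return (v << n) | (v >> (32 - n));
        }

        /* pshufd with an immediate that rotates the four lanes left by n */
        static row_t rotate_lanes(row_t r, int n)
        {
                row_t out;
                for (int i = 0; i < 4; i++)
                        out.w[i] = r.w[(i + n) & 3];
                return out;
        }

        /* one quarter round applied to all four columns at once */
        static void round4(row_t *a, row_t *b, row_t *c, row_t *d)
        {
                for (int i = 0; i < 4; i++) {
                        a->w[i] += b->w[i]; d->w[i] = rotl32(d->w[i] ^ a->w[i], 16);
                        c->w[i] += d->w[i]; b->w[i] = rotl32(b->w[i] ^ c->w[i], 12);
                        a->w[i] += b->w[i]; d->w[i] = rotl32(d->w[i] ^ a->w[i], 8);
                        c->w[i] += d->w[i]; b->w[i] = rotl32(b->w[i] ^ c->w[i], 7);
                }
        }

        static void double_round_rows(row_t x[4])
        {
                round4(&x[0], &x[1], &x[2], &x[3]);     /* column round */
                /* line diagonals up as columns: pshufd $0x39, $0x4e, $0x93 */
                x[1] = rotate_lanes(x[1], 1);
                x[2] = rotate_lanes(x[2], 2);
                x[3] = rotate_lanes(x[3], 3);
                round4(&x[0], &x[1], &x[2], &x[3]);     /* diagonal round */
                /* restore the row layout: pshufd $0x93, $0x4e, $0x39 */
                x[1] = rotate_lanes(x[1], 3);
                x[2] = rotate_lanes(x[2], 2);
                x[3] = rotate_lanes(x[3], 1);
        }

The four-block variant below avoids these shuffles entirely by keeping one register per state word, which is why it spills x0..x3 to the stack instead.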
+ + # x0..3 = s0..3 + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + movdqa ROT8(%rip),%xmm4 + movdqa ROT16(%rip),%xmm5 + + mov $10,%ecx + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm3,%xmm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm3,%xmm3 + + dec %ecx + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + movdqu 0x00(%rdx),%xmm4 + paddd %xmm8,%xmm0 + pxor %xmm4,%xmm0 + movdqu %xmm0,0x00(%rsi) + # o1 = i1 ^ (x1 + s1) + movdqu 0x10(%rdx),%xmm5 + paddd %xmm9,%xmm1 + pxor %xmm5,%xmm1 + movdqu %xmm1,0x10(%rsi) + # o2 = i2 ^ (x2 + s2) + movdqu 0x20(%rdx),%xmm6 + paddd %xmm10,%xmm2 + pxor %xmm6,%xmm2 + movdqu %xmm2,0x20(%rsi) + # o3 = i3 ^ (x3 + s3) + movdqu 0x30(%rdx),%xmm7 + paddd %xmm11,%xmm3 + pxor %xmm7,%xmm3 + movdqu %xmm3,0x30(%rsi) + + ret +ENDPROC(chacha20_block_xor_ssse3) + +ENTRY(chacha20_4block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: 4 data blocks output, o + # %rdx: 4 data blocks input, i + + # This function encrypts four consecutive ChaCha20 blocks by loading the + # the state matrix in SSE registers four times. As we need some scratch + # registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32- and then 64-bit words, + # which allows us to do XOR in SSE registers. 8/16-bit word rotation is + # done with the slightly better performing SSSE3 byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. 
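The "interleave 32- and then 64-bit words" transpose mentioned above is easier to see in C, with uint32_t[4] vectors standing in for xmm registers: two interleave passes turn rows of "word n of blocks 0..3" into rows of "words n..n+3 of one block", ready to be XORed against 16 contiguous input bytes. Illustrative sketch only; the helpers mirror punpck{l,h}dq and punpck{l,h}qdq but are not kernel code:

        #include <stdint.h>

        typedef struct { uint32_t w[4]; } vec4;

        /* punpckldq: interleave the low two 32-bit lanes of a and b */
        static vec4 unpack_lo32(vec4 a, vec4 b)
        {
                return (vec4){ { a.w[0], b.w[0], a.w[1], b.w[1] } };
        }

        /* punpckhdq: interleave the high two 32-bit lanes of a and b */
        static vec4 unpack_hi32(vec4 a, vec4 b)
        {
                return (vec4){ { a.w[2], b.w[2], a.w[3], b.w[3] } };
        }

        /* punpcklqdq / punpckhqdq: interleave the 64-bit halves of a and b */
        static vec4 unpack_lo64(vec4 a, vec4 b)
        {
                return (vec4){ { a.w[0], a.w[1], b.w[0], b.w[1] } };
        }

        static vec4 unpack_hi64(vec4 a, vec4 b)
        {
                return (vec4){ { a.w[2], a.w[3], b.w[2], b.w[3] } };
        }

        /* in:  x[i].w[j] = state word i of block j
         * out: x[j]      = four consecutive state words of block j */
        static void transpose4x4(vec4 x[4])
        {
                vec4 t0 = unpack_lo32(x[0], x[1]);      /* x0[0] x1[0] x0[1] x1[1] */
                vec4 t1 = unpack_hi32(x[0], x[1]);      /* x0[2] x1[2] x0[3] x1[3] */
                vec4 t2 = unpack_lo32(x[2], x[3]);      /* x2[0] x3[0] x2[1] x3[1] */
                vec4 t3 = unpack_hi32(x[2], x[3]);      /* x2[2] x3[2] x2[3] x3[3] */

                x[0] = unpack_lo64(t0, t2);             /* block 0: words n..n+3 */
                x[1] = unpack_hi64(t0, t2);             /* block 1 */
                x[2] = unpack_lo64(t1, t3);             /* block 2 */
                x[3] = unpack_hi64(t1, t3);             /* block 3 */
        }

The assembly applies this transpose four times, once per group of four state words, with half of the state living on the stack.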
+ + mov %rsp,%r11 + sub $0x80,%rsp + and $~63,%rsp + + # x0..15[0-3] = s0..3[0..3] + movq 0x00(%rdi),%xmm1 + pshufd $0x00,%xmm1,%xmm0 + pshufd $0x55,%xmm1,%xmm1 + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + movq 0x10(%rdi),%xmm5 + pshufd $0x00,%xmm5,%xmm4 + pshufd $0x55,%xmm5,%xmm5 + movq 0x18(%rdi),%xmm7 + pshufd $0x00,%xmm7,%xmm6 + pshufd $0x55,%xmm7,%xmm7 + movq 0x20(%rdi),%xmm9 + pshufd $0x00,%xmm9,%xmm8 + pshufd $0x55,%xmm9,%xmm9 + movq 0x28(%rdi),%xmm11 + pshufd $0x00,%xmm11,%xmm10 + pshufd $0x55,%xmm11,%xmm11 + movq 0x30(%rdi),%xmm13 + pshufd $0x00,%xmm13,%xmm12 + pshufd $0x55,%xmm13,%xmm13 + movq 0x38(%rdi),%xmm15 + pshufd $0x00,%xmm15,%xmm14 + pshufd $0x55,%xmm15,%xmm15 + # x0..3 on stack + movdqa %xmm0,0x00(%rsp) + movdqa %xmm1,0x10(%rsp) + movdqa %xmm2,0x20(%rsp) + movdqa %xmm3,0x30(%rsp) + + movdqa CTRINC(%rip),%xmm1 + movdqa ROT8(%rip),%xmm2 + movdqa ROT16(%rip),%xmm3 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + mov $10,%ecx + +.Ldoubleround4: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + 
movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + + dec %ecx + jnz .Ldoubleround4 + + # x0[0-3] += s0[0] + # x1[0-3] += s0[1] + movq 0x00(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x00(%rsp),%xmm2 + movdqa %xmm2,0x00(%rsp) + paddd 0x10(%rsp),%xmm3 + movdqa %xmm3,0x10(%rsp) + # x2[0-3] += s0[2] + # x3[0-3] += s0[3] + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x20(%rsp),%xmm2 + movdqa %xmm2,0x20(%rsp) + paddd 0x30(%rsp),%xmm3 + movdqa %xmm3,0x30(%rsp) + + # x4[0-3] += s1[0] + # x5[0-3] += s1[1] + movq 0x10(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + # x6[0-3] += s1[2] + # x7[0-3] += s1[3] + movq 0x18(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + + # x8[0-3] += s2[0] + # x9[0-3] += s2[1] + movq 0x20(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm8 + paddd %xmm3,%xmm9 + # x10[0-3] += s2[2] + # x11[0-3] += s2[3] + movq 0x28(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm10 
+ paddd %xmm3,%xmm11 + + # x12[0-3] += s3[0] + # x13[0-3] += s3[1] + movq 0x30(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm12 + paddd %xmm3,%xmm13 + # x14[0-3] += s3[2] + # x15[0-3] += s3[3] + movq 0x38(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm14 + paddd %xmm3,%xmm15 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + # interleave 32-bit words in state n, n+1 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x10(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x10(%rsp) + movdqa 0x20(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x20(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpckldq %xmm5,%xmm4 + punpckhdq %xmm5,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm6,%xmm0 + punpckldq %xmm7,%xmm6 + punpckhdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm9,%xmm0 + movdqa %xmm0,%xmm9 + movdqa %xmm10,%xmm0 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpckldq %xmm13,%xmm12 + punpckhdq %xmm13,%xmm0 + movdqa %xmm0,%xmm13 + movdqa %xmm14,%xmm0 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # interleave 64-bit words in state n, n+2 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x20(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x20(%rsp) + movdqa 0x10(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x10(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpcklqdq %xmm6,%xmm4 + punpckhqdq %xmm6,%xmm0 + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + punpcklqdq %xmm7,%xmm5 + punpckhqdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpcklqdq %xmm10,%xmm8 + punpckhqdq %xmm10,%xmm0 + movdqa %xmm0,%xmm10 + movdqa %xmm9,%xmm0 + punpcklqdq %xmm11,%xmm9 + punpckhqdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpcklqdq %xmm14,%xmm12 + punpckhqdq %xmm14,%xmm0 + movdqa %xmm0,%xmm14 + movdqa %xmm13,%xmm0 + punpcklqdq %xmm15,%xmm13 + punpckhqdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # xor with corresponding input, write to output + movdqa 0x00(%rsp),%xmm0 + movdqu 0x00(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x00(%rsi) + movdqa 0x10(%rsp),%xmm0 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + movdqa 0x20(%rsp),%xmm0 + movdqu 0x40(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x40(%rsi) + movdqa 0x30(%rsp),%xmm0 + movdqu 0xc0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xc0(%rsi) + movdqu 0x10(%rdx),%xmm1 + pxor %xmm1,%xmm4 + movdqu %xmm4,0x10(%rsi) + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm5 + movdqu %xmm5,0x90(%rsi) + movdqu 0x50(%rdx),%xmm1 + pxor %xmm1,%xmm6 + movdqu %xmm6,0x50(%rsi) + movdqu 0xd0(%rdx),%xmm1 + pxor %xmm1,%xmm7 + movdqu %xmm7,0xd0(%rsi) + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm8 + movdqu %xmm8,0x20(%rsi) + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm9 + movdqu %xmm9,0xa0(%rsi) + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm10 + movdqu %xmm10,0x60(%rsi) + movdqu 0xe0(%rdx),%xmm1 + pxor %xmm1,%xmm11 + movdqu %xmm11,0xe0(%rsi) + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm12 + movdqu %xmm12,0x30(%rsi) + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm13 + movdqu %xmm13,0xb0(%rsi) + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm14 + movdqu %xmm14,0x70(%rsi) + movdqu 
0xf0(%rdx),%xmm1 + pxor %xmm1,%xmm15 + movdqu %xmm15,0xf0(%rsi) + + mov %r11,%rsp + ret +ENDPROC(chacha20_4block_xor_ssse3) diff --git a/kernel/arch/x86/crypto/chacha20_glue.c b/kernel/arch/x86/crypto/chacha20_glue.c new file mode 100644 index 000000000..722bacea0 --- /dev/null +++ b/kernel/arch/x86/crypto/chacha20_glue.c @@ -0,0 +1,150 @@ +/* + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <crypto/algapi.h> +#include <crypto/chacha20.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/simd.h> + +#define CHACHA20_STATE_ALIGN 16 + +asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src); +#ifdef CONFIG_AS_AVX2 +asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src); +static bool chacha20_use_avx2; +#endif + +static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes) +{ + u8 buf[CHACHA20_BLOCK_SIZE]; + +#ifdef CONFIG_AS_AVX2 + if (chacha20_use_avx2) { + while (bytes >= CHACHA20_BLOCK_SIZE * 8) { + chacha20_8block_xor_avx2(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 8; + src += CHACHA20_BLOCK_SIZE * 8; + dst += CHACHA20_BLOCK_SIZE * 8; + state[12] += 8; + } + } +#endif + while (bytes >= CHACHA20_BLOCK_SIZE * 4) { + chacha20_4block_xor_ssse3(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE * 4; + src += CHACHA20_BLOCK_SIZE * 4; + dst += CHACHA20_BLOCK_SIZE * 4; + state[12] += 4; + } + while (bytes >= CHACHA20_BLOCK_SIZE) { + chacha20_block_xor_ssse3(state, dst, src); + bytes -= CHACHA20_BLOCK_SIZE; + src += CHACHA20_BLOCK_SIZE; + dst += CHACHA20_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha20_block_xor_ssse3(state, buf, buf); + memcpy(dst, buf, bytes); + } +} + +static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1]; + struct blkcipher_walk walk; + int err; + + if (!may_use_simd()) + return crypto_chacha20_crypt(desc, dst, src, nbytes); + + state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN); + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); + + crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); + + kernel_fpu_begin(); + + while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % CHACHA20_BLOCK_SIZE); + } + + if (walk.nbytes) { + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes); + err = blkcipher_walk_done(desc, &walk, 0); + } + + kernel_fpu_end(); + + return err; +} + +static struct crypto_alg alg = { + .cra_name = "chacha20", + .cra_driver_name = "chacha20-simd", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_type = &crypto_blkcipher_type, + .cra_ctxsize = sizeof(struct chacha20_ctx), + .cra_alignmask = sizeof(u32) - 1, + 
.cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = CHACHA20_KEY_SIZE, + .max_keysize = CHACHA20_KEY_SIZE, + .ivsize = CHACHA20_IV_SIZE, + .geniv = "seqiv", + .setkey = crypto_chacha20_setkey, + .encrypt = chacha20_simd, + .decrypt = chacha20_simd, + }, + }, +}; + +static int __init chacha20_simd_mod_init(void) +{ + if (!cpu_has_ssse3) + return -ENODEV; + +#ifdef CONFIG_AS_AVX2 + chacha20_use_avx2 = cpu_has_avx && cpu_has_avx2 && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); +#endif + return crypto_register_alg(&alg); +} + +static void __exit chacha20_simd_mod_fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(chacha20_simd_mod_init); +module_exit(chacha20_simd_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-simd"); diff --git a/kernel/arch/x86/crypto/crc32-pclmul_glue.c b/kernel/arch/x86/crypto/crc32-pclmul_glue.c index 1937fc1d8..07d2c6c86 100644 --- a/kernel/arch/x86/crypto/crc32-pclmul_glue.c +++ b/kernel/arch/x86/crypto/crc32-pclmul_glue.c @@ -35,7 +35,7 @@ #include <asm/cpufeature.h> #include <asm/cpu_device_id.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 diff --git a/kernel/arch/x86/crypto/crc32c-intel_glue.c b/kernel/arch/x86/crypto/crc32c-intel_glue.c index 28640c3d6..81a595d75 100644 --- a/kernel/arch/x86/crypto/crc32c-intel_glue.c +++ b/kernel/arch/x86/crypto/crc32c-intel_glue.c @@ -32,8 +32,7 @@ #include <asm/cpufeature.h> #include <asm/cpu_device_id.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 diff --git a/kernel/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/kernel/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 225be06ed..4fe27e074 100644 --- a/kernel/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/kernel/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -330,7 +330,7 @@ ENDPROC(crc_pcl) ## PCLMULQDQ tables ## Table is 128 entries x 2 words (8 bytes) each ################################################################ -.section .rotata, "a", %progbits +.section .rodata, "a", %progbits .align 8 K_table: .long 0x493c7d27, 0x00000001 diff --git a/kernel/arch/x86/crypto/crct10dif-pclmul_glue.c b/kernel/arch/x86/crypto/crct10dif-pclmul_glue.c index b6c67bf30..a3fcfc97a 100644 --- a/kernel/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/kernel/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -29,7 +29,7 @@ #include <linux/init.h> #include <linux/string.h> #include <linux/kernel.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <asm/cpufeature.h> #include <asm/cpu_device_id.h> diff --git a/kernel/arch/x86/crypto/fpu.c b/kernel/arch/x86/crypto/fpu.c index f368ba261..e7d679e2a 100644 --- a/kernel/arch/x86/crypto/fpu.c +++ b/kernel/arch/x86/crypto/fpu.c @@ -18,7 +18,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/crypto.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> struct crypto_fpu_ctx { struct crypto_blkcipher *child; @@ -156,7 +156,7 @@ int __init crypto_fpu_init(void) return crypto_register_template(&crypto_fpu_tmpl); } -void __exit crypto_fpu_exit(void) +void crypto_fpu_exit(void) { crypto_unregister_template(&crypto_fpu_tmpl); } diff --git a/kernel/arch/x86/crypto/ghash-clmulni-intel_glue.c b/kernel/arch/x86/crypto/ghash-clmulni-intel_glue.c index 
daf8d2b9a..440df0c7a 100644 --- a/kernel/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/kernel/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -19,7 +19,7 @@ #include <crypto/cryptd.h> #include <crypto/gf128mul.h> #include <crypto/internal/hash.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <asm/cpu_device_id.h> #define GHASH_BLOCK_SIZE 16 diff --git a/kernel/arch/x86/crypto/poly1305-avx2-x86_64.S b/kernel/arch/x86/crypto/poly1305-avx2-x86_64.S new file mode 100644 index 000000000..eff2f414e --- /dev/null +++ b/kernel/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -0,0 +1,386 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.data +.align 32 + +ANMASK: .octa 0x0000000003ffffff0000000003ffffff + .octa 0x0000000003ffffff0000000003ffffff +ORMASK: .octa 0x00000000010000000000000001000000 + .octa 0x00000000010000000000000001000000 + +.text + +#define h0 0x00(%rdi) +#define h1 0x04(%rdi) +#define h2 0x08(%rdi) +#define h3 0x0c(%rdi) +#define h4 0x10(%rdi) +#define r0 0x00(%rdx) +#define r1 0x04(%rdx) +#define r2 0x08(%rdx) +#define r3 0x0c(%rdx) +#define r4 0x10(%rdx) +#define u0 0x00(%r8) +#define u1 0x04(%r8) +#define u2 0x08(%r8) +#define u3 0x0c(%r8) +#define u4 0x10(%r8) +#define w0 0x14(%r8) +#define w1 0x18(%r8) +#define w2 0x1c(%r8) +#define w3 0x20(%r8) +#define w4 0x24(%r8) +#define y0 0x28(%r8) +#define y1 0x2c(%r8) +#define y2 0x30(%r8) +#define y3 0x34(%r8) +#define y4 0x38(%r8) +#define m %rsi +#define hc0 %ymm0 +#define hc1 %ymm1 +#define hc2 %ymm2 +#define hc3 %ymm3 +#define hc4 %ymm4 +#define hc0x %xmm0 +#define hc1x %xmm1 +#define hc2x %xmm2 +#define hc3x %xmm3 +#define hc4x %xmm4 +#define t1 %ymm5 +#define t2 %ymm6 +#define t1x %xmm5 +#define t2x %xmm6 +#define ruwy0 %ymm7 +#define ruwy1 %ymm8 +#define ruwy2 %ymm9 +#define ruwy3 %ymm10 +#define ruwy4 %ymm11 +#define ruwy0x %xmm7 +#define ruwy1x %xmm8 +#define ruwy2x %xmm9 +#define ruwy3x %xmm10 +#define ruwy4x %xmm11 +#define svxz1 %ymm12 +#define svxz2 %ymm13 +#define svxz3 %ymm14 +#define svxz4 %ymm15 +#define d0 %r9 +#define d1 %r10 +#define d2 %r11 +#define d3 %r12 +#define d4 %r13 + +ENTRY(poly1305_4block_avx2) + # %rdi: Accumulator h[5] + # %rsi: 64 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: Quadblock count + # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5], + + # This four-block variant uses loop unrolled block processing. 
It + # requires 4 Poly1305 keys: r, r^2, r^3 and r^4: + # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r + + vzeroupper + push %rbx + push %r12 + push %r13 + + # combine r0,u0,w0,y0 + vmovd y0,ruwy0x + vmovd w0,t1x + vpunpcklqdq t1,ruwy0,ruwy0 + vmovd u0,t1x + vmovd r0,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy0,ruwy0 + + # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5 + vmovd y1,ruwy1x + vmovd w1,t1x + vpunpcklqdq t1,ruwy1,ruwy1 + vmovd u1,t1x + vmovd r1,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy1,ruwy1 + vpslld $2,ruwy1,svxz1 + vpaddd ruwy1,svxz1,svxz1 + + # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5 + vmovd y2,ruwy2x + vmovd w2,t1x + vpunpcklqdq t1,ruwy2,ruwy2 + vmovd u2,t1x + vmovd r2,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy2,ruwy2 + vpslld $2,ruwy2,svxz2 + vpaddd ruwy2,svxz2,svxz2 + + # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5 + vmovd y3,ruwy3x + vmovd w3,t1x + vpunpcklqdq t1,ruwy3,ruwy3 + vmovd u3,t1x + vmovd r3,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy3,ruwy3 + vpslld $2,ruwy3,svxz3 + vpaddd ruwy3,svxz3,svxz3 + + # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5 + vmovd y4,ruwy4x + vmovd w4,t1x + vpunpcklqdq t1,ruwy4,ruwy4 + vmovd u4,t1x + vmovd r4,t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,ruwy4,ruwy4 + vpslld $2,ruwy4,svxz4 + vpaddd ruwy4,svxz4,svxz4 + +.Ldoblock4: + # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff, + # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0] + vmovd 0x00(m),hc0x + vmovd 0x10(m),t1x + vpunpcklqdq t1,hc0,hc0 + vmovd 0x20(m),t1x + vmovd 0x30(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc0,hc0 + vpand ANMASK(%rip),hc0,hc0 + vmovd h0,t1x + vpaddd t1,hc0,hc0 + # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff, + # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1] + vmovd 0x03(m),hc1x + vmovd 0x13(m),t1x + vpunpcklqdq t1,hc1,hc1 + vmovd 0x23(m),t1x + vmovd 0x33(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc1,hc1 + vpsrld $2,hc1,hc1 + vpand ANMASK(%rip),hc1,hc1 + vmovd h1,t1x + vpaddd t1,hc1,hc1 + # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff, + # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2] + vmovd 0x06(m),hc2x + vmovd 0x16(m),t1x + vpunpcklqdq t1,hc2,hc2 + vmovd 0x26(m),t1x + vmovd 0x36(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc2,hc2 + vpsrld $4,hc2,hc2 + vpand ANMASK(%rip),hc2,hc2 + vmovd h2,t1x + vpaddd t1,hc2,hc2 + # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff, + # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3] + vmovd 0x09(m),hc3x + vmovd 0x19(m),t1x + vpunpcklqdq t1,hc3,hc3 + vmovd 0x29(m),t1x + vmovd 0x39(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc3,hc3 + vpsrld $6,hc3,hc3 + vpand ANMASK(%rip),hc3,hc3 + vmovd h3,t1x + vpaddd t1,hc3,hc3 + # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24), + # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4] + vmovd 0x0c(m),hc4x + vmovd 0x1c(m),t1x + vpunpcklqdq t1,hc4,hc4 + vmovd 0x2c(m),t1x + vmovd 0x3c(m),t2x + vpunpcklqdq t2,t1,t1 + vperm2i128 $0x20,t1,hc4,hc4 + vpsrld $8,hc4,hc4 + vpor ORMASK(%rip),hc4,hc4 + vmovd h4,t1x + vpaddd t1,hc4,hc4 + + # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ] + vpmuludq hc0,ruwy0,t1 + # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ] + vpmuludq hc1,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ] + vpmuludq hc2,svxz3,t2 + vpaddq 
t2,t1,t1 + # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ] + vpmuludq hc3,svxz2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ] + vpmuludq hc4,svxz1,t2 + vpaddq t2,t1,t1 + # d0 = t1[0] + t1[1] + t[2] + t[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d0 + + # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ] + vpmuludq hc0,ruwy1,t1 + # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ] + vpmuludq hc1,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ] + vpmuludq hc2,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ] + vpmuludq hc3,svxz3,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ] + vpmuludq hc4,svxz2,t2 + vpaddq t2,t1,t1 + # d1 = t1[0] + t1[1] + t1[3] + t1[4] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d1 + + # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ] + vpmuludq hc0,ruwy2,t1 + # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ] + vpmuludq hc1,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ] + vpmuludq hc2,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ] + vpmuludq hc3,svxz4,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ] + vpmuludq hc4,svxz3,t2 + vpaddq t2,t1,t1 + # d2 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d2 + + # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ] + vpmuludq hc0,ruwy3,t1 + # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ] + vpmuludq hc1,ruwy2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ] + vpmuludq hc2,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ] + vpmuludq hc3,ruwy0,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ] + vpmuludq hc4,svxz4,t2 + vpaddq t2,t1,t1 + # d3 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d3 + + # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ] + vpmuludq hc0,ruwy4,t1 + # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ] + vpmuludq hc1,ruwy3,t2 + vpaddq t2,t1,t1 + # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ] + vpmuludq hc2,ruwy2,t2 + vpaddq t2,t1,t1 + # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ] + vpmuludq hc3,ruwy1,t2 + vpaddq t2,t1,t1 + # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ] + vpmuludq hc4,ruwy0,t2 + vpaddq t2,t1,t1 + # d4 = t1[0] + t1[1] + t1[2] + t1[3] + vpermq $0xee,t1,t2 + vpaddq t2,t1,t1 + vpsrldq $8,t1,t2 + vpaddq t2,t1,t1 + vmovq t1x,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea 
(%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x40,m + dec %rcx + jnz .Ldoblock4 + + vzeroupper + pop %r13 + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_4block_avx2) diff --git a/kernel/arch/x86/crypto/poly1305-sse2-x86_64.S b/kernel/arch/x86/crypto/poly1305-sse2-x86_64.S new file mode 100644 index 000000000..338c74805 --- /dev/null +++ b/kernel/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -0,0 +1,582 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.data +.align 16 + +ANMASK: .octa 0x0000000003ffffff0000000003ffffff +ORMASK: .octa 0x00000000010000000000000001000000 + +.text + +#define h0 0x00(%rdi) +#define h1 0x04(%rdi) +#define h2 0x08(%rdi) +#define h3 0x0c(%rdi) +#define h4 0x10(%rdi) +#define r0 0x00(%rdx) +#define r1 0x04(%rdx) +#define r2 0x08(%rdx) +#define r3 0x0c(%rdx) +#define r4 0x10(%rdx) +#define s1 0x00(%rsp) +#define s2 0x04(%rsp) +#define s3 0x08(%rsp) +#define s4 0x0c(%rsp) +#define m %rsi +#define h01 %xmm0 +#define h23 %xmm1 +#define h44 %xmm2 +#define t1 %xmm3 +#define t2 %xmm4 +#define t3 %xmm5 +#define t4 %xmm6 +#define mask %xmm7 +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 +#define d4 %r12 + +ENTRY(poly1305_block_sse2) + # %rdi: Accumulator h[5] + # %rsi: 16 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: Block count + + # This single block variant tries to improve performance by doing two + # multiplications in parallel using SSE instructions. There is quite + # some quardword packing involved, hence the speedup is marginal. 
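+	# The accumulator h and the key r are kept as five 26-bit limbs
+	# (masked with ANMASK = 0x3ffffff), so each block step computes
+	# h = (h + m) * r mod 2^130 - 5. Product limbs that carry past
+	# 2^130 wrap around with a factor of 5, which is why s1..s4 are
+	# precomputed as r1..r4 * 5 below; the final carry chain folds
+	# d0..d4 back into the 26-bit limbs h0..h4.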
+ + push %rbx + push %r12 + sub $0x10,%rsp + + # s1..s4 = r1..r4 * 5 + mov r1,%eax + lea (%eax,%eax,4),%eax + mov %eax,s1 + mov r2,%eax + lea (%eax,%eax,4),%eax + mov %eax,s2 + mov r3,%eax + lea (%eax,%eax,4),%eax + mov %eax,s3 + mov r4,%eax + lea (%eax,%eax,4),%eax + mov %eax,s4 + + movdqa ANMASK(%rip),mask + +.Ldoblock: + # h01 = [0, h1, 0, h0] + # h23 = [0, h3, 0, h2] + # h44 = [0, h4, 0, h4] + movd h0,h01 + movd h1,t1 + movd h2,h23 + movd h3,t2 + movd h4,h44 + punpcklqdq t1,h01 + punpcklqdq t2,h23 + punpcklqdq h44,h44 + + # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ] + movd 0x00(m),t1 + movd 0x03(m),t2 + psrld $2,t2 + punpcklqdq t2,t1 + pand mask,t1 + paddd t1,h01 + # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ] + movd 0x06(m),t1 + movd 0x09(m),t2 + psrld $4,t1 + psrld $6,t2 + punpcklqdq t2,t1 + pand mask,t1 + paddd t1,h23 + # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ] + mov 0x0c(m),%eax + shr $8,%eax + or $0x01000000,%eax + movd %eax,t1 + pshufd $0xc4,t1,t1 + paddd t1,h44 + + # t1[0] = h0 * r0 + h2 * s3 + # t1[1] = h1 * s4 + h3 * s2 + movd r0,t1 + movd s4,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd s3,t2 + movd s2,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t2[0] = h0 * r1 + h2 * s4 + # t2[1] = h1 * r0 + h3 * s3 + movd r1,t2 + movd r0,t3 + punpcklqdq t3,t2 + pmuludq h01,t2 + movd s4,t3 + movd s3,t4 + punpcklqdq t4,t3 + pmuludq h23,t3 + paddq t3,t2 + # t3[0] = h4 * s1 + # t3[1] = h4 * s2 + movd s1,t3 + movd s2,t4 + punpcklqdq t4,t3 + pmuludq h44,t3 + # d0 = t1[0] + t1[1] + t3[0] + # d1 = t2[0] + t2[1] + t3[1] + movdqa t1,t4 + punpcklqdq t2,t4 + punpckhqdq t2,t1 + paddq t4,t1 + paddq t3,t1 + movq t1,d0 + psrldq $8,t1 + movq t1,d1 + + # t1[0] = h0 * r2 + h2 * r0 + # t1[1] = h1 * r1 + h3 * s4 + movd r2,t1 + movd r1,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd r0,t2 + movd s4,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t2[0] = h0 * r3 + h2 * r1 + # t2[1] = h1 * r2 + h3 * r0 + movd r3,t2 + movd r2,t3 + punpcklqdq t3,t2 + pmuludq h01,t2 + movd r1,t3 + movd r0,t4 + punpcklqdq t4,t3 + pmuludq h23,t3 + paddq t3,t2 + # t3[0] = h4 * s3 + # t3[1] = h4 * s4 + movd s3,t3 + movd s4,t4 + punpcklqdq t4,t3 + pmuludq h44,t3 + # d2 = t1[0] + t1[1] + t3[0] + # d3 = t2[0] + t2[1] + t3[1] + movdqa t1,t4 + punpcklqdq t2,t4 + punpckhqdq t2,t1 + paddq t4,t1 + paddq t3,t1 + movq t1,d2 + psrldq $8,t1 + movq t1,d3 + + # t1[0] = h0 * r4 + h2 * r2 + # t1[1] = h1 * r3 + h3 * r1 + movd r4,t1 + movd r3,t2 + punpcklqdq t2,t1 + pmuludq h01,t1 + movd r2,t2 + movd r1,t3 + punpcklqdq t3,t2 + pmuludq h23,t2 + paddq t2,t1 + # t3[0] = h4 * r0 + movd r0,t3 + pmuludq h44,t3 + # d4 = t1[0] + t1[1] + t3[0] + movdqa t1,t4 + psrldq $8,t4 + paddq t4,t1 + paddq t3,t1 + movq t1,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea (%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = 
h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x10,m + dec %rcx + jnz .Ldoblock + + add $0x10,%rsp + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_block_sse2) + + +#define u0 0x00(%r8) +#define u1 0x04(%r8) +#define u2 0x08(%r8) +#define u3 0x0c(%r8) +#define u4 0x10(%r8) +#define hc0 %xmm0 +#define hc1 %xmm1 +#define hc2 %xmm2 +#define hc3 %xmm5 +#define hc4 %xmm6 +#define ru0 %xmm7 +#define ru1 %xmm8 +#define ru2 %xmm9 +#define ru3 %xmm10 +#define ru4 %xmm11 +#define sv1 %xmm12 +#define sv2 %xmm13 +#define sv3 %xmm14 +#define sv4 %xmm15 +#undef d0 +#define d0 %r13 + +ENTRY(poly1305_2block_sse2) + # %rdi: Accumulator h[5] + # %rsi: 16 byte input block m + # %rdx: Poly1305 key r[5] + # %rcx: Doubleblock count + # %r8: Poly1305 derived key r^2 u[5] + + # This two-block variant further improves performance by using loop + # unrolled block processing. This is more straight forward and does + # less byte shuffling, but requires a second Poly1305 key r^2: + # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r + + push %rbx + push %r12 + push %r13 + + # combine r0,u0 + movd u0,ru0 + movd r0,t1 + punpcklqdq t1,ru0 + + # combine r1,u1 and s1=r1*5,v1=u1*5 + movd u1,ru1 + movd r1,t1 + punpcklqdq t1,ru1 + movdqa ru1,sv1 + pslld $2,sv1 + paddd ru1,sv1 + + # combine r2,u2 and s2=r2*5,v2=u2*5 + movd u2,ru2 + movd r2,t1 + punpcklqdq t1,ru2 + movdqa ru2,sv2 + pslld $2,sv2 + paddd ru2,sv2 + + # combine r3,u3 and s3=r3*5,v3=u3*5 + movd u3,ru3 + movd r3,t1 + punpcklqdq t1,ru3 + movdqa ru3,sv3 + pslld $2,sv3 + paddd ru3,sv3 + + # combine r4,u4 and s4=r4*5,v4=u4*5 + movd u4,ru4 + movd r4,t1 + punpcklqdq t1,ru4 + movdqa ru4,sv4 + pslld $2,sv4 + paddd ru4,sv4 + +.Ldoblock2: + # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ] + movd 0x00(m),hc0 + movd 0x10(m),t1 + punpcklqdq t1,hc0 + pand ANMASK(%rip),hc0 + movd h0,t1 + paddd t1,hc0 + # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ] + movd 0x03(m),hc1 + movd 0x13(m),t1 + punpcklqdq t1,hc1 + psrld $2,hc1 + pand ANMASK(%rip),hc1 + movd h1,t1 + paddd t1,hc1 + # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ] + movd 0x06(m),hc2 + movd 0x16(m),t1 + punpcklqdq t1,hc2 + psrld $4,hc2 + pand ANMASK(%rip),hc2 + movd h2,t1 + paddd t1,hc2 + # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ] + movd 0x09(m),hc3 + movd 0x19(m),t1 + punpcklqdq t1,hc3 + psrld $6,hc3 + pand ANMASK(%rip),hc3 + movd h3,t1 + paddd t1,hc3 + # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ] + movd 0x0c(m),hc4 + movd 0x1c(m),t1 + punpcklqdq t1,hc4 + psrld $8,hc4 + por ORMASK(%rip),hc4 + movd h4,t1 + paddd t1,hc4 + + # t1 = [ hc0[1] * r0, hc0[0] * u0 ] + movdqa ru0,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * s4, hc1[0] * v4 ] + movdqa sv4,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * s3, hc2[0] * v3 ] + movdqa sv3,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s2, hc3[0] * v2 ] + movdqa sv2,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s1, hc4[0] * v1 ] + movdqa sv1,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d0 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d0 + + # t1 = [ hc0[1] * r1, hc0[0] * u1 ] + movdqa ru1,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r0, hc1[0] * u0 ] + movdqa ru0,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * s4, hc2[0] * v4 ] + movdqa sv4,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s3, hc3[0] * v3 ] + movdqa sv3,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s2, hc4[0] * v2 ] + movdqa sv2,t2 + pmuludq hc4,t2 + 
paddq t2,t1 + # d1 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d1 + + # t1 = [ hc0[1] * r2, hc0[0] * u2 ] + movdqa ru2,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r1, hc1[0] * u1 ] + movdqa ru1,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r0, hc2[0] * u0 ] + movdqa ru0,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * s4, hc3[0] * v4 ] + movdqa sv4,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s3, hc4[0] * v3 ] + movdqa sv3,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d2 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d2 + + # t1 = [ hc0[1] * r3, hc0[0] * u3 ] + movdqa ru3,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r2, hc1[0] * u2 ] + movdqa ru2,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r1, hc2[0] * u1 ] + movdqa ru1,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * r0, hc3[0] * u0 ] + movdqa ru0,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * s4, hc4[0] * v4 ] + movdqa sv4,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d3 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d3 + + # t1 = [ hc0[1] * r4, hc0[0] * u4 ] + movdqa ru4,t1 + pmuludq hc0,t1 + # t1 += [ hc1[1] * r3, hc1[0] * u3 ] + movdqa ru3,t2 + pmuludq hc1,t2 + paddq t2,t1 + # t1 += [ hc2[1] * r2, hc2[0] * u2 ] + movdqa ru2,t2 + pmuludq hc2,t2 + paddq t2,t1 + # t1 += [ hc3[1] * r1, hc3[0] * u1 ] + movdqa ru1,t2 + pmuludq hc3,t2 + paddq t2,t1 + # t1 += [ hc4[1] * r0, hc4[0] * u0 ] + movdqa ru0,t2 + pmuludq hc4,t2 + paddq t2,t1 + # d4 = t1[0] + t1[1] + movdqa t1,t2 + psrldq $8,t2 + paddq t2,t1 + movq t1,d4 + + # d1 += d0 >> 26 + mov d0,%rax + shr $26,%rax + add %rax,d1 + # h0 = d0 & 0x3ffffff + mov d0,%rbx + and $0x3ffffff,%ebx + + # d2 += d1 >> 26 + mov d1,%rax + shr $26,%rax + add %rax,d2 + # h1 = d1 & 0x3ffffff + mov d1,%rax + and $0x3ffffff,%eax + mov %eax,h1 + + # d3 += d2 >> 26 + mov d2,%rax + shr $26,%rax + add %rax,d3 + # h2 = d2 & 0x3ffffff + mov d2,%rax + and $0x3ffffff,%eax + mov %eax,h2 + + # d4 += d3 >> 26 + mov d3,%rax + shr $26,%rax + add %rax,d4 + # h3 = d3 & 0x3ffffff + mov d3,%rax + and $0x3ffffff,%eax + mov %eax,h3 + + # h0 += (d4 >> 26) * 5 + mov d4,%rax + shr $26,%rax + lea (%eax,%eax,4),%eax + add %eax,%ebx + # h4 = d4 & 0x3ffffff + mov d4,%rax + and $0x3ffffff,%eax + mov %eax,h4 + + # h1 += h0 >> 26 + mov %ebx,%eax + shr $26,%eax + add %eax,h1 + # h0 = h0 & 0x3ffffff + andl $0x3ffffff,%ebx + mov %ebx,h0 + + add $0x20,m + dec %rcx + jnz .Ldoblock2 + + pop %r13 + pop %r12 + pop %rbx + ret +ENDPROC(poly1305_2block_sse2) diff --git a/kernel/arch/x86/crypto/poly1305_glue.c b/kernel/arch/x86/crypto/poly1305_glue.c new file mode 100644 index 000000000..4264a3d59 --- /dev/null +++ b/kernel/arch/x86/crypto/poly1305_glue.c @@ -0,0 +1,207 @@ +/* + * Poly1305 authenticator algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/poly1305.h> +#include <linux/crypto.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/fpu/api.h> +#include <asm/simd.h> + +struct poly1305_simd_desc_ctx { + struct poly1305_desc_ctx base; + /* derived key u set? */ + bool uset; +#ifdef CONFIG_AS_AVX2 + /* derived keys r^3, r^4 set? 
*/ + bool wset; +#endif + /* derived Poly1305 key r^2 */ + u32 u[5]; + /* ... silently appended r^3 and r^4 when using AVX2 */ +}; + +asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, + const u32 *r, unsigned int blocks); +asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, + unsigned int blocks, const u32 *u); +#ifdef CONFIG_AS_AVX2 +asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, + unsigned int blocks, const u32 *u); +static bool poly1305_use_avx2; +#endif + +static int poly1305_simd_init(struct shash_desc *desc) +{ + struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc); + + sctx->uset = false; +#ifdef CONFIG_AS_AVX2 + sctx->wset = false; +#endif + + return crypto_poly1305_init(desc); +} + +static void poly1305_simd_mult(u32 *a, const u32 *b) +{ + u8 m[POLY1305_BLOCK_SIZE]; + + memset(m, 0, sizeof(m)); + /* The poly1305 block function adds a hi-bit to the accumulator which + * we don't need for key multiplication; compensate for it. */ + a[4] -= 1 << 24; + poly1305_block_sse2(a, m, b, 1); +} + +static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, + const u8 *src, unsigned int srclen) +{ + struct poly1305_simd_desc_ctx *sctx; + unsigned int blocks, datalen; + + BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base)); + sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base); + + if (unlikely(!dctx->sset)) { + datalen = crypto_poly1305_setdesckey(dctx, src, srclen); + src += srclen - datalen; + srclen = datalen; + } + +#ifdef CONFIG_AS_AVX2 + if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) { + if (unlikely(!sctx->wset)) { + if (!sctx->uset) { + memcpy(sctx->u, dctx->r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r); + sctx->uset = true; + } + memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u + 5, dctx->r); + memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u + 10, dctx->r); + sctx->wset = true; + } + blocks = srclen / (POLY1305_BLOCK_SIZE * 4); + poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u); + src += POLY1305_BLOCK_SIZE * 4 * blocks; + srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; + } +#endif + if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { + if (unlikely(!sctx->uset)) { + memcpy(sctx->u, dctx->r, sizeof(sctx->u)); + poly1305_simd_mult(sctx->u, dctx->r); + sctx->uset = true; + } + blocks = srclen / (POLY1305_BLOCK_SIZE * 2); + poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u); + src += POLY1305_BLOCK_SIZE * 2 * blocks; + srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; + } + if (srclen >= POLY1305_BLOCK_SIZE) { + poly1305_block_sse2(dctx->h, src, dctx->r, 1); + srclen -= POLY1305_BLOCK_SIZE; + } + return srclen; +} + +static int poly1305_simd_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + unsigned int bytes; + + /* kernel_fpu_begin/end is costly, use fallback for small updates */ + if (srclen <= 288 || !may_use_simd()) + return crypto_poly1305_update(desc, src, srclen); + + kernel_fpu_begin(); + + if (unlikely(dctx->buflen)) { + bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); + memcpy(dctx->buf + dctx->buflen, src, bytes); + src += bytes; + srclen -= bytes; + dctx->buflen += bytes; + + if (dctx->buflen == POLY1305_BLOCK_SIZE) { + poly1305_simd_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE); + dctx->buflen = 0; + } + } + + if (likely(srclen >= POLY1305_BLOCK_SIZE)) { + bytes = poly1305_simd_blocks(dctx, src, 
srclen); + src += srclen - bytes; + srclen = bytes; + } + + kernel_fpu_end(); + + if (unlikely(srclen)) { + dctx->buflen = srclen; + memcpy(dctx->buf, src, srclen); + } + + return 0; +} + +static struct shash_alg alg = { + .digestsize = POLY1305_DIGEST_SIZE, + .init = poly1305_simd_init, + .update = poly1305_simd_update, + .final = crypto_poly1305_final, + .setkey = crypto_poly1305_setkey, + .descsize = sizeof(struct poly1305_simd_desc_ctx), + .base = { + .cra_name = "poly1305", + .cra_driver_name = "poly1305-simd", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_alignmask = sizeof(u32) - 1, + .cra_blocksize = POLY1305_BLOCK_SIZE, + .cra_module = THIS_MODULE, + }, +}; + +static int __init poly1305_simd_mod_init(void) +{ + if (!cpu_has_xmm2) + return -ENODEV; + +#ifdef CONFIG_AS_AVX2 + poly1305_use_avx2 = cpu_has_avx && cpu_has_avx2 && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); + alg.descsize = sizeof(struct poly1305_simd_desc_ctx); + if (poly1305_use_avx2) + alg.descsize += 10 * sizeof(u32); +#endif + return crypto_register_shash(&alg); +} + +static void __exit poly1305_simd_mod_exit(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(poly1305_simd_mod_init); +module_exit(poly1305_simd_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_DESCRIPTION("Poly1305 authenticator"); +MODULE_ALIAS_CRYPTO("poly1305"); +MODULE_ALIAS_CRYPTO("poly1305-simd"); diff --git a/kernel/arch/x86/crypto/serpent_avx2_glue.c b/kernel/arch/x86/crypto/serpent_avx2_glue.c index 2f63dc89e..6d198342e 100644 --- a/kernel/arch/x86/crypto/serpent_avx2_glue.c +++ b/kernel/arch/x86/crypto/serpent_avx2_glue.c @@ -20,8 +20,7 @@ #include <crypto/lrw.h> #include <crypto/xts.h> #include <crypto/serpent.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/serpent-avx.h> #include <asm/crypto/glue_helper.h> @@ -537,16 +536,15 @@ static struct crypto_alg srp_algs[10] = { { static int __init init(void) { - u64 xcr0; + const char *feature_name; if (!cpu_has_avx2 || !cpu_has_osxsave) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/serpent_avx_glue.c b/kernel/arch/x86/crypto/serpent_avx_glue.c index c8d478af8..5dc37026c 100644 --- a/kernel/arch/x86/crypto/serpent_avx_glue.c +++ b/kernel/arch/x86/crypto/serpent_avx_glue.c @@ -36,8 +36,7 @@ #include <crypto/ctr.h> #include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/serpent-avx.h> #include <asm/crypto/glue_helper.h> @@ -596,16 +595,11 @@ static struct crypto_alg serpent_algs[10] = { { static int __init serpent_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - printk(KERN_INFO "AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - printk(KERN_INFO "AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, + &feature_name)) { + pr_info("CPU feature '%s' is not 
supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/crypto/sha-mb/sha1_mb.c b/kernel/arch/x86/crypto/sha-mb/sha1_mb.c index e510b1c5d..a841e9765 100644 --- a/kernel/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/kernel/arch/x86/crypto/sha-mb/sha1_mb.c @@ -65,11 +65,8 @@ #include <crypto/mcryptd.h> #include <crypto/crypto_wq.h> #include <asm/byteorder.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> #include <linux/hardirq.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/api.h> #include "sha_mb_ctx.h" #define FLUSH_INTERVAL 1000 /* in usec */ @@ -885,7 +882,8 @@ static int __init sha1_mb_mod_init(void) INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher); cpu_state->cpu = cpu; cpu_state->alg_state = &sha1_mb_alg_state; - cpu_state->mgr = (struct sha1_ctx_mgr *) kzalloc(sizeof(struct sha1_ctx_mgr), GFP_KERNEL); + cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr), + GFP_KERNEL); if (!cpu_state->mgr) goto err2; sha1_ctx_mgr_init(cpu_state->mgr); diff --git a/kernel/arch/x86/crypto/sha1_ni_asm.S b/kernel/arch/x86/crypto/sha1_ni_asm.S new file mode 100644 index 000000000..874a651b9 --- /dev/null +++ b/kernel/arch/x86/crypto/sha1_ni_asm.S @@ -0,0 +1,302 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley <sean.m.gulley@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/linkage.h> + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define RSPSAVE %rax + +/* gcc conversion */ +#define FRAME_SIZE 32 /* space for 2x16 bytes */ + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 +#define SHUF_MASK %xmm7 + + +/* + * Intel SHA Extensions optimized implementation of a SHA-1 update function + * + * The function takes a pointer to the current hash values, a pointer to the + * input data, and a number of 64 byte blocks to process. Once all blocks have + * been processed, the digest pointer is updated with the resulting hash value. + * The function only processes complete blocks, there is no functionality to + * store partial blocks. All message padding and hash value initialization must + * be done outside the update function. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. + * + * void sha1_ni_transform(uint32_t *digest, const void *data, + uint32_t numBlocks) + * digest : pointer to digest + * data: pointer to input data + * numBlocks: Number of blocks to process + */ +.text +.align 32 +ENTRY(sha1_ni_transform) + mov %rsp, RSPSAVE + sub $FRAME_SIZE, %rsp + and $~0xF, %rsp + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* load initial hash values */ + pinsrd $3, 1*16(DIGEST_PTR), E0 + movdqu 0*16(DIGEST_PTR), ABCD + pand UPPER_WORD_MASK(%rip), E0 + pshufd $0x1B, ABCD, ABCD + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa E0, (0*16)(%rsp) + movdqa ABCD, (1*16)(%rsp) + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG0 + pshufb SHUF_MASK, MSG0 + paddd MSG0, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG1 + pshufb SHUF_MASK, MSG1 + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG2 + pshufb SHUF_MASK, MSG2 + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG3 + pshufb SHUF_MASK, MSG3 + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + 
pxor MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + pxor MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + pxor MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + pxor MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + movdqa ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + pxor MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 + movdqa ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + pxor MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + movdqa ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + movdqa ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte (0*16)(%rsp), E0 + paddd (1*16)(%rsp), ABCD + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, ABCD, ABCD + movdqu ABCD, 0*16(DIGEST_PTR) + pextrd $3, E0, 1*16(DIGEST_PTR) + +.Ldone_hash: + mov RSPSAVE, %rsp + + ret +ENDPROC(sha1_ni_transform) + +.data + +.align 64 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f +UPPER_WORD_MASK: + .octa 0xFFFFFFFF000000000000000000000000 diff --git a/kernel/arch/x86/crypto/sha1_ssse3_glue.c b/kernel/arch/x86/crypto/sha1_ssse3_glue.c index 33d1b9dc1..dd14616b7 100644 --- a/kernel/arch/x86/crypto/sha1_ssse3_glue.c +++ b/kernel/arch/x86/crypto/sha1_ssse3_glue.c @@ -29,28 +29,13 @@ #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha1_base.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> +typedef void (sha1_transform_fn)(u32 *digest, const char *data, + unsigned int rounds); -asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, - unsigned int rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha1_transform_avx(u32 *digest, const char *data, - unsigned int rounds); -#endif -#ifdef CONFIG_AS_AVX2 -#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ - -asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, - unsigned int rounds); -#endif - -static void (*sha1_transform_asm)(u32 *, const char *, unsigned int); - -static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha1_update(struct 
shash_desc *desc, const u8 *data, + unsigned int len, sha1_transform_fn *sha1_xform) { struct sha1_state *sctx = shash_desc_ctx(desc); @@ -63,14 +48,14 @@ static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_transform_asm); + (sha1_block_fn *)sha1_xform); kernel_fpu_end(); return 0; } -static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha1_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha1_transform_fn *sha1_xform) { if (!irq_fpu_usable()) return crypto_sha1_finup(desc, data, len, out); @@ -78,32 +63,37 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); if (len) sha1_base_do_update(desc, data, len, - (sha1_block_fn *)sha1_transform_asm); - sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm); + (sha1_block_fn *)sha1_xform); + sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform); kernel_fpu_end(); return sha1_base_finish(desc, out); } -/* Add padding and return the message digest. */ -static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) +asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - return sha1_ssse3_finup(desc, NULL, 0, out); + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_transform_ssse3); } -#ifdef CONFIG_AS_AVX2 -static void sha1_apply_transform_avx2(u32 *digest, const char *data, - unsigned int rounds) +static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - /* Select the optimal transform based on data block size */ - if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) - sha1_transform_avx2(digest, data, rounds); - else - sha1_transform_avx(digest, data, rounds); + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_transform_ssse3); +} + +/* Add padding and return the message digest. 
*/ +static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) +{ + return sha1_ssse3_finup(desc, NULL, 0, out); } -#endif -static struct shash_alg alg = { +static struct shash_alg sha1_ssse3_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_ssse3_update, @@ -112,7 +102,7 @@ static struct shash_alg alg = { .descsize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", - .cra_driver_name= "sha1-ssse3", + .cra_driver_name = "sha1-ssse3", .cra_priority = 150, .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, @@ -120,73 +110,261 @@ static struct shash_alg alg = { } }; +static int register_sha1_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shash(&sha1_ssse3_alg); + return 0; +} + +static void unregister_sha1_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shash(&sha1_ssse3_alg); +} + #ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +asmlinkage void sha1_transform_avx(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - u64 xcr0; + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_transform_avx); +} - if (!cpu_has_avx || !cpu_has_osxsave) - return false; +static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_transform_avx); +} - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); +static int sha1_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha1_avx_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_avx_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = sha1_avx_update, + .final = sha1_avx_final, + .finup = sha1_avx_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; +static bool avx_usable(void) +{ + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } return true; } -#ifdef CONFIG_AS_AVX2 -static bool __init avx2_usable(void) +static int register_sha1_avx(void) { - if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) && - boot_cpu_has(X86_FEATURE_BMI2)) + if (avx_usable()) + return crypto_register_shash(&sha1_avx_alg); + return 0; +} + +static void unregister_sha1_avx(void) +{ + if (avx_usable()) + crypto_unregister_shash(&sha1_avx_alg); +} + +#else /* CONFIG_AS_AVX */ +static inline int register_sha1_avx(void) { return 0; } +static inline void unregister_sha1_avx(void) { } +#endif /* CONFIG_AS_AVX */ + + +#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX) +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, + unsigned int rounds); + +static bool avx2_usable(void) +{ + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) + && boot_cpu_has(X86_FEATURE_BMI1) + && boot_cpu_has(X86_FEATURE_BMI2)) return true; return false; } + +static void sha1_apply_transform_avx2(u32 *digest, const char *data, + unsigned int rounds) +{ + /* Select the optimal transform based on data block size */ + if (rounds >= 
SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(digest, data, rounds); + else + sha1_transform_avx(digest, data, rounds); +} + +static int sha1_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_apply_transform_avx2); +} + +static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_apply_transform_avx2); +} + +static int sha1_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha1_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_avx2_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = sha1_avx2_update, + .final = sha1_avx2_final, + .finup = sha1_avx2_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int register_sha1_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shash(&sha1_avx2_alg); + return 0; +} + +static void unregister_sha1_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shash(&sha1_avx2_alg); +} + +#else +static inline int register_sha1_avx2(void) { return 0; } +static inline void unregister_sha1_avx2(void) { } #endif + +#ifdef CONFIG_AS_SHA1_NI +asmlinkage void sha1_ni_transform(u32 *digest, const char *data, + unsigned int rounds); + +static int sha1_ni_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha1_update(desc, data, len, + (sha1_transform_fn *) sha1_ni_transform); +} + +static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha1_finup(desc, data, len, out, + (sha1_transform_fn *) sha1_ni_transform); +} + +static int sha1_ni_final(struct shash_desc *desc, u8 *out) +{ + return sha1_ni_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha1_ni_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_base_init, + .update = sha1_ni_update, + .final = sha1_ni_final, + .finup = sha1_ni_finup, + .descsize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int register_sha1_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + return crypto_register_shash(&sha1_ni_alg); + return 0; +} + +static void unregister_sha1_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + crypto_unregister_shash(&sha1_ni_alg); +} + +#else +static inline int register_sha1_ni(void) { return 0; } +static inline void unregister_sha1_ni(void) { } #endif static int __init sha1_ssse3_mod_init(void) { - char *algo_name; + if (register_sha1_ssse3()) + goto fail; - /* test for SSSE3 first */ - if (cpu_has_ssse3) { - sha1_transform_asm = sha1_transform_ssse3; - algo_name = "SSSE3"; + if (register_sha1_avx()) { + unregister_sha1_ssse3(); + goto fail; } -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { - sha1_transform_asm = sha1_transform_avx; - algo_name = "AVX"; -#ifdef CONFIG_AS_AVX2 - /* allow AVX2 to override AVX, it's a little faster */ - if (avx2_usable()) { - sha1_transform_asm = sha1_apply_transform_avx2; - algo_name = "AVX2"; - } -#endif + if (register_sha1_avx2()) { + unregister_sha1_avx(); + 
unregister_sha1_ssse3(); + goto fail; } -#endif - if (sha1_transform_asm) { - pr_info("Using %s optimized SHA-1 implementation\n", algo_name); - return crypto_register_shash(&alg); + if (register_sha1_ni()) { + unregister_sha1_avx2(); + unregister_sha1_avx(); + unregister_sha1_ssse3(); + goto fail; } - pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n"); + return 0; +fail: return -ENODEV; } static void __exit sha1_ssse3_mod_fini(void) { - crypto_unregister_shash(&alg); + unregister_sha1_ni(); + unregister_sha1_avx2(); + unregister_sha1_avx(); + unregister_sha1_ssse3(); } module_init(sha1_ssse3_mod_init); diff --git a/kernel/arch/x86/crypto/sha256_ni_asm.S b/kernel/arch/x86/crypto/sha256_ni_asm.S new file mode 100644 index 000000000..748cdf21a --- /dev/null +++ b/kernel/arch/x86/crypto/sha256_ni_asm.S @@ -0,0 +1,353 @@ +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Sean Gulley <sean.m.gulley@intel.com> + * Tim Chen <tim.c.chen@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include <linux/linkage.h> + +#define DIGEST_PTR %rdi /* 1st arg */ +#define DATA_PTR %rsi /* 2nd arg */ +#define NUM_BLKS %rdx /* 3rd arg */ + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 +#define MSGTMP4 %xmm7 + +#define SHUF_MASK %xmm8 + +#define ABEF_SAVE %xmm9 +#define CDGH_SAVE %xmm10 + +/* + * Intel SHA Extensions optimized implementation of a SHA-256 update function + * + * The function takes a pointer to the current hash values, a pointer to the + * input data, and a number of 64 byte blocks to process. Once all blocks have + * been processed, the digest pointer is updated with the resulting hash value. + * The function only processes complete blocks, there is no functionality to + * store partial blocks. All message padding and hash value initialization must + * be done outside the update function. + * + * The indented lines in the loop are instructions related to rounds processing. + * The non-indented lines are instructions related to the message schedule. + * + * void sha256_ni_transform(uint32_t *digest, const void *data, + uint32_t numBlocks); + * digest : pointer to digest + * data: pointer to input data + * numBlocks: Number of blocks to process + */ + +.text +.align 32 +ENTRY(sha256_ni_transform) + + shl $6, NUM_BLKS /* convert to bytes */ + jz .Ldone_hash + add DATA_PTR, NUM_BLKS /* pointer to end of data */ + + /* + * load initial hash values + * Need to reorder these appropriately + * DCBA, HGFE -> ABEF, CDGH + */ + movdqu 0*16(DIGEST_PTR), STATE0 + movdqu 1*16(DIGEST_PTR), STATE1 + + pshufd $0xB1, STATE0, STATE0 /* CDAB */ + pshufd $0x1B, STATE1, STATE1 /* EFGH */ + movdqa STATE0, MSGTMP4 + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + + movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + lea K256(%rip), SHA256CONSTANTS + +.Lloop0: + /* Save hash values for addition after rounds */ + movdqa STATE0, ABEF_SAVE + movdqa STATE1, CDGH_SAVE + + /* Rounds 0-3 */ + movdqu 0*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP0 + paddd 0*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 4-7 */ + movdqu 1*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP1 + paddd 1*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movdqu 2*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP2 + paddd 2*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movdqu 3*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + movdqa MSG, MSGTMP3 + paddd 3*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + movdqa MSGTMP0, MSG + paddd 4*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + movdqa MSGTMP1, MSG + paddd 5*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 
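+	/* The message schedule for later rounds is interleaved with the
+	 * round instructions: palignr/paddd fold in the W[t-7] words and
+	 * sha256msg1/sha256msg2 apply the sigma functions, while
+	 * pshufd $0x0E moves the upper two word+constant sums into place
+	 * for the second sha256rnds2 of each four-round group.
+	 */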
+ movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + movdqa MSGTMP2, MSG + paddd 6*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + movdqa MSGTMP3, MSG + paddd 7*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + movdqa MSGTMP0, MSG + paddd 8*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + movdqa MSGTMP1, MSG + paddd 9*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + movdqa MSGTMP2, MSG + paddd 10*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + movdqa MSGTMP3, MSG + paddd 11*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + movdqa MSGTMP0, MSG + paddd 12*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + movdqa MSGTMP1, MSG + paddd 13*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 56-59 */ + movdqa MSGTMP2, MSG + paddd 14*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + movdqa MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 60-63 */ + movdqa MSGTMP3, MSG + paddd 15*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + pshufd $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd ABEF_SAVE, STATE0 + paddd CDGH_SAVE, STATE1 + + /* Increment data pointer and loop if more to process */ + add $64, DATA_PTR + cmp NUM_BLKS, DATA_PTR + jne .Lloop0 + + /* Write hash values back in the correct order */ + pshufd $0x1B, STATE0, STATE0 /* FEBA */ + pshufd $0xB1, STATE1, STATE1 /* DCHG */ + movdqa STATE0, MSGTMP4 + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, MSGTMP4, STATE1 /* HGFE */ + + movdqu STATE0, 
0*16(DIGEST_PTR) + movdqu STATE1, 1*16(DIGEST_PTR) + +.Ldone_hash: + + ret +ENDPROC(sha256_ni_transform) + +.data +.align 64 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/kernel/arch/x86/crypto/sha256_ssse3_glue.c b/kernel/arch/x86/crypto/sha256_ssse3_glue.c index ccc338881..5f4d6086d 100644 --- a/kernel/arch/x86/crypto/sha256_ssse3_glue.c +++ b/kernel/arch/x86/crypto/sha256_ssse3_glue.c @@ -37,26 +37,15 @@ #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha256_base.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <linux/string.h> asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, u64 rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha256_transform_avx(u32 *digest, const char *data, - u64 rounds); -#endif -#ifdef CONFIG_AS_AVX2 -asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, - u64 rounds); -#endif - -static void (*sha256_transform_asm)(u32 *, const char *, u64); +typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds); -static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha256_transform_fn *sha256_xform) { struct sha256_state *sctx = shash_desc_ctx(desc); @@ -69,14 +58,14 @@ static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_transform_asm); + (sha256_block_fn *)sha256_xform); kernel_fpu_end(); return 0; } -static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha256_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha256_transform_fn *sha256_xform) { if (!irq_fpu_usable()) return crypto_sha256_finup(desc, data, len, out); @@ -84,20 +73,32 @@ static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); if (len) sha256_base_do_update(desc, data, len, - (sha256_block_fn *)sha256_transform_asm); - sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm); + (sha256_block_fn *)sha256_xform); + sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform); kernel_fpu_end(); return sha256_base_finish(desc, out); } +static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_transform_ssse3); +} + +static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, 
sha256_transform_ssse3); +} + /* Add padding and return the message digest. */ static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) { return sha256_ssse3_finup(desc, NULL, 0, out); } -static struct shash_alg algs[] = { { +static struct shash_alg sha256_ssse3_algs[] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_base_init, .update = sha256_ssse3_update, @@ -129,64 +130,294 @@ static struct shash_alg algs[] = { { } } }; +static int register_sha256_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shashes(sha256_ssse3_algs, + ARRAY_SIZE(sha256_ssse3_algs)); + return 0; +} + +static void unregister_sha256_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(sha256_ssse3_algs, + ARRAY_SIZE(sha256_ssse3_algs)); +} + #ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +asmlinkage void sha256_transform_avx(u32 *digest, const char *data, + u64 rounds); + +static int sha256_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - u64 xcr0; + return sha256_update(desc, data, len, sha256_transform_avx); +} - if (!cpu_has_avx || !cpu_has_osxsave) - return false; +static int sha256_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_transform_avx); +} - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); +static int sha256_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha256_avx_finup(desc, NULL, 0, out); +} +static struct shash_alg sha256_avx_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_avx_update, + .final = sha256_avx_final, + .finup = sha256_avx_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_avx_update, + .final = sha256_avx_final, + .finup = sha256_avx_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static bool avx_usable(void) +{ + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } return true; } -#endif -static int __init sha256_ssse3_mod_init(void) +static int register_sha256_avx(void) { - /* test for SSSE3 first */ - if (cpu_has_ssse3) - sha256_transform_asm = sha256_transform_ssse3; + if (avx_usable()) + return crypto_register_shashes(sha256_avx_algs, + ARRAY_SIZE(sha256_avx_algs)); + return 0; +} -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { -#ifdef CONFIG_AS_AVX2 - if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) - sha256_transform_asm = sha256_transform_rorx; - else +static void unregister_sha256_avx(void) +{ + if (avx_usable()) + crypto_unregister_shashes(sha256_avx_algs, + ARRAY_SIZE(sha256_avx_algs)); +} + +#else +static inline int register_sha256_avx(void) { return 0; } +static inline void unregister_sha256_avx(void) { } #endif - sha256_transform_asm = 
sha256_transform_avx; + +#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) +asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, + u64 rounds); + +static int sha256_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_transform_rorx); +} + +static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_transform_rorx); +} + +static int sha256_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha256_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha256_avx2_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_avx2_update, + .final = sha256_avx2_final, + .finup = sha256_avx2_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, } -#endif +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_avx2_update, + .final = sha256_avx2_final, + .finup = sha256_avx2_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; - if (sha256_transform_asm) { -#ifdef CONFIG_AS_AVX - if (sha256_transform_asm == sha256_transform_avx) - pr_info("Using AVX optimized SHA-256 implementation\n"); -#ifdef CONFIG_AS_AVX2 - else if (sha256_transform_asm == sha256_transform_rorx) - pr_info("Using AVX2 optimized SHA-256 implementation\n"); +static bool avx2_usable(void) +{ + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} + +static int register_sha256_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shashes(sha256_avx2_algs, + ARRAY_SIZE(sha256_avx2_algs)); + return 0; +} + +static void unregister_sha256_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shashes(sha256_avx2_algs, + ARRAY_SIZE(sha256_avx2_algs)); +} + +#else +static inline int register_sha256_avx2(void) { return 0; } +static inline void unregister_sha256_avx2(void) { } #endif - else + +#ifdef CONFIG_AS_SHA256_NI +asmlinkage void sha256_ni_transform(u32 *digest, const char *data, + u64 rounds); /*unsigned int rounds);*/ + +static int sha256_ni_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha256_update(desc, data, len, sha256_ni_transform); +} + +static int sha256_ni_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_finup(desc, data, len, out, sha256_ni_transform); +} + +static int sha256_ni_final(struct shash_desc *desc, u8 *out) +{ + return sha256_ni_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha256_ni_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_ni_update, + .final = sha256_ni_final, + .finup = sha256_ni_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_ni_update, + .final 
= sha256_ni_final, + .finup = sha256_ni_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-ni", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static int register_sha256_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + return crypto_register_shashes(sha256_ni_algs, + ARRAY_SIZE(sha256_ni_algs)); + return 0; +} + +static void unregister_sha256_ni(void) +{ + if (boot_cpu_has(X86_FEATURE_SHA_NI)) + crypto_unregister_shashes(sha256_ni_algs, + ARRAY_SIZE(sha256_ni_algs)); +} + +#else +static inline int register_sha256_ni(void) { return 0; } +static inline void unregister_sha256_ni(void) { } #endif - pr_info("Using SSSE3 optimized SHA-256 implementation\n"); - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); + +static int __init sha256_ssse3_mod_init(void) +{ + if (register_sha256_ssse3()) + goto fail; + + if (register_sha256_avx()) { + unregister_sha256_ssse3(); + goto fail; } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + if (register_sha256_avx2()) { + unregister_sha256_avx(); + unregister_sha256_ssse3(); + goto fail; + } + + if (register_sha256_ni()) { + unregister_sha256_avx2(); + unregister_sha256_avx(); + unregister_sha256_ssse3(); + goto fail; + } + + return 0; +fail: return -ENODEV; } static void __exit sha256_ssse3_mod_fini(void) { - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + unregister_sha256_ni(); + unregister_sha256_avx2(); + unregister_sha256_avx(); + unregister_sha256_ssse3(); } module_init(sha256_ssse3_mod_init); diff --git a/kernel/arch/x86/crypto/sha512_ssse3_glue.c b/kernel/arch/x86/crypto/sha512_ssse3_glue.c index d9fa4c1e0..34e5083d6 100644 --- a/kernel/arch/x86/crypto/sha512_ssse3_glue.c +++ b/kernel/arch/x86/crypto/sha512_ssse3_glue.c @@ -35,27 +35,17 @@ #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha512_base.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <linux/string.h> asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, u64 rounds); -#ifdef CONFIG_AS_AVX -asmlinkage void sha512_transform_avx(u64 *digest, const char *data, - u64 rounds); -#endif -#ifdef CONFIG_AS_AVX2 -asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, - u64 rounds); -#endif -static void (*sha512_transform_asm)(u64 *, const char *, u64); +typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds); -static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha512_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha512_transform_fn *sha512_xform) { struct sha512_state *sctx = shash_desc_ctx(desc); @@ -68,14 +58,14 @@ static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); sha512_base_do_update(desc, data, len, - (sha512_block_fn *)sha512_transform_asm); + (sha512_block_fn *)sha512_xform); kernel_fpu_end(); return 0; } -static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha512_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, sha512_transform_fn *sha512_xform) { if (!irq_fpu_usable()) return crypto_sha512_finup(desc, data, len, out); @@ -83,20 +73,32 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, kernel_fpu_begin(); if (len) sha512_base_do_update(desc, data, len, - 
(sha512_block_fn *)sha512_transform_asm); - sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm); + (sha512_block_fn *)sha512_xform); + sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); } +static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha512_update(desc, data, len, sha512_transform_ssse3); +} + +static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_ssse3); +} + /* Add padding and return the message digest. */ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) { return sha512_ssse3_finup(desc, NULL, 0, out); } -static struct shash_alg algs[] = { { +static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_ssse3_update, @@ -128,64 +130,213 @@ static struct shash_alg algs[] = { { } } }; -#ifdef CONFIG_AS_AVX -static bool __init avx_usable(void) +static int register_sha512_ssse3(void) { - u64 xcr0; - - if (!cpu_has_avx || !cpu_has_osxsave) - return false; + if (boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shashes(sha512_ssse3_algs, + ARRAY_SIZE(sha512_ssse3_algs)); + return 0; +} - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - pr_info("AVX detected but unusable.\n"); +static void unregister_sha512_ssse3(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(sha512_ssse3_algs, + ARRAY_SIZE(sha512_ssse3_algs)); +} +#ifdef CONFIG_AS_AVX +asmlinkage void sha512_transform_avx(u64 *digest, const char *data, + u64 rounds); +static bool avx_usable(void) +{ + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + if (cpu_has_avx) + pr_info("AVX detected but unusable.\n"); return false; } return true; } -#endif -static int __init sha512_ssse3_mod_init(void) +static int sha512_avx_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - /* test for SSSE3 first */ - if (cpu_has_ssse3) - sha512_transform_asm = sha512_transform_ssse3; + return sha512_update(desc, data, len, sha512_transform_avx); +} -#ifdef CONFIG_AS_AVX - /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) { -#ifdef CONFIG_AS_AVX2 - if (boot_cpu_has(X86_FEATURE_AVX2)) - sha512_transform_asm = sha512_transform_rorx; - else -#endif - sha512_transform_asm = sha512_transform_avx; +static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_avx); +} + +/* Add padding and return the message digest. 
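The refactor above replaces the single sha512_transform_asm pointer with a generic sha512_update()/sha512_finup() that take the block transform as a parameter, plus thin per-ISA wrappers. A minimal user-space sketch of that shape (stand-in types, not <crypto/sha.h>):

/*
 * Sketch only: one generic update routine takes a block-transform callback,
 * and each implementation is a thin wrapper binding its own transform.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef void (transform_fn)(uint64_t *digest, const char *data, uint64_t blocks);

struct state { uint64_t digest[8]; };

static void transform_ssse3(uint64_t *d, const char *p, uint64_t n) { (void)p; d[0] += n; }
static void transform_avx(uint64_t *d, const char *p, uint64_t n)   { (void)p; d[0] += n; }

static int generic_update(struct state *s, const char *data, size_t len,
			  transform_fn *xform)
{
	/* kernel_fpu_begin()/kernel_fpu_end() bracket this in the real driver */
	xform(s->digest, data, len / 128);	/* SHA-512 block size is 128 bytes */
	return 0;
}

static int ssse3_update(struct state *s, const char *d, size_t l)
{
	return generic_update(s, d, l, transform_ssse3);
}

static int avx_update(struct state *s, const char *d, size_t l)
{
	return generic_update(s, d, l, transform_avx);
}

int main(void)
{
	struct state s = { { 0 } };
	char buf[256] = { 0 };

	ssse3_update(&s, buf, sizeof(buf));
	avx_update(&s, buf, sizeof(buf));
	printf("blocks processed: %llu\n", (unsigned long long)s.digest[0]);
	return 0;
}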
*/ +static int sha512_avx_final(struct shash_desc *desc, u8 *out) +{ + return sha512_avx_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha512_avx_algs[] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_base_init, + .update = sha512_avx_update, + .final = sha512_avx_final, + .finup = sha512_avx_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name = "sha512-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, } -#endif +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_base_init, + .update = sha512_avx_update, + .final = sha512_avx_final, + .finup = sha512_avx_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-avx", + .cra_priority = 160, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; - if (sha512_transform_asm) { -#ifdef CONFIG_AS_AVX - if (sha512_transform_asm == sha512_transform_avx) - pr_info("Using AVX optimized SHA-512 implementation\n"); -#ifdef CONFIG_AS_AVX2 - else if (sha512_transform_asm == sha512_transform_rorx) - pr_info("Using AVX2 optimized SHA-512 implementation\n"); +static int register_sha512_avx(void) +{ + if (avx_usable()) + return crypto_register_shashes(sha512_avx_algs, + ARRAY_SIZE(sha512_avx_algs)); + return 0; +} + +static void unregister_sha512_avx(void) +{ + if (avx_usable()) + crypto_unregister_shashes(sha512_avx_algs, + ARRAY_SIZE(sha512_avx_algs)); +} +#else +static inline int register_sha512_avx(void) { return 0; } +static inline void unregister_sha512_avx(void) { } #endif - else + +#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX) +asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, + u64 rounds); + +static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + return sha512_update(desc, data, len, sha512_transform_rorx); +} + +static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_finup(desc, data, len, out, sha512_transform_rorx); +} + +/* Add padding and return the message digest. 
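avx_usable() now delegates to cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL), i.e. it asks whether the OS has enabled the SSE and YMM state components in XCR0 instead of reading XCR0 by hand. Roughly the same check from user space, assuming a GCC/Clang toolchain whose assembler knows the xgetbv mnemonic:

/* Illustrative user-space probe; not the kernel's cpu_has_xfeatures(). */
#include <cpuid.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool avx_usable(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return false;
	if (!(ecx & (1u << 27)) || !(ecx & (1u << 28)))	/* OSXSAVE, AVX */
		return false;

	uint32_t xcr0_lo, xcr0_hi;
	__asm__ volatile("xgetbv" : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
	(void)xcr0_hi;
	return (xcr0_lo & 0x6) == 0x6;	/* XCR0: SSE (bit 1) and YMM (bit 2) enabled */
}

int main(void)
{
	printf("AVX usable: %s\n", avx_usable() ? "yes" : "no");
	return 0;
}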
*/ +static int sha512_avx2_final(struct shash_desc *desc, u8 *out) +{ + return sha512_avx2_finup(desc, NULL, 0, out); +} + +static struct shash_alg sha512_avx2_algs[] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = sha512_base_init, + .update = sha512_avx2_update, + .final = sha512_avx2_final, + .finup = sha512_avx2_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name = "sha512-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = sha384_base_init, + .update = sha512_avx2_update, + .final = sha512_avx2_final, + .finup = sha512_avx2_finup, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name = "sha384-avx2", + .cra_priority = 170, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static bool avx2_usable(void) +{ + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} + +static int register_sha512_avx2(void) +{ + if (avx2_usable()) + return crypto_register_shashes(sha512_avx2_algs, + ARRAY_SIZE(sha512_avx2_algs)); + return 0; +} + +static void unregister_sha512_avx2(void) +{ + if (avx2_usable()) + crypto_unregister_shashes(sha512_avx2_algs, + ARRAY_SIZE(sha512_avx2_algs)); +} +#else +static inline int register_sha512_avx2(void) { return 0; } +static inline void unregister_sha512_avx2(void) { } #endif - pr_info("Using SSSE3 optimized SHA-512 implementation\n"); - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); + +static int __init sha512_ssse3_mod_init(void) +{ + + if (register_sha512_ssse3()) + goto fail; + + if (register_sha512_avx()) { + unregister_sha512_ssse3(); + goto fail; } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + if (register_sha512_avx2()) { + unregister_sha512_avx(); + unregister_sha512_ssse3(); + goto fail; + } + + return 0; +fail: return -ENODEV; } static void __exit sha512_ssse3_mod_fini(void) { - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + unregister_sha512_avx2(); + unregister_sha512_avx(); + unregister_sha512_ssse3(); } module_init(sha512_ssse3_mod_init); diff --git a/kernel/arch/x86/crypto/twofish_avx_glue.c b/kernel/arch/x86/crypto/twofish_avx_glue.c index b5e2d5651..b7a3904b9 100644 --- a/kernel/arch/x86/crypto/twofish_avx_glue.c +++ b/kernel/arch/x86/crypto/twofish_avx_glue.c @@ -36,9 +36,7 @@ #include <crypto/ctr.h> #include <crypto/lrw.h> #include <crypto/xts.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/xsave.h> +#include <asm/fpu/api.h> #include <asm/crypto/twofish.h> #include <asm/crypto/glue_helper.h> #include <crypto/scatterwalk.h> @@ -558,16 +556,10 @@ static struct crypto_alg twofish_algs[10] = { { static int __init twofish_init(void) { - u64 xcr0; + const char *feature_name; - if (!cpu_has_avx || !cpu_has_osxsave) { - printk(KERN_INFO "AVX instructions are not detected.\n"); - return -ENODEV; - } - - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { - printk(KERN_INFO "AVX detected but unusable.\n"); + if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { + pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } diff --git a/kernel/arch/x86/entry/Makefile b/kernel/arch/x86/entry/Makefile new file mode 100644 index 
000000000..bd55dedd7 --- /dev/null +++ b/kernel/arch/x86/entry/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for the x86 low level entry code +# +obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o +obj-y += common.o + +obj-y += vdso/ +obj-y += vsyscall/ + +obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o + diff --git a/kernel/arch/x86/include/asm/calling.h b/kernel/arch/x86/entry/calling.h index 1c8b50edb..3c71dd947 100644 --- a/kernel/arch/x86/include/asm/calling.h +++ b/kernel/arch/x86/entry/calling.h @@ -46,8 +46,6 @@ For 32-bit we have the following conventions - kernel is built with */ -#include <asm/dwarf2.h> - #ifdef CONFIG_X86_64 /* @@ -91,28 +89,27 @@ For 32-bit we have the following conventions - kernel is built with #define SIZEOF_PTREGS 21*8 .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 - subq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET 15*8+\addskip + addq $-(15*8+\addskip), %rsp .endm .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 .if \r11 - movq_cfi r11, 6*8+\offset + movq %r11, 6*8+\offset(%rsp) .endif .if \r8910 - movq_cfi r10, 7*8+\offset - movq_cfi r9, 8*8+\offset - movq_cfi r8, 9*8+\offset + movq %r10, 7*8+\offset(%rsp) + movq %r9, 8*8+\offset(%rsp) + movq %r8, 9*8+\offset(%rsp) .endif .if \rax - movq_cfi rax, 10*8+\offset + movq %rax, 10*8+\offset(%rsp) .endif .if \rcx - movq_cfi rcx, 11*8+\offset + movq %rcx, 11*8+\offset(%rsp) .endif - movq_cfi rdx, 12*8+\offset - movq_cfi rsi, 13*8+\offset - movq_cfi rdi, 14*8+\offset + movq %rdx, 12*8+\offset(%rsp) + movq %rsi, 13*8+\offset(%rsp) + movq %rdi, 14*8+\offset(%rsp) .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -131,24 +128,21 @@ For 32-bit we have the following conventions - kernel is built with .endm .macro SAVE_EXTRA_REGS offset=0 - movq_cfi r15, 0*8+\offset - movq_cfi r14, 1*8+\offset - movq_cfi r13, 2*8+\offset - movq_cfi r12, 3*8+\offset - movq_cfi rbp, 4*8+\offset - movq_cfi rbx, 5*8+\offset - .endm - .macro SAVE_EXTRA_REGS_RBP offset=0 - movq_cfi rbp, 4*8+\offset + movq %r15, 0*8+\offset(%rsp) + movq %r14, 1*8+\offset(%rsp) + movq %r13, 2*8+\offset(%rsp) + movq %r12, 3*8+\offset(%rsp) + movq %rbp, 4*8+\offset(%rsp) + movq %rbx, 5*8+\offset(%rsp) .endm .macro RESTORE_EXTRA_REGS offset=0 - movq_cfi_restore 0*8+\offset, r15 - movq_cfi_restore 1*8+\offset, r14 - movq_cfi_restore 2*8+\offset, r13 - movq_cfi_restore 3*8+\offset, r12 - movq_cfi_restore 4*8+\offset, rbp - movq_cfi_restore 5*8+\offset, rbx + movq 0*8+\offset(%rsp), %r15 + movq 1*8+\offset(%rsp), %r14 + movq 2*8+\offset(%rsp), %r13 + movq 3*8+\offset(%rsp), %r12 + movq 4*8+\offset(%rsp), %rbp + movq 5*8+\offset(%rsp), %rbx .endm .macro ZERO_EXTRA_REGS @@ -162,24 +156,24 @@ For 32-bit we have the following conventions - kernel is built with .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 - movq_cfi_restore 6*8, r11 + movq 6*8(%rsp), %r11 .endif .if \rstor_r8910 - movq_cfi_restore 7*8, r10 - movq_cfi_restore 8*8, r9 - movq_cfi_restore 9*8, r8 + movq 7*8(%rsp), %r10 + movq 8*8(%rsp), %r9 + movq 9*8(%rsp), %r8 .endif .if \rstor_rax - movq_cfi_restore 10*8, rax + movq 10*8(%rsp), %rax .endif .if \rstor_rcx - movq_cfi_restore 11*8, rcx + movq 11*8(%rsp), %rcx .endif .if \rstor_rdx - movq_cfi_restore 12*8, rdx + movq 12*8(%rsp), %rdx .endif - movq_cfi_restore 13*8, rsi - movq_cfi_restore 14*8, rdi + movq 13*8(%rsp), %rsi + movq 14*8(%rsp), %rdi .endm .macro RESTORE_C_REGS RESTORE_C_REGS_HELPER 1,1,1,1,1 @@ -196,16 +190,9 @@ For 32-bit we have 
the following conventions - kernel is built with .macro RESTORE_C_REGS_EXCEPT_RCX_R11 RESTORE_C_REGS_HELPER 1,0,0,1,1 .endm - .macro RESTORE_RSI_RDI - RESTORE_C_REGS_HELPER 0,0,0,0,0 - .endm - .macro RESTORE_RSI_RDI_RDX - RESTORE_C_REGS_HELPER 0,0,0,0,1 - .endm .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 - addq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) + subq $-(15*8+\addskip), %rsp .endm .macro icebp @@ -224,23 +211,23 @@ For 32-bit we have the following conventions - kernel is built with */ .macro SAVE_ALL - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx .endm .macro RESTORE_ALL - popl_cfi_reg ebx - popl_cfi_reg ecx - popl_cfi_reg edx - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi_reg ebp - popl_cfi_reg eax + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax .endm #endif /* CONFIG_X86_64 */ diff --git a/kernel/arch/x86/entry/common.c b/kernel/arch/x86/entry/common.c new file mode 100644 index 000000000..3ec240f39 --- /dev/null +++ b/kernel/arch/x86/entry/common.c @@ -0,0 +1,493 @@ +/* + * common.c - C code for kernel entry and exit + * Copyright (c) 2015 Andrew Lutomirski + * GPL v2 + * + * Based on asm and ptrace code by many authors. The code here originated + * in ptrace.c and signal.c. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/tracehook.h> +#include <linux/audit.h> +#include <linux/seccomp.h> +#include <linux/signal.h> +#include <linux/export.h> +#include <linux/context_tracking.h> +#include <linux/user-return-notifier.h> +#include <linux/uprobes.h> + +#include <asm/desc.h> +#include <asm/traps.h> +#include <asm/vdso.h> +#include <asm/uaccess.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) +{ + unsigned long top_of_stack = + (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; + return (struct thread_info *)(top_of_stack - THREAD_SIZE); +} + +#ifdef CONFIG_CONTEXT_TRACKING +/* Called on entry from user mode with IRQs off. */ +__visible void enter_from_user_mode(void) +{ + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit(); +} +#endif + +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) +{ +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + audit_syscall_entry(regs->orig_ax, regs->di, + regs->si, regs->dx, regs->r10); + } else +#endif + { + audit_syscall_entry(regs->orig_ax, regs->bx, + regs->cx, regs->dx, regs->si); + } +} + +/* + * We can return 0 to resume the syscall or anything else to go to phase + * 2. If we resume the syscall, we need to put something appropriate in + * regs->orig_ax. + * + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax + * are fully functional. 
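The SAVE_C_REGS*/SAVE_EXTRA_REGS/RESTORE_* macros above spill the general-purpose registers into fixed 8-byte slots, and ALLOC_PT_GPREGS_ON_STACK reserves 15*8 bytes for them (orig_ax plus the hardware iret frame above them bring the full pt_regs to the 21*8 SIZEOF_PTREGS noted in the hunk). A compile-time mirror of those offsets, assuming LP64; the authoritative layout is struct pt_regs in asm/ptrace.h:

#include <stddef.h>

struct saved_gpregs {
	unsigned long r15, r14, r13, r12, rbp, rbx;	/* "extra" regs, slots 0-5 */
	unsigned long r11, r10, r9, r8;			/* caller-clobbered, slots 6-9 */
	unsigned long rax, rcx, rdx, rsi, rdi;		/* slots 10-14 */
};

_Static_assert(offsetof(struct saved_gpregs, rbx) == 5 * 8,  "rbx at 5*8");
_Static_assert(offsetof(struct saved_gpregs, r11) == 6 * 8,  "r11 at 6*8");
_Static_assert(offsetof(struct saved_gpregs, rax) == 10 * 8, "rax at 10*8");
_Static_assert(offsetof(struct saved_gpregs, rdi) == 14 * 8, "rdi at 14*8");
_Static_assert(sizeof(struct saved_gpregs) == 15 * 8,
	       "ALLOC_PT_GPREGS_ON_STACK reserves 15*8 bytes");

int main(void) { return 0; }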
+ * + * For phase 2's benefit, our return value is: + * 0: resume the syscall + * 1: go to phase 2; no seccomp phase 2 needed + * anything else: go to phase 2; pass return value to seccomp + */ +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + unsigned long ret = 0; + u32 work; + + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + BUG_ON(regs != task_pt_regs(current)); + + work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; + +#ifdef CONFIG_CONTEXT_TRACKING + /* + * If TIF_NOHZ is set, we are required to call user_exit() before + * doing anything that could touch RCU. + */ + if (work & _TIF_NOHZ) { + enter_from_user_mode(); + work &= ~_TIF_NOHZ; + } +#endif + +#ifdef CONFIG_SECCOMP + /* + * Do seccomp first -- it should minimize exposure of other + * code, and keeping seccomp fast is probably more valuable + * than the rest of this. + */ + if (work & _TIF_SECCOMP) { + struct seccomp_data sd; + + sd.arch = arch; + sd.nr = regs->orig_ax; + sd.instruction_pointer = regs->ip; +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + sd.args[0] = regs->di; + sd.args[1] = regs->si; + sd.args[2] = regs->dx; + sd.args[3] = regs->r10; + sd.args[4] = regs->r8; + sd.args[5] = regs->r9; + } else +#endif + { + sd.args[0] = regs->bx; + sd.args[1] = regs->cx; + sd.args[2] = regs->dx; + sd.args[3] = regs->si; + sd.args[4] = regs->di; + sd.args[5] = regs->bp; + } + + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); + + ret = seccomp_phase1(&sd); + if (ret == SECCOMP_PHASE1_SKIP) { + regs->orig_ax = -1; + ret = 0; + } else if (ret != SECCOMP_PHASE1_OK) { + return ret; /* Go directly to phase 2 */ + } + + work &= ~_TIF_SECCOMP; + } +#endif + + /* Do our best to finish without phase 2. */ + if (work == 0) + return ret; /* seccomp and/or nohz only (ret == 0 here) */ + +#ifdef CONFIG_AUDITSYSCALL + if (work == _TIF_SYSCALL_AUDIT) { + /* + * If there is no more work to be done except auditing, + * then audit in phase 1. Phase 2 always audits, so, if + * we audit here, then we can't go on to phase 2. + */ + do_audit_syscall_entry(regs, arch); + return 0; + } +#endif + + return 1; /* Something is enabled that we can't handle in phase 1 */ +} + +/* Returns the syscall nr to run (which should match regs->orig_ax). */ +long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, + unsigned long phase1_result) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + long ret = 0; + u32 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; + + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + BUG_ON(regs != task_pt_regs(current)); + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state. If we entered on the slow path, TF was already set. + */ + if (work & _TIF_SINGLESTEP) + regs->flags |= X86_EFLAGS_TF; + +#ifdef CONFIG_SECCOMP + /* + * Call seccomp_phase2 before running the other hooks so that + * they can see any changes made by a seccomp tracer. + */ + if (phase1_result > 1 && seccomp_phase2(phase1_result)) { + /* seccomp failures shouldn't expose any additional code. 
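The contract spelled out above: phase 1 returns 0 to run the syscall with no further tracing, 1 to fall through to phase 2 with no seccomp result, and any other value to hand that value to phase 2 as the seccomp verdict. A toy model of that dispatch (plain C, not the kernel functions):

#include <stdio.h>

static unsigned long phase1(unsigned long work)
{
	if (work == 0)
		return 0;	/* fast path: nothing needs the slow path */
	return 1;		/* something is enabled that phase 2 must handle */
}

static long phase2(long nr, unsigned long phase1_result)
{
	(void)phase1_result;	/* would carry the seccomp phase-1 verdict if != 1 */
	return nr;		/* may also return -1 to skip the syscall */
}

static long trace_enter(long nr, unsigned long work)
{
	unsigned long r = phase1(work);

	return r == 0 ? nr : phase2(nr, r);
}

int main(void)
{
	printf("fast path -> nr %ld\n", trace_enter(1, 0));
	printf("slow path -> nr %ld\n", trace_enter(1, 0x10));
	return 0;
}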
*/ + return -1; + } +#endif + + if (unlikely(work & _TIF_SYSCALL_EMU)) + ret = -1L; + + if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + ret = -1L; + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); + + do_audit_syscall_entry(regs, arch); + + return ret ?: regs->orig_ax; +} + +long syscall_trace_enter(struct pt_regs *regs) +{ + u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); + + if (phase1_result == 0) + return regs->orig_ax; + else + return syscall_trace_enter_phase2(regs, arch, phase1_result); +} + +#define EXIT_TO_USERMODE_LOOP_FLAGS \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY) + +static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) +{ + /* + * In order to return to user mode, we need to have IRQs off with + * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, + * _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags + * can be set at any time on preemptable kernels if we have IRQs on, + * so we need to loop. Disabling preemption wouldn't help: doing the + * work to clear some of the flags can sleep. + */ + while (true) { + /* We have work to do. */ + local_irq_enable(); + + if (cached_flags & _TIF_NEED_RESCHED_MASK) + schedule(); + +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (unlikely(current->forced_info.si_signo)) { + struct task_struct *t = current; + force_sig_info(t->forced_info.si_signo, &t->forced_info, t); + t->forced_info.si_signo = 0; + } +#endif + if (cached_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + /* deal with pending signal delivery */ + if (cached_flags & _TIF_SIGPENDING) + do_signal(regs); + + if (cached_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + + if (cached_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + + /* Disable IRQs and retry */ + local_irq_disable(); + + cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + break; + + } +} + +/* Called with IRQs disabled. */ +__visible inline void prepare_exit_to_usermode(struct pt_regs *regs) +{ + u32 cached_flags; + + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) + local_irq_disable(); + + lockdep_sys_exit(); + + cached_flags = + READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + exit_to_usermode_loop(regs, cached_flags); + + user_enter(); +} + +#define SYSCALL_EXIT_WORK_FLAGS \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + +static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) +{ + bool step; + + audit_syscall_exit(regs); + + if (cached_flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). + */ + step = unlikely( + (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) + == _TIF_SINGLESTEP); + if (step || cached_flags & _TIF_SYSCALL_TRACE) + tracehook_report_syscall_exit(regs, step); +} + +/* + * Called with IRQs on and fully valid regs. 
Returns with IRQs off in a + * state such that we can immediately switch to user mode. + */ +__visible inline void syscall_return_slowpath(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + u32 cached_flags = READ_ONCE(ti->flags); + + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && + WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) + local_irq_enable(); + + /* + * First do one-time work. If these work items are enabled, we + * want to run them exactly once per syscall exit with IRQs on. + */ + if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) + syscall_slow_exit_work(regs, cached_flags); + +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. + */ + ti->status &= ~TS_COMPAT; +#endif + + local_irq_disable(); + prepare_exit_to_usermode(regs); +} + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +/* + * Does a 32-bit syscall. Called with IRQs on and does all entry and + * exit work and returns with IRQs off. This function is extremely hot + * in workloads that use it, and it's usually called from + * do_fast_syscall_32, so forcibly inline it to improve performance. + */ +#ifdef CONFIG_X86_32 +/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */ +__visible +#else +/* 64-bit kernels use do_syscall_32_irqs_off() instead. */ +static +#endif +__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + unsigned int nr = (unsigned int)regs->orig_ax; + +#ifdef CONFIG_IA32_EMULATION + ti->status |= TS_COMPAT; +#endif + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { + /* + * Subtlety here: if ptrace pokes something larger than + * 2^32-1 into orig_ax, this truncates it. This may or + * may not be necessary, but it matches the old asm + * behavior. + */ + nr = syscall_trace_enter(regs); + } + + if (likely(nr < IA32_NR_syscalls)) { + /* + * It's possible that a 32-bit syscall implementation + * takes a 64-bit parameter but nonetheless assumes that + * the high bits are zero. Make sure we zero-extend all + * of the args. + */ + regs->ax = ia32_sys_call_table[nr]( + (unsigned int)regs->bx, (unsigned int)regs->cx, + (unsigned int)regs->dx, (unsigned int)regs->si, + (unsigned int)regs->di, (unsigned int)regs->bp); + } + + syscall_return_slowpath(regs); +} + +#ifdef CONFIG_X86_64 +/* Handles INT80 on 64-bit kernels */ +__visible void do_syscall_32_irqs_off(struct pt_regs *regs) +{ + local_irq_enable(); + do_syscall_32_irqs_on(regs); +} +#endif + +/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ +__visible long do_fast_syscall_32(struct pt_regs *regs) +{ + /* + * Called using the internal vDSO SYSENTER/SYSCALL32 calling + * convention. Adjust regs so it looks like we entered using int80. + */ + + unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + vdso_image_32.sym_int80_landing_pad; + + /* + * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward + * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. + * Fix it up. + */ + regs->ip = landing_pad; + + /* + * Fetch EBP from where the vDSO stashed it. + * + * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! + */ + local_irq_enable(); + if ( +#ifdef CONFIG_X86_64 + /* + * Micro-optimization: the pointer we're following is explicitly + * 32 bits, so it can't be out of range. 
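exit_to_usermode_loop() above has to loop because handling one work item with interrupts enabled can raise another; the flags are re-read with interrupts off before the code finally returns to user mode. A user-space model of that retry pattern, with a fake flag word standing in for thread_info->flags:

#include <stdio.h>

#define WORK_RESCHED	0x1
#define WORK_SIGPENDING	0x2
#define WORK_LOOP_MASK	(WORK_RESCHED | WORK_SIGPENDING)

static unsigned int flags = WORK_RESCHED | WORK_SIGPENDING;

static unsigned int read_flags(void) { return flags; }

static void handle(unsigned int cached)
{
	if (cached & WORK_RESCHED) {
		puts("schedule()");
		flags &= ~WORK_RESCHED;
	}
	if (cached & WORK_SIGPENDING) {
		puts("do_signal()");
		flags &= ~WORK_SIGPENDING;
		flags |= WORK_RESCHED;	/* signal delivery made something runnable */
	}
}

int main(void)
{
	unsigned int cached = read_flags();

	while (cached & WORK_LOOP_MASK) {
		/* local_irq_enable() here in the kernel */
		handle(cached);
		/* local_irq_disable(), then re-check before leaving the loop */
		cached = read_flags();
	}
	puts("return to user mode");
	return 0;
}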
+ */ + __get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#else + get_user(*(u32 *)®s->bp, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#endif + ) { + + /* User code screwed up. */ + local_irq_disable(); + regs->ax = -EFAULT; +#ifdef CONFIG_CONTEXT_TRACKING + enter_from_user_mode(); +#endif + prepare_exit_to_usermode(regs); + return 0; /* Keep it simple: use IRET. */ + } + + /* Now this is just like a normal syscall. */ + do_syscall_32_irqs_on(regs); + +#ifdef CONFIG_X86_64 + /* + * Opportunistic SYSRETL: if possible, try to return using SYSRETL. + * SYSRETL is available on all 64-bit CPUs, so we don't need to + * bother with SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. + */ + return regs->cs == __USER32_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; +#else + /* + * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. + * + * We don't allow syscalls at all from VM86 mode, but we still + * need to check VM, because we might be returning from sys_vm86. + */ + return static_cpu_has(X86_FEATURE_SEP) && + regs->cs == __USER_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; +#endif +} +#endif diff --git a/kernel/arch/x86/entry/entry_32.S b/kernel/arch/x86/entry/entry_32.S new file mode 100644 index 000000000..2d722ee01 --- /dev/null +++ b/kernel/arch/x86/entry/entry_32.S @@ -0,0 +1,1097 @@ +/* + * Copyright (C) 1991,1992 Linus Torvalds + * + * entry_32.S contains the system-call and low-level fault and trap handling routines. + * + * Stack layout while running C code: + * ptrace needs to have all registers on the stack. + * If the order here is changed, it needs to be + * updated in fork.c:copy_process(), signal.c:do_signal(), + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + */ + +#include <linux/linkage.h> +#include <linux/err.h> +#include <asm/thread_info.h> +#include <asm/irqflags.h> +#include <asm/errno.h> +#include <asm/segment.h> +#include <asm/smp.h> +#include <asm/page_types.h> +#include <asm/percpu.h> +#include <asm/processor-flags.h> +#include <asm/ftrace.h> +#include <asm/irq_vectors.h> +#include <asm/cpufeature.h> +#include <asm/alternative-asm.h> +#include <asm/asm.h> +#include <asm/smap.h> + + .section .entry.text, "ax" + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). 
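The tail of do_fast_syscall_32() above only takes the SYSRETL/SYSEXIT fast path when the saved segments and IP still look exactly like a vDSO entry and neither TF nor RF is set; anything else falls back to IRET. The same predicate as plain C (the selector values below are illustrative; the real ones come from asm/segment.h):

#include <stdbool.h>
#include <stdio.h>

#define X86_EFLAGS_TF	0x00000100UL	/* trap flag */
#define X86_EFLAGS_RF	0x00010000UL	/* resume flag */

struct regs { unsigned long cs, ss, ip, flags; };

static bool can_sysretl(const struct regs *r,
			unsigned long user32_cs, unsigned long user_ds,
			unsigned long landing_pad)
{
	return r->cs == user32_cs && r->ss == user_ds &&
	       r->ip == landing_pad &&
	       (r->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
}

int main(void)
{
	struct regs r = { .cs = 0x23, .ss = 0x2b, .ip = 0x1000, .flags = 0x202 };

	printf("SYSRETL ok: %s\n",
	       can_sysretl(&r, 0x23, 0x2b, 0x1000) ? "yes" : "no (use IRET)");
	return 0;
}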
+ * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +#ifdef CONFIG_PREEMPT +# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +# define preempt_stop(clobbers) +# define resume_kernel restore_all +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. + */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl $0 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl %gs +.endm + +.macro POP_GS pop=0 +98: popl %gs + .if \pop <> 0 + add $\pop, %esp + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b, 99b) +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b, 99b) +.endm + +.macro GS_TO_REG reg + movl %gs, \reg +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL pt_regs_ax=%eax + cld + PUSH_GS + pushl %fs + pushl %es + pushl %ds + pushl \pt_regs_ax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx +.endm + +.macro RESTORE_INT_REGS + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl %ds +2: popl %es +3: popl %fs + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.popsection + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 5b) + _ASM_EXTABLE(3b, 6b) + POP_GS_EX +.endm + +ENTRY(ret_from_fork) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + + /* When we fork, we trace the syscall return in the child, too. */ + movl %esp, %eax + call syscall_return_slowpath + jmp restore_all +END(ret_from_fork) + +ENTRY(ret_from_kernel_thread) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + pushl $0x0202 # Reset kernel eflags + popfl + movl PT_EBP(%esp), %eax + call *PT_EBX(%esp) + movl $0, PT_EAX(%esp) + + /* + * Kernel threads return to userspace as if returning from a syscall. + * We should check whether anything actually uses this path and, if so, + * consider switching it over to ret_from_fork. 
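The 32-bit SAVE_ALL above pushes segment and general-purpose registers so the stack matches the pt_regs layout documented at the top of entry_32.S (0x00 %ebx ... 0x40 %oldss); with lazy GS, PUSH_GS still pushes a 0 placeholder so the gs slot always exists. A compile-time mirror of that layout (sketch; the canonical struct lives in the uapi ptrace headers):

#include <stddef.h>
#include <stdint.h>

struct pt_regs_32 {
	uint32_t bx, cx, dx, si, di, bp, ax;	/* 0x00 .. 0x18 */
	uint32_t ds, es, fs, gs;		/* 0x1c .. 0x28 */
	uint32_t orig_ax;			/* 0x2c */
	uint32_t ip, cs, flags, sp, ss;		/* 0x30 .. 0x40 */
};

_Static_assert(offsetof(struct pt_regs_32, ax) == 0x18,      "eax at 0x18");
_Static_assert(offsetof(struct pt_regs_32, gs) == 0x28,      "gs at 0x28");
_Static_assert(offsetof(struct pt_regs_32, orig_ax) == 0x2c, "orig_eax at 0x2c");
_Static_assert(offsetof(struct pt_regs_32, ss) == 0x40,      "oldss at 0x40");

int main(void) { return 0; }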
+ */ + movl %esp, %eax + call syscall_return_slowpath + jmp restore_all +ENDPROC(ret_from_kernel_thread) + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from child spawned by kernel_thread(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl %esp, %eax + call prepare_exit_to_usermode + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) +need_resched: + # preempt count == 0 + NEED_RS set? + cmpl $0, PER_CPU_VAR(__preempt_count) +#ifndef CONFIG_PREEMPT_LAZY + jnz restore_all +#else + jz test_int_off + + # atleast preempt count == 0 ? + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count) + jne restore_all + + cmpl $0,TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ? + jnz restore_all + + testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp) + jz restore_all +test_int_off: +#endif + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + + # SYSENTER call handler stub +ENTRY(entry_SYSENTER_32) + movl TSS_sysenter_sp0(%esp), %esp +sysenter_past_esp: + pushl $__USER_DS /* pt_regs->ss */ + pushl %ebp /* pt_regs->sp (stashed in bp) */ + pushfl /* pt_regs->flags (except IF = 0) */ + orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ + pushl $__USER_CS /* pt_regs->cs */ + pushl $0 /* pt_regs->ip = 0 (placeholder) */ + pushl %eax /* pt_regs->orig_ax */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. + */ + TRACE_IRQS_OFF + + movl %esp, %eax + call do_fast_syscall_32 + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + +/* Opportunistic SYSEXIT */ + TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movl PT_EIP(%esp), %edx /* pt_regs->ip */ + movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + popl %ebx /* pt_regs->bx */ + addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ + popl %esi /* pt_regs->si */ + popl %edi /* pt_regs->di */ + popl %ebp /* pt_regs->bp */ + popl %eax /* pt_regs->ax */ + + /* + * Return back to the vDSO, which will pop ecx and edx. + * Don't bother with DS and ES (they already contain __USER_DS). + */ + ENABLE_INTERRUPTS_SYSEXIT + +.pushsection .fixup, "ax" +2: movl $0, PT_FS(%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b, 2b) + PTGS_TO_GS_EX +ENDPROC(entry_SYSENTER_32) + + # system call handler stub +ENTRY(entry_INT80_32) + ASM_CLAC + pushl %eax /* pt_regs->orig_ax */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + + /* + * User mode is traced as though IRQs are on. 
Unlike the 64-bit + * case, INT80 is a trap gate on 32-bit kernels, so interrupts + * are already on (unless user code is messing around with iopl). + */ + + movl %esp, %eax + call do_syscall_32_irqs_on +.Lsyscall_32_done: + +restore_all: + TRACE_IRQS_IRET +restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + /* + * Warning: PT_OLDSS(%esp) contains the wrong/random values if we + * are returning to the kernel. + * See comments in process.c:copy_thread() for details. + */ + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + je ldt_ss # returning to user-space with LDT SS +#endif +restore_nocheck: + RESTORE_REGS 4 # skip orig_eax/error_code +irq_return: + INTERRUPT_RETURN +.section .fixup, "ax" +ENTRY(iret_exc ) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous + _ASM_EXTABLE(irq_return, iret_exc) + +#ifdef CONFIG_X86_ESPFIX32 +ldt_ss: +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck +#endif + +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl $__ESPFIX_SS + pushl %eax /* new kernel esp */ + /* + * Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the IRET: + */ + DISABLE_INTERRUPTS(CLBR_EAX) + lss (%esp), %esp /* switch to espfix segment */ + jmp restore_nocheck +#endif +ENDPROC(entry_INT80_32) + +.macro FIXUP_ESPFIX_STACK +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. 
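The ldt_ss path above builds a new ESP whose high word comes from the 16-bit user stack and whose low word comes from the kernel stack, then patches the ESPFIX segment base so the linear address of the stack top is unchanged even though iret will only restore the low 16 bits of ESP. The identities it relies on, checked in plain C with example values:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t kesp = 0xc1357e40;	/* kernel %esp (example value) */
	uint32_t uesp = 0x00021f80;	/* userspace %esp from the iret frame */

	uint32_t new_esp = (uesp & 0xffff0000u) | (kesp & 0x0000ffffu);
	uint32_t base    = kesp - new_esp;	/* written into the ESPFIX SS descriptor */

	assert((base & 0xffffu) == 0);		 /* only base bits 16..31 are patched in */
	assert(base + new_esp == kesp);		 /* linear address of the stack top is preserved */
	assert((new_esp >> 16) == (uesp >> 16)); /* high word now matches what userspace expects */

	printf("espfix base = %#x, new esp = %#x\n", (unsigned)base, (unsigned)new_esp);
	return 0;
}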
+ */ +#ifdef CONFIG_X86_ESPFIX32 + /* fixup the stack */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ + pushl $__KERNEL_DS + pushl %eax + lss (%esp), %esp /* switch to the normal stack segment */ +#endif +.endm +.macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 + movl %ss, %eax + /* see if on espfix stack */ + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + /* switch to normal stack */ + FIXUP_ESPFIX_STACK +27: +#endif +.endm + +/* + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. + */ + .align 8 +ENTRY(irq_entries_start) + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushl $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt + .align 8 + .endr +END(irq_entries_start) + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + ASM_CLAC + addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ + SAVE_ALL + TRACE_IRQS_OFF + movl %esp, %eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + ASM_CLAC; \ + pushl $~(nr); \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp, %eax; \ + call fn; \ + jmp ret_from_intr; \ +ENDPROC(name) + + +#ifdef CONFIG_TRACING +# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) +#else +# define TRACE_BUILD_INTERRUPT(name, nr) +#endif + +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ + TRACE_BUILD_INTERRUPT(name, nr) + +/* The include is where all of the SMP etc. 
interrupts come from */ +#include <asm/entry_arch.h> + +ENTRY(coprocessor_error) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_error + jmp error_code +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + ASM_CLAC + pushl $0 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ + ALTERNATIVE "pushl $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM +#else + pushl $do_simd_coprocessor_error +#endif + jmp error_code +END(simd_coprocessor_error) + +ENTRY(device_not_available) + ASM_CLAC + pushl $-1 # mark this as an int + pushl $do_device_not_available + jmp error_code +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret + _ASM_EXTABLE(native_iret, iret_exc) +END(native_iret) + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +ENTRY(overflow) + ASM_CLAC + pushl $0 + pushl $do_overflow + jmp error_code +END(overflow) + +ENTRY(bounds) + ASM_CLAC + pushl $0 + pushl $do_bounds + jmp error_code +END(bounds) + +ENTRY(invalid_op) + ASM_CLAC + pushl $0 + pushl $do_invalid_op + jmp error_code +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + ASM_CLAC + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp error_code +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + ASM_CLAC + pushl $do_invalid_TSS + jmp error_code +END(invalid_TSS) + +ENTRY(segment_not_present) + ASM_CLAC + pushl $do_segment_not_present + jmp error_code +END(segment_not_present) + +ENTRY(stack_segment) + ASM_CLAC + pushl $do_stack_segment + jmp error_code +END(stack_segment) + +ENTRY(alignment_check) + ASM_CLAC + pushl $do_alignment_check + jmp error_code +END(alignment_check) + +ENTRY(divide_error) + ASM_CLAC + pushl $0 # no error code + pushl $do_divide_error + jmp error_code +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + ASM_CLAC + pushl $0 + pushl machine_check_vector + jmp error_code +END(machine_check) +#endif + +ENTRY(spurious_interrupt_bug) + ASM_CLAC + pushl $0 + pushl $do_spurious_interrupt_bug + jmp error_code +END(spurious_interrupt_bug) + +#ifdef CONFIG_XEN +/* + * Xen doesn't set %esp to be precisely what the normal SYSENTER + * entry point expects, so fix it up before using the normal path. + */ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp + +ENTRY(xen_hypervisor_callback) + pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + TRACE_IRQS_OFF + + /* + * Check to see if we got the event in the critical + * region in xen_iret_direct, after we've reenabled + * events and checked for pending events. This simulates + * iret instruction's behaviour where it delivers a + * pending interrupt when enabling interrupts: + */ + movl PT_EIP(%esp), %eax + cmpl $xen_iret_start_crit, %eax + jb 1f + cmpl $xen_iret_end_crit, %eax + jae 1f + + jmp xen_iret_crit_fixup + +ENTRY(xen_do_upcall) +1: mov %esp, %eax + call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif + jmp ret_from_intr +ENDPROC(xen_hypervisor_callback) + +/* + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we fix up by reattempting the load, and zeroing the segment + * register if the load fails. + * Category 2 we fix up by jumping to do_iret_error. 
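Back at irq_entries_start/common_interrupt above: each stub pushes (~vector + 0x80) so the immediate fits in a sign-extended byte and the stub stays inside its 8-byte slot, and common_interrupt's addl $-0x80 recovers ~vector, i.e. a value in [-256, -1]. A quick check that the encoding round-trips for the external vector range:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	for (int vector = 0x20; vector <= 0xff; vector++) {	/* FIRST_EXTERNAL_VECTOR is 0x20 */
		int pushed  = ~vector + 0x80;	/* immediate in the stub */
		int decoded = pushed - 0x80;	/* addl $-0x80 in common_interrupt */

		assert(pushed >= -128 && pushed <= 127);	/* fits a sign-extended imm8 */
		assert(decoded == ~vector);			/* lands in [-256, -1] */
		assert(~decoded == vector);			/* do_IRQ can recover the vector */
	}
	puts("vector encoding round-trips for 0x20..0xff");
	return 0;
}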
We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by maintaining a status value in EAX. + */ +ENTRY(xen_failsafe_callback) + pushl %eax + movl $1, %eax +1: mov 4(%esp), %ds +2: mov 8(%esp), %es +3: mov 12(%esp), %fs +4: mov 16(%esp), %gs + /* EAX == 0 => Category 1 (Bad segment) + EAX != 0 => Category 2 (Bad IRET) */ + testl %eax, %eax + popl %eax + lea 16(%esp), %esp + jz 5f + jmp iret_exc +5: pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + jmp ret_from_exception + +.section .fixup, "ax" +6: xorl %eax, %eax + movl %eax, 4(%esp) + jmp 1b +7: xorl %eax, %eax + movl %eax, 8(%esp) + jmp 2b +8: xorl %eax, %eax + movl %eax, 12(%esp) + jmp 3b +9: xorl %eax, %eax + movl %eax, 16(%esp) + jmp 4b +.previous + _ASM_EXTABLE(1b, 6b) + _ASM_EXTABLE(2b, 7b) + _ASM_EXTABLE(3b, 8b) + _ASM_EXTABLE(4b, 9b) +ENDPROC(xen_failsafe_callback) + +BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + xen_evtchn_do_upcall) + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) + +BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + hyperv_vector_handler) + +#endif /* CONFIG_HYPERV */ + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax + movl 0x4(%ebp), %edx + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4, %esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +ftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + pushf /* push flags before compare (in cs location) */ + + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, ®s->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. We move the return ip into the + * ip location, and move flags into the return ip location. 
+ */ + pushl 4(%esp) /* save return ip into ip slot */ + + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 13*4(%esp), %eax /* Get the saved flags */ + movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ + /* clobbering return ip */ + movl $__KERNEL_CS, 13*4(%esp) + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + movl 14*4(%esp), %eax /* Move flags back into cs */ + movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ + movl 12*4(%esp), %eax /* Get return ip from regs->ip */ + movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + addl $8, %esp /* Skip orig_ax and ip */ + popf /* Pop flags at end (no addl to corrupt flags) */ + jmp ftrace_ret + + popf + jmp ftrace_stub +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ + + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif + +#ifdef CONFIG_TRACING +ENTRY(trace_page_fault) + ASM_CLAC + pushl $trace_do_page_fault + jmp error_code +END(trace_page_fault) +#endif + +ENTRY(page_fault) + ASM_CLAC + pushl $do_page_fault + ALIGN +error_code: + /* the function address is in %gs's slot on the stack */ + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + cld + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp, %eax # pt_regs pointer + call *%edi + jmp ret_from_exception +END(page_fault) + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. 
Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + pushfl + pushl $__KERNEL_CS + pushl $sysenter_past_esp +.endm + +ENTRY(debug) + ASM_CLAC + cmpl $entry_SYSENTER_32, (%esp) + jne debug_stack_correct + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn +debug_stack_correct: + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx, %edx # error code 0 + movl %esp, %eax # pt_regs pointer + call do_debug + jmp ret_from_exception +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32 + pushl %eax + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + je nmi_espfix_stack +#endif + cmpl $entry_SYSENTER_32, (%esp) + je nmi_stack_fixup + pushl %eax + movl %esp, %eax + /* + * Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1), %eax + cmpl $(THREAD_SIZE-20), %eax + popl %eax + jae nmi_stack_correct + cmpl $entry_SYSENTER_32, 12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + SAVE_ALL + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace + +nmi_stack_fixup: + FIX_STACK 12, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_debug_stack_check: + cmpw $__KERNEL_CS, 16(%esp) + jne nmi_stack_correct + cmpl $debug, (%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn, (%esp) + ja nmi_stack_correct + FIX_STACK 24, nmi_stack_correct, 1 + jmp nmi_stack_correct + +#ifdef CONFIG_X86_ESPFIX32 +nmi_espfix_stack: + /* + * create the pointer to lss back + */ + pushl %ss + pushl %esp + addl $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + .endr + pushl %eax + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx, %edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + jmp irq_return +#endif +END(nmi) + +ENTRY(int3) + ASM_CLAC + pushl $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx, %edx # zero error code + movl %esp, %eax # pt_regs pointer + call do_int3 + jmp ret_from_exception +END(int3) + +ENTRY(general_protection) + pushl $do_general_protection + jmp error_code +END(general_protection) + +#ifdef CONFIG_KVM_GUEST +ENTRY(async_page_fault) + ASM_CLAC + pushl $do_async_page_fault + jmp error_code +END(async_page_fault) +#endif diff --git a/kernel/arch/x86/kernel/entry_64.S b/kernel/arch/x86/entry/entry_64.S index 8a3445bdf..316081a2c 100644 --- a/kernel/arch/x86/kernel/entry_64.S +++ b/kernel/arch/x86/entry/entry_64.S @@ -4,34 +4,25 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> - */ - -/* + * * 
entry.S contains the system-call and fault low-level handling routines. * * Some of this is documented in Documentation/x86/entry_64.txt * - * NOTE: This code handles signal-recognition, which happens every time - * after an interrupt and after each system call. - * * A note on terminology: - * - iret frame: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. + * - iret frame: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. * * Some macro usage: - * - CFI macros are used to generate dwarf2 unwind information for better - * backtraces. They don't change any code. - * - ENTRY/END Define functions in the symbol table. - * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - idtentry - Define exception entry points. + * - ENTRY/END: Define functions in the symbol table. + * - TRACE_IRQ_*: Trace hardirq state for lock debugging. + * - idtentry: Define exception entry points. */ - #include <linux/linkage.h> #include <asm/segment.h> #include <asm/cache.h> #include <asm/errno.h> -#include <asm/dwarf2.h> -#include <asm/calling.h> +#include "calling.h" #include <asm/asm-offsets.h> #include <asm/msr.h> #include <asm/unistd.h> @@ -42,20 +33,18 @@ #include <asm/paravirt.h> #include <asm/percpu.h> #include <asm/asm.h> -#include <asm/context_tracking.h> #include <asm/smap.h> #include <asm/pgtable_types.h> #include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - - .code64 - .section .entry.text, "ax" +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 +.code64 +.section .entry.text, "ax" #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) @@ -64,11 +53,10 @@ ENTRY(native_usergs_sysret64) ENDPROC(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ - .macro TRACE_IRQS_IRETQ #ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f + bt $9, EFLAGS(%rsp) /* interrupts off? */ + jnc 1f TRACE_IRQS_ON 1: #endif @@ -88,89 +76,34 @@ ENDPROC(native_usergs_sysret64) #if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) .macro TRACE_IRQS_OFF_DEBUG - call debug_stack_set_zero + call debug_stack_set_zero TRACE_IRQS_OFF - call debug_stack_reset + call debug_stack_reset .endm .macro TRACE_IRQS_ON_DEBUG - call debug_stack_set_zero + call debug_stack_set_zero TRACE_IRQS_ON - call debug_stack_reset + call debug_stack_reset .endm .macro TRACE_IRQS_IRETQ_DEBUG - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc 1f + bt $9, EFLAGS(%rsp) /* interrupts off? 
*/ + jnc 1f TRACE_IRQS_ON_DEBUG 1: .endm #else -# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF -# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON -# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ #endif /* - * empty frame - */ - .macro EMPTY_FRAME start=1 offset=0 - .if \start - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,8+\offset - .else - CFI_DEF_CFA_OFFSET 8+\offset - .endif - .endm - -/* - * initial frame state for interrupts (and exceptions without error code) - */ - .macro INTR_FRAME start=1 offset=0 - EMPTY_FRAME \start, 5*8+\offset - /*CFI_REL_OFFSET ss, 4*8+\offset*/ - CFI_REL_OFFSET rsp, 3*8+\offset - /*CFI_REL_OFFSET rflags, 2*8+\offset*/ - /*CFI_REL_OFFSET cs, 1*8+\offset*/ - CFI_REL_OFFSET rip, 0*8+\offset - .endm - -/* - * initial frame state for exceptions with error code (and interrupts - * with vector already pushed) - */ - .macro XCPT_FRAME start=1 offset=0 - INTR_FRAME \start, 1*8+\offset - .endm - -/* - * frame that enables passing a complete pt_regs to a C function. - */ - .macro DEFAULT_FRAME start=1 offset=0 - XCPT_FRAME \start, ORIG_RAX+\offset - CFI_REL_OFFSET rdi, RDI+\offset - CFI_REL_OFFSET rsi, RSI+\offset - CFI_REL_OFFSET rdx, RDX+\offset - CFI_REL_OFFSET rcx, RCX+\offset - CFI_REL_OFFSET rax, RAX+\offset - CFI_REL_OFFSET r8, R8+\offset - CFI_REL_OFFSET r9, R9+\offset - CFI_REL_OFFSET r10, R10+\offset - CFI_REL_OFFSET r11, R11+\offset - CFI_REL_OFFSET rbx, RBX+\offset - CFI_REL_OFFSET rbp, RBP+\offset - CFI_REL_OFFSET r12, R12+\offset - CFI_REL_OFFSET r13, R13+\offset - CFI_REL_OFFSET r14, R14+\offset - CFI_REL_OFFSET r15, R15+\offset - .endm - -/* - * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. + * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * - * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC * are not needed). SYSCALL does not save anything on the stack @@ -186,7 +119,7 @@ ENDPROC(native_usergs_sysret64) * r10 arg3 (needs to be moved to rcx to conform to C ABI) * r8 arg4 * r9 arg5 - * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) + * (note: r12-r15, rbp, rbx are callee-preserved in C ABI) * * Only called from user space. * @@ -195,13 +128,7 @@ ENDPROC(native_usergs_sysret64) * with them due to bugs in both AMD and Intel CPUs. */ -ENTRY(system_call) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - +ENTRY(entry_SYSCALL_64) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, @@ -213,14 +140,14 @@ ENTRY(system_call) * after the swapgs, so that it can do the swapgs * for the guest and jump here on syscall. */ -GLOBAL(system_call_after_swapgs) +GLOBAL(entry_SYSCALL_64_after_swapgs) - movq %rsp,PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(kernel_stack),%rsp + movq %rsp, PER_CPU_VAR(rsp_scratch) + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ - pushq_cfi $__USER_DS /* pt_regs->ss */ - pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ + pushq $__USER_DS /* pt_regs->ss */ + pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ /* * Re-enable interrupts. 
* We use 'rsp_scratch' as a scratch space, hence irq-off block above @@ -229,36 +156,34 @@ GLOBAL(system_call_after_swapgs) * with using rsp_scratch: */ ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %r11 /* pt_regs->flags */ - pushq_cfi $__USER_CS /* pt_regs->cs */ - pushq_cfi %rcx /* pt_regs->ip */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi $-ENOSYS /* pt_regs->ax */ - pushq_cfi_reg r8 /* pt_regs->r8 */ - pushq_cfi_reg r9 /* pt_regs->r9 */ - pushq_cfi_reg r10 /* pt_regs->r10 */ - pushq_cfi_reg r11 /* pt_regs->r11 */ - sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 6*8 - - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys -system_call_fastpath: + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + pushq %r9 /* pt_regs->r9 */ + pushq %r10 /* pt_regs->r10 */ + pushq %r11 /* pt_regs->r11 */ + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz tracesys +entry_SYSCALL_64_fastpath: #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax + cmpq $__NR_syscall_max, %rax #else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax + andl $__SYSCALL_MASK, %eax + cmpl $__NR_syscall_max, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10, %rcx + call *sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: /* * Syscall return path ending with SYSRET (fast path). @@ -279,19 +204,15 @@ system_call_fastpath: * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is * very bad. */ - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ - - CFI_REMEMBER_STATE + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp),%rcx - CFI_REGISTER rip,rcx - movq EFLAGS(%rsp),%r11 - /*CFI_REGISTER rflags,r11*/ - movq RSP(%rsp),%rsp + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + movq RSP(%rsp), %rsp /* - * 64bit SYSRET restores rip from rcx, + * 64-bit SYSRET restores rip from rcx, * rflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. * Restoration of rflags re-enables interrupts. 
@@ -307,25 +228,28 @@ system_call_fastpath: */ USERGS_SYSRET64 - CFI_RESTORE_STATE +GLOBAL(int_ret_from_sys_call_irqs_off) + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + jmp int_ret_from_sys_call /* Do syscall entry tracing */ tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp system_call_fastpath /* and return to the fast path */ + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + call syscall_trace_enter_phase1 + test %rax, %rax + jnz tracesys_phase2 /* if needed, run the slow path */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX(%rsp), %rax + jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ tracesys_phase2: SAVE_EXTRA_REGS - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax,%rdx - call syscall_trace_enter_phase2 + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi + movq %rax, %rdx + call syscall_trace_enter_phase2 /* * Reload registers from stack in case ptrace changed them. @@ -335,15 +259,15 @@ tracesys_phase2: RESTORE_C_REGS_EXCEPT_RAX RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max,%rax + cmpq $__NR_syscall_max, %rax #else - andl $__SYSCALL_MASK,%eax - cmpl $__NR_syscall_max,%eax + andl $__SYSCALL_MASK, %eax + cmpl $__NR_syscall_max, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10,%rcx /* fixup for C */ - call *sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + movq %r10, %rcx /* fixup for C */ + call *sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) 1: /* Use IRET because user could have changed pt_regs->foo */ @@ -352,100 +276,47 @@ tracesys_phase2: * Has correct iret frame. */ GLOBAL(int_ret_from_sys_call) - DISABLE_INTERRUPTS(CLBR_NONE) -int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ - TRACE_IRQS_OFF - movl $_TIF_ALLWORK_MASK,%edi - /* edi: mask to check */ -GLOBAL(int_with_check) - LOCKDEP_SYS_EXIT_IRQ - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%edx - andl %edi,%edx - jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) - jmp syscall_return - - /* Either reschedule or signal or syscall exit tracking needed. */ - /* First do a reschedule test. 
*/ - /* edx: work, edi: workmask */ -int_careful: - testl $_TIF_NEED_RESCHED_MASK,%edx - jz int_very_careful - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi - SCHEDULE_USER - popq_cfi %rdi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - - /* handle signals and tracing -- both require a full pt_regs */ -int_very_careful: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) SAVE_EXTRA_REGS - /* Check for syscall exit trace */ - testl $_TIF_WORK_SYSCALL_EXIT,%edx - jz int_signal - pushq_cfi %rdi - leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace_leave - popq_cfi %rdi - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi - jmp int_restore_rest - -int_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call do_notify_resume -1: movl $_TIF_WORK_MASK,%edi -int_restore_rest: + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - -syscall_return: - /* The IRETQ could re-enable interrupts: */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ + TRACE_IRQS_IRETQ /* we're about to change IF */ /* * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. */ - movq RCX(%rsp),%rcx - cmpq %rcx,RIP(%rsp) /* RCX == RIP */ - jne opportunistic_sysret_failed + movq RCX(%rsp), %rcx + movq RIP(%rsp), %r11 + cmpq %rcx, %r11 /* RCX == RIP */ + jne opportunistic_sysret_failed /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. It's not worth - * testing for canonicalness exactly -- this check detects any - * of the 17 high bits set, which is true for non-canonical - * or kernel addresses. (This will pessimize vsyscall=native. - * Big deal.) + * the kernel, since userspace controls RSP. * - * If virtual addresses ever become wider, this will need + * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. */ .ifne __VIRTUAL_MASK_SHIFT - 47 .error "virtual address width changed -- SYSRET checks need update" .endif - shr $__VIRTUAL_MASK_SHIFT, %rcx - jnz opportunistic_sysret_failed - cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed + /* Change top 16 bits to be the sign-extension of 47th bit */ + shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx + sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx - movq R11(%rsp),%r11 - cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed + /* If this changed %rcx, it was not canonical */ + cmpq %rcx, %r11 + jne opportunistic_sysret_failed + + cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed + + movq R11(%rsp), %r11 + cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed /* * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, @@ -454,47 +325,41 @@ syscall_return: * with register state that satisfies the opportunistic SYSRET * conditions. For example, single-stepping this user code: * - * movq $stuck_here,%rcx + * movq $stuck_here, %rcx * pushfq * popq %r11 * stuck_here: * * would never get past 'stuck_here'. 
*/ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed /* nothing to check for RSP */ - cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed + cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed /* - * We win! This label is here just for ease of understanding - * perf profiles. Nothing jumps here. + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: - CFI_REMEMBER_STATE - /* r11 is already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_R11 - movq RSP(%rsp),%rsp + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RSP(%rsp), %rsp USERGS_SYSRET64 - CFI_RESTORE_STATE opportunistic_sysret_failed: SWAPGS jmp restore_c_regs_and_iret - CFI_ENDPROC -END(system_call) +END(entry_SYSCALL_64) .macro FORK_LIKE func ENTRY(stub_\func) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* offset 8: return address */ SAVE_EXTRA_REGS 8 - jmp sys_\func - CFI_ENDPROC + jmp sys_\func END(stub_\func) .endm @@ -503,8 +368,6 @@ END(stub_\func) FORK_LIKE vfork ENTRY(stub_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execve return_from_execve: testl %eax, %eax @@ -514,11 +377,9 @@ return_from_execve: 1: /* must use IRET code path (pt_regs->cs may have changed) */ addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 ZERO_EXTRA_REGS - movq %rax,RAX(%rsp) + movq %rax, RAX(%rsp) jmp int_ret_from_sys_call - CFI_ENDPROC END(stub_execve) /* * Remaining execve stubs are only 7 bytes long. @@ -526,56 +387,28 @@ END(stub_execve) */ .align 8 GLOBAL(stub_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub_execveat) -#ifdef CONFIG_X86_X32_ABI +#if defined(CONFIG_X86_X32_ABI) .align 8 GLOBAL(stub_x32_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call compat_sys_execve jmp return_from_execve - CFI_ENDPROC END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 call compat_sys_execveat jmp return_from_execve - CFI_ENDPROC END(stub_x32_execveat) #endif -#ifdef CONFIG_IA32_EMULATION - .align 8 -GLOBAL(stub32_execve) - CFI_STARTPROC - call compat_sys_execve - jmp return_from_execve - CFI_ENDPROC -END(stub32_execve) - .align 8 -GLOBAL(stub32_execveat) - CFI_STARTPROC - call compat_sys_execveat - jmp return_from_execve - CFI_ENDPROC -END(stub32_execveat) -#endif - /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. */ ENTRY(stub_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* * SAVE_EXTRA_REGS result is not normally needed: * sigreturn overwrites all pt_regs->GPREGS. @@ -584,24 +417,19 @@ ENTRY(stub_rt_sigreturn) * we SAVE_EXTRA_REGS here. 
*/ SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn + call sys_rt_sigreturn return_from_stub: addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 RESTORE_EXTRA_REGS - movq %rax,RAX(%rsp) - jmp int_ret_from_sys_call - CFI_ENDPROC + movq %rax, RAX(%rsp) + jmp int_ret_from_sys_call END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI ENTRY(stub_x32_rt_sigreturn) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub - CFI_ENDPROC + call sys32_x32_rt_sigreturn + jmp return_from_stub END(stub_x32_rt_sigreturn) #endif @@ -611,36 +439,36 @@ END(stub_x32_rt_sigreturn) * rdi: prev task we switched from */ ENTRY(ret_from_fork) - DEFAULT_FRAME - LOCK ; btr $TIF_FORK,TI_flags(%r8) + LOCK ; btr $TIF_FORK, TI_flags(%r8) - pushq_cfi $0x0002 - popfq_cfi # reset kernel eflags + pushq $0x0002 + popfq /* reset kernel eflags */ - call schedule_tail # rdi: 'prev' task parameter + call schedule_tail /* rdi: 'prev' task parameter */ RESTORE_EXTRA_REGS - testl $3,CS(%rsp) # from kernel_thread? + testb $3, CS(%rsp) /* from kernel_thread? */ /* * By the time we get here, we have no idea whether our pt_regs, * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the ia32entry paths. + * the slow path, or one of the 32-bit compat paths. * Use IRET code path to return, since it can safely handle * all of the above. */ jnz int_ret_from_sys_call - /* We came from kernel_thread */ - /* nb: we depend on RESTORE_EXTRA_REGS above */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) + /* + * We came from kernel_thread + * nb: we depend on RESTORE_EXTRA_REGS above + */ + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call - CFI_ENDPROC + jmp int_ret_from_sys_call END(ret_from_fork) /* @@ -649,16 +477,13 @@ END(ret_from_fork) */ .align 8 ENTRY(irq_entries_start) - INTR_FRAME vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ + pushq $(~vector+0x80) /* Note: always in signed byte range */ vector=vector+1 jmp common_interrupt - CFI_ADJUST_CFA_OFFSET -8 .align 8 .endr - CFI_ENDPROC END(irq_entries_start) /* @@ -672,23 +497,33 @@ END(irq_entries_start) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func cld + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS + + testb $3, CS(%rsp) + jz 1f + + /* + * IRQ from user mode. Switch to kernel gsbase and inform context + * tracking that we're in kernel mode. + */ + SWAPGS + /* - * Since nothing in interrupt handling code touches r12...r15 members - * of "struct pt_regs", and since interrupts can nest, we can save - * four stack slots and simultaneously provide - * an unwind-friendly stack layout by saving "truncated" pt_regs - * exactly up to rbp slot, without these members. + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). Since TRACE_IRQS_OFF idempotent, + * the simplest way to handle it is to just call it twice if + * we enter from user mode. There's no reason to optimize this since + * TRACE_IRQS_OFF is a no-op if lockdep is off. 
*/ - ALLOC_PT_GPREGS_ON_STACK -RBP - SAVE_C_REGS -RBP - /* this goes to 0(%rsp) for unwinder, not for saving the value: */ - SAVE_EXTRA_REGS_RBP -RBP + TRACE_IRQS_OFF - leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ +#ifdef CONFIG_CONTEXT_TRACKING + call enter_from_user_mode +#endif - testl $3, CS-RBP(%rsp) - je 1f - SWAPGS 1: /* * Save previous stack pointer, optionally switch to interrupt stack. @@ -697,24 +532,14 @@ END(irq_entries_start) * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ - movq %rsp, %rsi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - CFI_DEF_CFA_REGISTER rsi - pushq %rsi - /* - * For debugger: - * "CFA (Current Frame Address) is the value on stack + offset" - */ - CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ - 0x77 /* DW_OP_breg7 (rsp) */, 0, \ - 0x06 /* DW_OP_deref */, \ - 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ - 0x22 /* DW_OP_plus */ + movq %rsp, %rdi + incl PER_CPU_VAR(irq_count) + cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp + pushq %rdi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF - call \func + call \func /* rdi points to pt_regs */ .endm /* @@ -723,59 +548,37 @@ END(irq_entries_start) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - XCPT_FRAME ASM_CLAC - addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ interrupt do_IRQ /* 0(%rsp): old RSP */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) + decl PER_CPU_VAR(irq_count) /* Restore saved previous stack */ - popq %rsi - CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ - /* return code expects complete pt_regs - adjust rsp accordingly: */ - leaq -RBP(%rsi),%rsp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET RBP - - testl $3,CS(%rsp) - je retint_kernel - /* Interrupt came from user space */ + popq %rsp - GET_THREAD_INFO(%rcx) - /* - * %rcx: thread info. Interrupts off. - */ -retint_with_reschedule: - movl $_TIF_WORK_MASK,%edi -retint_check: - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - andl %edi,%edx - CFI_REMEMBER_STATE - jnz retint_careful - -retint_swapgs: /* return to user-space */ - /* - * The iretq could re-enable interrupts: - */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ + testb $3, CS(%rsp) + jz retint_kernel + /* Interrupt came from user space */ +GLOBAL(retint_user) + mov %rsp,%rdi + call prepare_exit_to_usermode + TRACE_IRQS_IRETQ SWAPGS - jmp restore_c_regs_and_iret + jmp restore_regs_and_iret /* Returning to kernel space */ retint_kernel: #ifdef CONFIG_PREEMPT /* Interrupts are off */ /* Check if we need preemption */ - bt $9,EFLAGS(%rsp) /* interrupts were off? */ + bt $9, EFLAGS(%rsp) /* were interrupts off? */ jnc 1f -0: cmpl $0,PER_CPU_VAR(__preempt_count) +0: cmpl $0, PER_CPU_VAR(__preempt_count) #ifndef CONFIG_PREEMPT_LAZY jnz 1f #else @@ -806,6 +609,8 @@ do_preempt_schedule_irq: * At this label, code paths which return to kernel and to user, * which come from interrupts/exception and from syscalls, merge. */ +GLOBAL(restore_regs_and_iret) + RESTORE_EXTRA_REGS restore_c_regs_and_iret: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 @@ -817,8 +622,8 @@ ENTRY(native_iret) * 64-bit mode SS:RSP on the exception stack is always valid. 
*/ #ifdef CONFIG_X86_ESPFIX64 - testb $4,(SS-RIP)(%rsp) - jnz native_irq_return_ldt + testb $4, (SS-RIP)(%rsp) + jnz native_irq_return_ldt #endif .global native_irq_return_iret @@ -833,62 +638,29 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: - pushq_cfi %rax - pushq_cfi %rdi + pushq %rax + pushq %rdi SWAPGS - movq PER_CPU_VAR(espfix_waddr),%rdi - movq %rax,(0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp),%rax /* RIP */ - movq %rax,(1*8)(%rdi) - movq (3*8)(%rsp),%rax /* CS */ - movq %rax,(2*8)(%rdi) - movq (4*8)(%rsp),%rax /* RFLAGS */ - movq %rax,(3*8)(%rdi) - movq (6*8)(%rsp),%rax /* SS */ - movq %rax,(5*8)(%rdi) - movq (5*8)(%rsp),%rax /* RSP */ - movq %rax,(4*8)(%rdi) - andl $0xffff0000,%eax - popq_cfi %rdi - orq PER_CPU_VAR(espfix_stack),%rax + movq PER_CPU_VAR(espfix_waddr), %rdi + movq %rax, (0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp), %rax /* RIP */ + movq %rax, (1*8)(%rdi) + movq (3*8)(%rsp), %rax /* CS */ + movq %rax, (2*8)(%rdi) + movq (4*8)(%rsp), %rax /* RFLAGS */ + movq %rax, (3*8)(%rdi) + movq (6*8)(%rsp), %rax /* SS */ + movq %rax, (5*8)(%rdi) + movq (5*8)(%rsp), %rax /* RSP */ + movq %rax, (4*8)(%rdi) + andl $0xffff0000, %eax + popq %rdi + orq PER_CPU_VAR(espfix_stack), %rax SWAPGS - movq %rax,%rsp - popq_cfi %rax - jmp native_irq_return_iret + movq %rax, %rsp + popq %rax + jmp native_irq_return_iret #endif - - /* edi: workmask, edx: work */ -retint_careful: - CFI_RESTORE_STATE - testl $_TIF_NEED_RESCHED_MASK,%edx - jz retint_signal - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq_cfi %rdi - SCHEDULE_USER - popq_cfi %rdi - GET_THREAD_INFO(%rcx) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp retint_check - -retint_signal: - testl $_TIF_DO_NOTIFY_MASK,%edx - jz retint_swapgs - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_EXTRA_REGS - movq $-1,ORIG_RAX(%rsp) - xorl %esi,%esi # oldset - movq %rsp,%rdi # &pt_regs - call do_notify_resume - RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - jmp retint_with_reschedule - - CFI_ENDPROC END(common_interrupt) /* @@ -896,13 +668,11 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) - INTR_FRAME ASM_CLAC - pushq_cfi $~(\num) + pushq $~(\num) .Lcommon_\sym: interrupt \do_sym - jmp ret_from_intr - CFI_ENDPROC + jmp ret_from_intr END(\sym) .endm @@ -924,53 +694,45 @@ trace_apicinterrupt \num \sym .endm #ifdef CONFIG_SMP -apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ - irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt3 REBOOT_VECTOR \ - reboot_interrupt smp_reboot_interrupt +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV -apicinterrupt3 UV_BAU_MESSAGE \ - uv_bau_message_intr1 uv_bau_message_interrupt +apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt #endif -apicinterrupt LOCAL_TIMER_VECTOR \ - apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt X86_PLATFORM_IPI_VECTOR \ - x86_platform_ipi smp_x86_platform_ipi + +apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_HAVE_KVM -apicinterrupt3 POSTED_INTR_VECTOR \ - kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi 
smp_kvm_posted_intr_wakeup_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD -apicinterrupt THRESHOLD_APIC_VECTOR \ - threshold_interrupt smp_threshold_interrupt +apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt +#endif + +#ifdef CONFIG_X86_MCE_AMD +apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt #endif #ifdef CONFIG_X86_THERMAL_VECTOR -apicinterrupt THERMAL_APIC_VECTOR \ - thermal_interrupt smp_thermal_interrupt +apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt #endif #ifdef CONFIG_SMP -apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ - call_function_single_interrupt smp_call_function_single_interrupt -apicinterrupt CALL_FUNCTION_VECTOR \ - call_function_interrupt smp_call_function_interrupt -apicinterrupt RESCHEDULE_VECTOR \ - reschedule_interrupt smp_reschedule_interrupt +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt #endif -apicinterrupt ERROR_APIC_VECTOR \ - error_interrupt smp_error_interrupt -apicinterrupt SPURIOUS_APIC_VECTOR \ - spurious_interrupt smp_spurious_interrupt +apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt #ifdef CONFIG_IRQ_WORK -apicinterrupt IRQ_WORK_VECTOR \ - irq_work_interrupt smp_irq_work_interrupt +apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt #endif /* @@ -985,100 +747,87 @@ ENTRY(\sym) .error "using shift_ist requires paranoid=1" .endif - .if \has_error_code - XCPT_FRAME - .else - INTR_FRAME - .endif - ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 - CFI_REMEMBER_STATE - testl $3, CS(%rsp) /* If coming from userspace, switch */ - jnz 1f /* stacks. */ + testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ + jnz 1f .endif - call paranoid_entry + call paranoid_entry .else - call error_entry + call error_entry .endif /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - DEFAULT_FRAME 0 - .if \paranoid .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ .else TRACE_IRQS_OFF .endif .endif - movq %rsp,%rdi /* pt_regs pointer */ + movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .else - xorl %esi,%esi /* no error code */ + xorl %esi, %esi /* no error code */ .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif - call \do_sym + call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid - jmp paranoid_exit + jmp paranoid_exit .else - jmp error_exit + jmp error_exit .endif .if \paranoid == 1 - CFI_RESTORE_STATE /* * Paranoid entry from userspace. Switch stacks and treat it * as a normal entry. 
This means that paranoid handlers * run in real process context if user_mode(regs). */ 1: - call error_entry + call error_entry - DEFAULT_FRAME 0 - movq %rsp,%rdi /* pt_regs pointer */ - call sync_regs - movq %rax,%rsp /* switch stack */ + movq %rsp, %rdi /* pt_regs pointer */ + call sync_regs + movq %rax, %rsp /* switch stack */ - movq %rsp,%rdi /* pt_regs pointer */ + movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ .else - xorl %esi,%esi /* no error code */ + xorl %esi, %esi /* no error code */ .endif - call \do_sym + call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit /* %ebx: no swapgs flag */ .endif - - CFI_ENDPROC END(\sym) .endm @@ -1093,66 +842,59 @@ idtentry \sym \do_sym has_error_code=\has_error_code .endm #endif -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 - - - /* Reload gs selector with exception handling */ - /* edi: new selector */ +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 + + + /* + * Reload gs selector with exception handling + * edi: new selector + */ ENTRY(native_load_gs_index) - CFI_STARTPROC - pushfq_cfi + pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: - movl %edi,%gs -2: mfence /* workaround */ + movl %edi, %gs +2: mfence /* workaround */ SWAPGS - popfq_cfi + popfq ret - CFI_ENDPROC END(native_load_gs_index) - _ASM_EXTABLE(gs_change,bad_gs) - .section .fixup,"ax" + _ASM_EXTABLE(gs_change, bad_gs) + .section .fixup, "ax" /* running with kernelgs */ bad_gs: - SWAPGS /* switch back to user gs */ - xorl %eax,%eax - movl %eax,%gs - jmp 2b + SWAPGS /* switch back to user gs */ + xorl %eax, %eax + movl %eax, %gs + jmp 2b .previous #ifndef CONFIG_PREEMPT_RT_FULL /* Call softirq on interrupt stack. Interrupts are off. 
*/ ENTRY(do_softirq_own_stack) - CFI_STARTPROC - pushq_cfi %rbp - CFI_REL_OFFSET rbp,0 - mov %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr),%rsp - push %rbp # backlink for old unwinder - call __do_softirq + pushq %rbp + mov %rsp, %rbp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr), %rsp + push %rbp /* frame pointer backlink */ + call __do_softirq leaveq - CFI_RESTORE rbp - CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET -8 - decl PER_CPU_VAR(irq_count) + decl PER_CPU_VAR(irq_count) ret - CFI_ENDPROC END(do_softirq_own_stack) #endif @@ -1172,29 +914,24 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 * existing activation in its critical region -- if so, we pop the current * activation and restart the handler using the previous one. */ -ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) - CFI_STARTPROC +ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ + /* * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ - movq %rdi, %rsp # we don't return, adjust the stack frame - CFI_ENDPROC - DEFAULT_FRAME -11: incl PER_CPU_VAR(irq_count) - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp - cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp - pushq %rbp # backlink for old unwinder - call xen_evtchn_do_upcall - popq %rsp - CFI_DEF_CFA_REGISTER rsp - decl PER_CPU_VAR(irq_count) + movq %rdi, %rsp /* we don't return, adjust the stack frame */ +11: incl PER_CPU_VAR(irq_count) + movq %rsp, %rbp + cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp + pushq %rbp /* frame pointer backlink */ + call xen_evtchn_do_upcall + popq %rsp + decl PER_CPU_VAR(irq_count) #ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall + call xen_maybe_preempt_hcall #endif - jmp error_exit - CFI_ENDPROC + jmp error_exit END(xen_do_hypervisor_callback) /* @@ -1211,51 +948,35 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) - INTR_FRAME 1 (6*8) - /*CFI_REL_OFFSET gs,GS*/ - /*CFI_REL_OFFSET fs,FS*/ - /*CFI_REL_OFFSET es,ES*/ - /*CFI_REL_OFFSET ds,DS*/ - CFI_REL_OFFSET r11,8 - CFI_REL_OFFSET rcx,0 - movw %ds,%cx - cmpw %cx,0x10(%rsp) - CFI_REMEMBER_STATE - jne 1f - movw %es,%cx - cmpw %cx,0x18(%rsp) - jne 1f - movw %fs,%cx - cmpw %cx,0x20(%rsp) - jne 1f - movw %gs,%cx - cmpw %cx,0x28(%rsp) - jne 1f + movl %ds, %ecx + cmpw %cx, 0x10(%rsp) + jne 1f + movl %es, %ecx + cmpw %cx, 0x18(%rsp) + jne 1f + movl %fs, %ecx + cmpw %cx, 0x20(%rsp) + jne 1f + movl %gs, %ecx + cmpw %cx, 0x28(%rsp) + jne 1f /* All segments match their saved values => Category 2 (Bad IRET). */ - movq (%rsp),%rcx - CFI_RESTORE rcx - movq 8(%rsp),%r11 - CFI_RESTORE r11 - addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $0 /* RIP */ - pushq_cfi %r11 - pushq_cfi %rcx - jmp general_protection - CFI_RESTORE_STATE + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $0 /* RIP */ + pushq %r11 + pushq %rcx + jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ - movq (%rsp),%rcx - CFI_RESTORE rcx - movq 8(%rsp),%r11 - CFI_RESTORE r11 - addq $0x30,%rsp - CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $-1 /* orig_ax = -1 => not a system call */ + movq (%rsp), %rcx + movq 8(%rsp), %r11 + addq $0x30, %rsp + pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS - jmp error_exit - CFI_ENDPROC + jmp error_exit END(xen_failsafe_callback) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ @@ -1268,21 +989,25 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry stack_segment do_stack_segment has_error_code=1 +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 + #ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 #endif -idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 + +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 + #ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 +idtentry async_page_fault do_async_page_fault has_error_code=1 #endif + #ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif /* @@ -1291,19 +1016,17 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector( * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 - movl $1,%ebx - movl $MSR_GS_BASE,%ecx + movl $1, %ebx + movl $MSR_GS_BASE, %ecx rdmsr - testl %edx,%edx - js 1f /* negative -> in kernel */ + testl %edx, %edx + js 1f /* negative -> in kernel */ SWAPGS - xorl %ebx,%ebx + xorl %ebx, %ebx 1: ret - CFI_ENDPROC END(paranoid_entry) /* @@ -1315,17 +1038,17 @@ END(paranoid_entry) * in syscall entry), so checking for preemption here would * be complicated. Fortunately, we there's no good reason * to try to handle preemption here. + * + * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) - DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs + testl %ebx, %ebx /* swapgs needed? */ + jnz paranoid_exit_no_swapgs TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore + jmp paranoid_exit_restore paranoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG paranoid_exit_restore: @@ -1333,24 +1056,40 @@ paranoid_exit_restore: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN - CFI_ENDPROC END(paranoid_exit) /* * Save all registers in pt_regs, and switch gs if needed. 
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) - XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 - xorl %ebx,%ebx - testl $3,CS+8(%rsp) - je error_kernelspace -error_swapgs: + xorl %ebx, %ebx + testb $3, CS+8(%rsp) + jz .Lerror_kernelspace + +.Lerror_entry_from_usermode_swapgs: + /* + * We entered from user mode or we're pretending to have entered + * from user mode due to an IRET fault. + */ SWAPGS -error_sti: + +.Lerror_entry_from_usermode_after_swapgs: + /* + * We need to tell lockdep that IRQs are off. We can't do this until + * we fix gsbase, and we should do it before enter_from_user_mode + * (which can take locks). + */ + TRACE_IRQS_OFF +#ifdef CONFIG_CONTEXT_TRACKING + call enter_from_user_mode +#endif + ret + +.Lerror_entry_done: TRACE_IRQS_OFF ret @@ -1360,58 +1099,77 @@ error_sti: * truncated RIP for IRET exceptions returning to compat mode. Check * for these here too. */ -error_kernelspace: - CFI_REL_OFFSET rcx, RCX+8 - incl %ebx - leaq native_irq_return_iret(%rip),%rcx - cmpq %rcx,RIP+8(%rsp) - je error_bad_iret - movl %ecx,%eax /* zero extend */ - cmpq %rax,RIP+8(%rsp) - je bstep_iret - cmpq $gs_change,RIP+8(%rsp) - je error_swapgs - jmp error_sti - -bstep_iret: +.Lerror_kernelspace: + incl %ebx + leaq native_irq_return_iret(%rip), %rcx + cmpq %rcx, RIP+8(%rsp) + je .Lerror_bad_iret + movl %ecx, %eax /* zero extend */ + cmpq %rax, RIP+8(%rsp) + je .Lbstep_iret + cmpq $gs_change, RIP+8(%rsp) + jne .Lerror_entry_done + + /* + * hack: gs_change can fail with user gsbase. If this happens, fix up + * gsbase and proceed. We'll fix up the exception and land in + * gs_change's error handler with kernel gsbase. + */ + jmp .Lerror_entry_from_usermode_swapgs + +.Lbstep_iret: /* Fix truncated RIP */ - movq %rcx,RIP+8(%rsp) + movq %rcx, RIP+8(%rsp) /* fall through */ -error_bad_iret: +.Lerror_bad_iret: + /* + * We came from an IRET to user mode, so we have user gsbase. + * Switch to kernel gsbase: + */ SWAPGS - mov %rsp,%rdi - call fixup_bad_iret - mov %rax,%rsp - decl %ebx /* Return to usergs */ - jmp error_sti - CFI_ENDPROC + + /* + * Pretend that the exception came from user mode: set up pt_regs + * as if we faulted immediately after IRET and clear EBX so that + * error_exit knows that we will be returning to user mode. + */ + mov %rsp, %rdi + call fixup_bad_iret + mov %rax, %rsp + decl %ebx + jmp .Lerror_entry_from_usermode_after_swapgs END(error_entry) -/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ +/* + * On entry, EBS is a "return to kernel mode" flag: + * 1: already in kernel mode, don't need SWAPGS + * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode + */ ENTRY(error_exit) - DEFAULT_FRAME - movl %ebx,%eax - RESTORE_EXTRA_REGS + movl %ebx, %eax DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - testl %eax,%eax - jne retint_kernel - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - jmp retint_swapgs - CFI_ENDPROC + testl %eax, %eax + jnz retint_kernel + jmp retint_user END(error_exit) /* Runs on exception stack */ ENTRY(nmi) - INTR_FRAME + /* + * Fix up the exception frame if we're on Xen. + * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most + * one value to the stack on native, so it may clobber the rdx + * scratch slot, but it won't clobber any of the important + * slots past it. 
+ * + * Xen is a different story, because the Xen frame itself overlaps + * the "NMI executing" variable. + */ PARAVIRT_ADJUST_EXCEPTION_FRAME + /* * We allow breakpoints in NMIs. If a breakpoint occurs, then * the iretq it performs will take us out of NMI context. @@ -1451,8 +1209,7 @@ ENTRY(nmi) */ /* Use %rdx as our temp variable throughout */ - pushq_cfi %rdx - CFI_REL_OFFSET rdx, 0 + pushq %rdx testb $3, CS-RIP+8(%rsp) jz .Lnmi_from_kernel @@ -1463,12 +1220,15 @@ ENTRY(nmi) * we don't want to enable interrupts, because then we'll end * up in an awkward situation in which IRQs are on but NMIs * are off. + * + * We also must not push anything to the stack before switching + * stacks lest we corrupt the "NMI executing" variable. */ - SWAPGS + SWAPGS_UNSAFE_STACK cld movq %rsp, %rdx - movq PER_CPU_VAR(kernel_stack), %rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ pushq 3*8(%rdx) /* pt_regs->flags */ @@ -1496,6 +1256,7 @@ ENTRY(nmi) * due to nesting -- we're on the normal thread stack and we're * done with the NMI stack. */ + movq %rsp, %rdi movq $-1, %rsi call do_nmi @@ -1574,8 +1335,8 @@ ENTRY(nmi) * This will not detect if we interrupted an outer NMI just * before IRET. */ - cmpl $1, -8(%rsp) - je nested_nmi + cmpl $1, -8(%rsp) + je nested_nmi /* * Now test if the previous stack was an NMI stack. This covers @@ -1594,6 +1355,7 @@ ENTRY(nmi) cmpq %rdx, 4*8(%rsp) /* If the stack pointer is above the NMI stack, this is a normal NMI */ ja first_nmi + subq $EXCEPTION_STKSZ, %rdx cmpq %rdx, 4*8(%rsp) /* If it is below the NMI stack, it is a normal NMI */ @@ -1606,55 +1368,60 @@ ENTRY(nmi) /* This is a nested NMI. */ - CFI_REMEMBER_STATE - nested_nmi: /* * Modify the "iret" frame to point to repeat_nmi, forcing another * iteration of NMI handling. */ - leaq -1*8(%rsp), %rdx - movq %rdx, %rsp - CFI_ADJUST_CFA_OFFSET 1*8 - leaq -10*8(%rsp), %rdx - pushq_cfi $__KERNEL_DS - pushq_cfi %rdx - pushfq_cfi - pushq_cfi $__KERNEL_CS - pushq_cfi $repeat_nmi + subq $8, %rsp + leaq -10*8(%rsp), %rdx + pushq $__KERNEL_DS + pushq %rdx + pushfq + pushq $__KERNEL_CS + pushq $repeat_nmi /* Put stack back */ - addq $(6*8), %rsp - CFI_ADJUST_CFA_OFFSET -6*8 + addq $(6*8), %rsp nested_nmi_out: - popq_cfi %rdx - CFI_RESTORE rdx + popq %rdx /* We are returning to kernel mode, so this cannot result in a fault. */ INTERRUPT_RETURN - CFI_RESTORE_STATE first_nmi: /* Restore rdx. */ - movq (%rsp), %rdx - CFI_RESTORE rdx + movq (%rsp), %rdx - /* Set "NMI executing" on the stack. */ - pushq_cfi $1 + /* Make room for "NMI executing". */ + pushq $0 /* Leave room for the "iret" frame */ - subq $(5*8), %rsp - CFI_ADJUST_CFA_OFFSET 5*8 + subq $(5*8), %rsp /* Copy the "original" frame to the "outermost" frame */ .rept 5 - pushq_cfi 11*8(%rsp) + pushq 11*8(%rsp) .endr - CFI_DEF_CFA_OFFSET 5*8 /* Everything up to here is safe from nested NMIs */ +#ifdef CONFIG_DEBUG_ENTRY + /* + * For ease of testing, unmask NMIs right away. Disabled by + * default because IRET is very expensive. + */ + pushq $0 /* SS */ + pushq %rsp /* RSP (minus 8 because of the previous push) */ + addq $8, (%rsp) /* Fix up RSP */ + pushfq /* RFLAGS */ + pushq $__KERNEL_CS /* CS */ + pushq $1f /* RIP */ + INTERRUPT_RETURN /* continues at repeat_nmi below */ +1: +#endif + repeat_nmi: /* * If there was a nested NMI, the first NMI's iret will return @@ -1668,24 +1435,21 @@ repeat_nmi: * RSP is pointing to "outermost RIP". 
gsbase is unknown, but, if * we're repeating an NMI, gsbase has the same value that it had on * the first iteration. paranoid_entry will load the kernel - * gsbase if needed before we call do_nmi. - * - * Set "NMI executing" in case we came back here via IRET. + * gsbase if needed before we call do_nmi. "NMI executing" + * is zero. */ - movq $1, 10*8(%rsp) + movq $1, 10*8(%rsp) /* Set "NMI executing". */ /* * Copy the "outermost" frame to the "iret" frame. NMIs that nest * here must not modify the "iret" frame while we're writing to * it or it will end up containing garbage. */ - addq $(10*8), %rsp - CFI_ADJUST_CFA_OFFSET -10*8 + addq $(10*8), %rsp .rept 5 - pushq_cfi -6*8(%rsp) + pushq -6*8(%rsp) .endr - subq $(5*8), %rsp - CFI_DEF_CFA_OFFSET 5*8 + subq $(5*8), %rsp end_repeat_nmi: /* @@ -1693,7 +1457,7 @@ end_repeat_nmi: * If this happens, then the inner NMI will change the "iret" * frame to point back to repeat_nmi. */ - pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + pushq $-1 /* ORIG_RAX: no syscall to restart */ ALLOC_PT_GPREGS_ON_STACK /* @@ -1703,16 +1467,15 @@ end_repeat_nmi: * setting NEED_RESCHED or anything that normal interrupts and * exceptions might do. */ - call paranoid_entry - DEFAULT_FRAME 0 + call paranoid_entry /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp,%rdi - movq $-1,%rsi - call do_nmi + movq %rsp, %rdi + movq $-1, %rsi + call do_nmi - testl %ebx,%ebx /* swapgs needed? */ - jnz nmi_restore + testl %ebx, %ebx /* swapgs needed? */ + jnz nmi_restore nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: @@ -1739,13 +1502,9 @@ nmi_restore: * mode, so this cannot result in a fault. */ INTERRUPT_RETURN - CFI_ENDPROC END(nmi) ENTRY(ignore_sysret) - CFI_STARTPROC - mov $-ENOSYS,%eax + mov $-ENOSYS, %eax sysret - CFI_ENDPROC END(ignore_sysret) - diff --git a/kernel/arch/x86/entry/entry_64_compat.S b/kernel/arch/x86/entry/entry_64_compat.S new file mode 100644 index 000000000..15cfebaa7 --- /dev/null +++ b/kernel/arch/x86/entry/entry_64_compat.S @@ -0,0 +1,328 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ +#include "calling.h" +#include <asm/asm-offsets.h> +#include <asm/current.h> +#include <asm/errno.h> +#include <asm/ia32_unistd.h> +#include <asm/thread_info.h> +#include <asm/segment.h> +#include <asm/irqflags.h> +#include <asm/asm.h> +#include <asm/smap.h> +#include <linux/linkage.h> +#include <linux/err.h> + + .section .entry.text, "ax" + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) +#endif + +/* + * 32-bit SYSENTER instruction entry. + * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(entry_SYSENTER_compat) + /* Interrupts are off on entry. */ + SWAPGS_UNSAFE_STACK + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. 
Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ + movl %eax, %eax + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp (stashed in bp) */ + + /* + * Push flags. This is nasty. First, interrupts are currently + * off, but we need pt_regs->flags to have IF set. Second, even + * if TF was set when SYSENTER started, it's clear by now. We fix + * that later using TIF_SINGLESTEP. + */ + pushfq /* pt_regs->flags (except IF = 0) */ + orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ + ASM_CLAC /* Clear AC after saving FLAGS */ + + pushq $__USER32_CS /* pt_regs->cs */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->ip = 0 (placeholder) */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp (will be overwritten) */ + pushq %r8 /* pt_regs->r12 = 0 */ + pushq %r8 /* pt_regs->r13 = 0 */ + pushq %r8 /* pt_regs->r14 = 0 */ + pushq %r8 /* pt_regs->r15 = 0 */ + cld + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. + * This needs to happen before enabling interrupts so that + * we don't get preempted with NT set. + * + * NB.: sysenter_fix_flags is a label with the code under it moved + * out-of-line as an optimization: NT is unlikely to be set in the + * majority of the cases and instead of polluting the I$ unnecessarily, + * we're keeping that code behind a branch which will predict as + * not-taken and therefore its instructions won't be fetched. + */ + testl $X86_EFLAGS_NT, EFLAGS(%rsp) + jnz sysenter_fix_flags +sysenter_flags_fixed: + + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. + */ + TRACE_IRQS_OFF + + movq %rsp, %rdi + call do_fast_syscall_32 + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + jmp sysret32_from_system_call + +sysenter_fix_flags: + pushq $X86_EFLAGS_FIXED + popfq + jmp sysenter_flags_fixed +ENDPROC(entry_SYSENTER_compat) + +/* + * 32-bit SYSCALL instruction entry. + * + * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * + * Arguments: + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + */ +ENTRY(entry_SYSCALL_compat) + /* Interrupts are off on entry. 
*/ + SWAPGS_UNSAFE_STACK + + /* Stash user ESP and switch to the kernel stack. */ + movl %esp, %r8d + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax, %eax + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rbp /* pt_regs->cx (stashed in bp) */ + pushq $-ENOSYS /* pt_regs->ax */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp (will be overwritten) */ + pushq %r8 /* pt_regs->r12 = 0 */ + pushq %r8 /* pt_regs->r13 = 0 */ + pushq %r8 /* pt_regs->r14 = 0 */ + pushq %r8 /* pt_regs->r15 = 0 */ + + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. + */ + TRACE_IRQS_OFF + + movq %rsp, %rdi + call do_fast_syscall_32 + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV + + /* Opportunistic SYSRET */ +sysret32_from_system_call: + TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movq RBX(%rsp), %rbx /* pt_regs->rbx */ + movq RBP(%rsp), %rbp /* pt_regs->rbp */ + movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ + movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */ + addq $RAX, %rsp /* Skip r8-r15 */ + popq %rax /* pt_regs->rax */ + popq %rdx /* Skip pt_regs->cx */ + popq %rdx /* pt_regs->dx */ + popq %rsi /* pt_regs->si */ + popq %rdi /* pt_regs->di */ + + /* + * USERGS_SYSRET32 does: + * GSBASE = user's GS base + * EIP = ECX + * RFLAGS = R11 + * CS = __USER32_CS + * SS = __USER_DS + * + * ECX will not match pt_regs->cx, but we're returning to a vDSO + * trampoline that will fix up RCX, so this is okay. + * + * R12-R15 are callee-saved, so they contain whatever was in them + * when the system call started, which is already known to user + * code. We zero R8-R10 to avoid info leaks. + */ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + movq RSP-ORIG_RAX(%rsp), %rsp + USERGS_SYSRET32 +END(entry_SYSCALL_compat) + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts off. + */ + +ENTRY(entry_INT80_compat) + /* + * Interrupts are off on entry. + */ + PARAVIRT_ADJUST_EXCEPTION_FRAME + ASM_CLAC /* Do this early to minimize exposure */ + SWAPGS + + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) 
+ */ + movl %eax, %eax + + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ + cld + + /* + * User mode is traced as though IRQs are on, and the interrupt + * gate turned them off. + */ + TRACE_IRQS_OFF + + movq %rsp, %rdi + call do_syscall_32_irqs_off +.Lsyscall_32_done: + + /* Go back to user mode. */ + TRACE_IRQS_ON + SWAPGS + jmp restore_regs_and_iret +END(entry_INT80_compat) + + ALIGN +GLOBAL(stub32_clone) + /* + * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). + * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). + * + * The native 64-bit kernel's sys_clone() implements the latter, + * so we need to swap arguments here before calling it: + */ + xchg %r8, %rcx + jmp sys_clone diff --git a/kernel/arch/x86/kernel/syscall_32.c b/kernel/arch/x86/entry/syscall_32.c index 3777189c4..9a6649857 100644 --- a/kernel/arch/x86/kernel/syscall_32.c +++ b/kernel/arch/x86/entry/syscall_32.c @@ -4,30 +4,27 @@ #include <linux/sys.h> #include <linux/cache.h> #include <asm/asm-offsets.h> +#include <asm/syscall.h> #ifdef CONFIG_IA32_EMULATION #define SYM(sym, compat) compat #else #define SYM(sym, compat) sym -#define ia32_sys_call_table sys_call_table -#define __NR_ia32_syscall_max __NR_syscall_max #endif -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include <asm/syscalls_32.h> #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), -typedef asmlinkage void (*sys_call_ptr_t)(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -extern asmlinkage void sys_ni_syscall(void); - -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, + [0 ... 
__NR_syscall_compat_max] = &sys_ni_syscall, #include <asm/syscalls_32.h> }; diff --git a/kernel/arch/x86/kernel/syscall_64.c b/kernel/arch/x86/entry/syscall_64.c index 4ac730b37..41283d22b 100644 --- a/kernel/arch/x86/kernel/syscall_64.c +++ b/kernel/arch/x86/entry/syscall_64.c @@ -14,13 +14,13 @@ # define __SYSCALL_X32(nr, sym, compat) /* nothing */ #endif -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include <asm/syscalls_64.h> #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, compat) [nr] = sym, -extern void sys_ni_syscall(void); +extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* diff --git a/kernel/arch/x86/syscalls/Makefile b/kernel/arch/x86/entry/syscalls/Makefile index a55abb9f6..57aa59fd1 100644 --- a/kernel/arch/x86/syscalls/Makefile +++ b/kernel/arch/x86/entry/syscalls/Makefile @@ -1,5 +1,5 @@ -out := $(obj)/../include/generated/asm -uapi := $(obj)/../include/generated/uapi/asm +out := $(obj)/../../include/generated/asm +uapi := $(obj)/../../include/generated/uapi/asm # Create output directory if not already present _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ diff --git a/kernel/arch/x86/syscalls/syscall_32.tbl b/kernel/arch/x86/entry/syscalls/syscall_32.tbl index ef8187f9d..f17705e13 100644 --- a/kernel/arch/x86/syscalls/syscall_32.tbl +++ b/kernel/arch/x86/entry/syscalls/syscall_32.tbl @@ -8,7 +8,7 @@ # 0 i386 restart_syscall sys_restart_syscall 1 i386 exit sys_exit -2 i386 fork sys_fork stub32_fork +2 i386 fork sys_fork sys_fork 3 i386 read sys_read 4 i386 write sys_write 5 i386 open sys_open compat_sys_open @@ -17,7 +17,7 @@ 8 i386 creat sys_creat 9 i386 link sys_link 10 i386 unlink sys_unlink -11 i386 execve sys_execve stub32_execve +11 i386 execve sys_execve compat_sys_execve 12 i386 chdir sys_chdir 13 i386 time sys_time compat_sys_time 14 i386 mknod sys_mknod @@ -125,7 +125,7 @@ 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo 117 i386 ipc sys_ipc compat_sys_ipc 118 i386 fsync sys_fsync -119 i386 sigreturn sys_sigreturn stub32_sigreturn +119 i386 sigreturn sys_sigreturn sys32_sigreturn 120 i386 clone sys_clone stub32_clone 121 i386 setdomainname sys_setdomainname 122 i386 uname sys_newuname @@ -179,7 +179,7 @@ 170 i386 setresgid sys_setresgid16 171 i386 getresgid sys_getresgid16 172 i386 prctl sys_prctl -173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn +173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn 174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 175 i386 rt_sigprocmask sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending @@ -196,7 +196,7 @@ 187 i386 sendfile sys_sendfile compat_sys_sendfile 188 i386 getpmsg 189 i386 putpmsg -190 i386 vfork sys_vfork stub32_vfork +190 i386 vfork sys_vfork sys_vfork 191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit 192 i386 mmap2 sys_mmap_pgoff 193 i386 truncate64 sys_truncate64 sys32_truncate64 @@ -364,4 +364,22 @@ 355 i386 getrandom sys_getrandom 356 i386 memfd_create sys_memfd_create 357 i386 bpf sys_bpf -358 i386 execveat sys_execveat stub32_execveat +358 i386 execveat sys_execveat compat_sys_execveat +359 i386 socket sys_socket +360 i386 socketpair sys_socketpair +361 i386 bind sys_bind +362 i386 connect sys_connect +363 i386 
listen sys_listen +364 i386 accept4 sys_accept4 +365 i386 getsockopt sys_getsockopt compat_sys_getsockopt +366 i386 setsockopt sys_setsockopt compat_sys_setsockopt +367 i386 getsockname sys_getsockname +368 i386 getpeername sys_getpeername +369 i386 sendto sys_sendto +370 i386 sendmsg sys_sendmsg compat_sys_sendmsg +371 i386 recvfrom sys_recvfrom compat_sys_recvfrom +372 i386 recvmsg sys_recvmsg compat_sys_recvmsg +373 i386 shutdown sys_shutdown +374 i386 userfaultfd sys_userfaultfd +375 i386 membarrier sys_membarrier +376 i386 mlock2 sys_mlock2 diff --git a/kernel/arch/x86/syscalls/syscall_64.tbl b/kernel/arch/x86/entry/syscalls/syscall_64.tbl index 9ef32d5f1..314a90bfc 100644 --- a/kernel/arch/x86/syscalls/syscall_64.tbl +++ b/kernel/arch/x86/entry/syscalls/syscall_64.tbl @@ -329,6 +329,9 @@ 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf 322 64 execveat stub_execveat +323 common userfaultfd sys_userfaultfd +324 common membarrier sys_membarrier +325 common mlock2 sys_mlock2 # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/kernel/arch/x86/syscalls/syscallhdr.sh b/kernel/arch/x86/entry/syscalls/syscallhdr.sh index 31fd5f1f3..31fd5f1f3 100644 --- a/kernel/arch/x86/syscalls/syscallhdr.sh +++ b/kernel/arch/x86/entry/syscalls/syscallhdr.sh diff --git a/kernel/arch/x86/syscalls/syscalltbl.sh b/kernel/arch/x86/entry/syscalls/syscalltbl.sh index 0e7f8ec07..0e7f8ec07 100644 --- a/kernel/arch/x86/syscalls/syscalltbl.sh +++ b/kernel/arch/x86/entry/syscalls/syscalltbl.sh diff --git a/kernel/arch/x86/lib/thunk_32.S b/kernel/arch/x86/entry/thunk_32.S index 5eb715087..e5a17114a 100644 --- a/kernel/arch/x86/lib/thunk_32.S +++ b/kernel/arch/x86/entry/thunk_32.S @@ -6,16 +6,14 @@ */ #include <linux/linkage.h> #include <asm/asm.h> - #include <asm/dwarf2.h> /* put return address in eax (arg1) */ .macro THUNK name, func, put_ret_addr_in_eax=0 .globl \name \name: - CFI_STARTPROC - pushl_cfi_reg eax - pushl_cfi_reg ecx - pushl_cfi_reg edx + pushl %eax + pushl %ecx + pushl %edx .if \put_ret_addr_in_eax /* Place EIP in the arg1 */ @@ -23,11 +21,10 @@ .endif call \func - popl_cfi_reg edx - popl_cfi_reg ecx - popl_cfi_reg eax + popl %edx + popl %ecx + popl %eax ret - CFI_ENDPROC _ASM_NOKPROBE(\name) .endm @@ -38,8 +35,6 @@ #ifdef CONFIG_PREEMPT THUNK ___preempt_schedule, preempt_schedule -#ifdef CONFIG_CONTEXT_TRACKING - THUNK ___preempt_schedule_context, preempt_schedule_context -#endif + THUNK ___preempt_schedule_notrace, preempt_schedule_notrace #endif diff --git a/kernel/arch/x86/lib/thunk_64.S b/kernel/arch/x86/entry/thunk_64.S index f89ba4e93..efb2b932b 100644 --- a/kernel/arch/x86/lib/thunk_64.S +++ b/kernel/arch/x86/entry/thunk_64.S @@ -6,35 +6,32 @@ * Subject to the GNU public license, v.2. No warranty of any kind. */ #include <linux/linkage.h> -#include <asm/dwarf2.h> -#include <asm/calling.h> +#include "calling.h" #include <asm/asm.h> /* rdi: arg1 ... normal C conventions. rax is saved/restored. 
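 * Each thunk defined below saves and restores every caller-clobbered register around the call, so it can be used from code that still has live values in those registers.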
*/ .macro THUNK name, func, put_ret_addr_in_rdi=0 .globl \name \name: - CFI_STARTPROC /* this one pushes 9 elems, the next one would be %rIP */ - pushq_cfi_reg rdi - pushq_cfi_reg rsi - pushq_cfi_reg rdx - pushq_cfi_reg rcx - pushq_cfi_reg rax - pushq_cfi_reg r8 - pushq_cfi_reg r9 - pushq_cfi_reg r10 - pushq_cfi_reg r11 + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 .if \put_ret_addr_in_rdi /* 9*8(%rsp) is return addr on stack */ - movq_cfi_restore 9*8, rdi + movq 9*8(%rsp), %rdi .endif call \func jmp restore - CFI_ENDPROC _ASM_NOKPROBE(\name) .endm @@ -49,27 +46,22 @@ #ifdef CONFIG_PREEMPT THUNK ___preempt_schedule, preempt_schedule -#ifdef CONFIG_CONTEXT_TRACKING - THUNK ___preempt_schedule_context, preempt_schedule_context -#endif + THUNK ___preempt_schedule_notrace, preempt_schedule_notrace #endif #if defined(CONFIG_TRACE_IRQFLAGS) \ || defined(CONFIG_DEBUG_LOCK_ALLOC) \ || defined(CONFIG_PREEMPT) - CFI_STARTPROC - CFI_ADJUST_CFA_OFFSET 9*8 restore: - popq_cfi_reg r11 - popq_cfi_reg r10 - popq_cfi_reg r9 - popq_cfi_reg r8 - popq_cfi_reg rax - popq_cfi_reg rcx - popq_cfi_reg rdx - popq_cfi_reg rsi - popq_cfi_reg rdi + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi ret - CFI_ENDPROC _ASM_NOKPROBE(restore) #endif diff --git a/kernel/arch/x86/vdso/.gitignore b/kernel/arch/x86/entry/vdso/.gitignore index aae8ffdd5..aae8ffdd5 100644 --- a/kernel/arch/x86/vdso/.gitignore +++ b/kernel/arch/x86/entry/vdso/.gitignore diff --git a/kernel/arch/x86/vdso/Makefile b/kernel/arch/x86/entry/vdso/Makefile index e97032069..265c0ed68 100644 --- a/kernel/arch/x86/vdso/Makefile +++ b/kernel/arch/x86/entry/vdso/Makefile @@ -8,7 +8,7 @@ KASAN_SANITIZE := n VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y VDSO32-$(CONFIG_X86_32) := y -VDSO32-$(CONFIG_COMPAT) := y +VDSO32-$(CONFIG_IA32_EMULATION) := y # files to link into the vdso vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o @@ -19,9 +19,7 @@ obj-y += vma.o # vDSO images to build vdso_img-$(VDSO64-y) += 64 vdso_img-$(VDSOX32-y) += x32 -vdso_img-$(VDSO32-y) += 32-int80 -vdso_img-$(CONFIG_COMPAT) += 32-syscall -vdso_img-$(VDSO32-y) += 32-sysenter +vdso_img-$(VDSO32-y) += 32 obj-$(VDSO32-y) += vdso32-setup.o @@ -69,7 +67,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(vobjs): KBUILD_CFLAGS += $(CFL) @@ -122,15 +120,6 @@ $(obj)/%.so: $(obj)/%.so.dbg $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) -# -# Build multiple 32-bit vDSO images to choose from at boot time. 
-# -vdso32.so-$(VDSO32-y) += int80 -vdso32.so-$(CONFIG_COMPAT) += syscall -vdso32.so-$(VDSO32-y) += sysenter - -vdso32-images = $(vdso32.so-y:%=vdso32-%.so) - CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 @@ -139,14 +128,12 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds -targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) +targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o targets += vdso32/vclock_gettime.o -$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) - -KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) -$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO +$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32 KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) @@ -157,13 +144,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) +$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) -$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ - $(obj)/vdso32/vdso32.lds \ - $(obj)/vdso32/vclock_gettime.o \ - $(obj)/vdso32/note.o \ - $(obj)/vdso32/%.o +$(obj)/vdso32.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/note.o \ + $(obj)/vdso32/system_call.o $(call if_changed,vdso) # @@ -175,7 +162,7 @@ quiet_cmd_vdso = VDSO $@ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' -VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \ $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) GCOV_PROFILE := n @@ -206,4 +193,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE PHONY += vdso_install $(vdso_img_insttargets) vdso_install: $(vdso_img_insttargets) FORCE -clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so* +clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* diff --git a/kernel/arch/x86/vdso/checkundef.sh b/kernel/arch/x86/entry/vdso/checkundef.sh index 7ee90a9b5..7ee90a9b5 100755 --- a/kernel/arch/x86/vdso/checkundef.sh +++ b/kernel/arch/x86/entry/vdso/checkundef.sh diff --git a/kernel/arch/x86/vdso/vclock_gettime.c b/kernel/arch/x86/entry/vdso/vclock_gettime.c index 979332275..ca94fa649 100644 --- a/kernel/arch/x86/vdso/vclock_gettime.c +++ b/kernel/arch/x86/entry/vdso/vclock_gettime.c @@ -175,20 +175,8 @@ static notrace cycle_t vread_pvclock(int *mode) notrace static cycle_t vread_tsc(void) { - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. 
- */ - rdtsc_barrier(); - ret = (cycle_t)__native_read_tsc(); - - last = gtod->cycle_last; + cycle_t ret = (cycle_t)rdtsc_ordered(); + u64 last = gtod->cycle_last; if (likely(ret >= last)) return ret; diff --git a/kernel/arch/x86/vdso/vdso-layout.lds.S b/kernel/arch/x86/entry/vdso/vdso-layout.lds.S index de2c92102..de2c92102 100644 --- a/kernel/arch/x86/vdso/vdso-layout.lds.S +++ b/kernel/arch/x86/entry/vdso/vdso-layout.lds.S diff --git a/kernel/arch/x86/vdso/vdso-note.S b/kernel/arch/x86/entry/vdso/vdso-note.S index 79a071e43..79a071e43 100644 --- a/kernel/arch/x86/vdso/vdso-note.S +++ b/kernel/arch/x86/entry/vdso/vdso-note.S diff --git a/kernel/arch/x86/vdso/vdso.lds.S b/kernel/arch/x86/entry/vdso/vdso.lds.S index 680793264..680793264 100644 --- a/kernel/arch/x86/vdso/vdso.lds.S +++ b/kernel/arch/x86/entry/vdso/vdso.lds.S diff --git a/kernel/arch/x86/vdso/vdso2c.c b/kernel/arch/x86/entry/vdso/vdso2c.c index 8627db24a..785d9922b 100644 --- a/kernel/arch/x86/vdso/vdso2c.c +++ b/kernel/arch/x86/entry/vdso/vdso2c.c @@ -98,10 +98,10 @@ struct vdso_sym required_syms[] = { "VDSO_FAKE_SECTION_TABLE_END", false }, {"VDSO32_NOTE_MASK", true}, - {"VDSO32_SYSENTER_RETURN", true}, {"__kernel_vsyscall", true}, {"__kernel_sigreturn", true}, {"__kernel_rt_sigreturn", true}, + {"int80_landing_pad", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) diff --git a/kernel/arch/x86/vdso/vdso2c.h b/kernel/arch/x86/entry/vdso/vdso2c.h index 022498755..022498755 100644 --- a/kernel/arch/x86/vdso/vdso2c.h +++ b/kernel/arch/x86/entry/vdso/vdso2c.h diff --git a/kernel/arch/x86/vdso/vdso32-setup.c b/kernel/arch/x86/entry/vdso/vdso32-setup.c index e904c2705..08a317a9a 100644 --- a/kernel/arch/x86/vdso/vdso32-setup.c +++ b/kernel/arch/x86/entry/vdso/vdso32-setup.c @@ -48,35 +48,9 @@ __setup("vdso32=", vdso32_setup); __setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif -#ifdef CONFIG_X86_64 - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) -#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) - -#else /* CONFIG_X86_32 */ - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) -#define vdso32_syscall() (0) - -#endif /* CONFIG_X86_64 */ - -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) -const struct vdso_image *selected_vdso32; -#endif - int __init sysenter_setup(void) { -#ifdef CONFIG_COMPAT - if (vdso32_syscall()) - selected_vdso32 = &vdso_image_32_syscall; - else -#endif - if (vdso32_sysenter()) - selected_vdso32 = &vdso_image_32_sysenter; - else - selected_vdso32 = &vdso_image_32_int80; - - init_vdso_image(selected_vdso32); + init_vdso_image(&vdso_image_32); return 0; } diff --git a/kernel/arch/x86/vdso/vdso32/.gitignore b/kernel/arch/x86/entry/vdso/vdso32/.gitignore index e45fba9d0..e45fba9d0 100644 --- a/kernel/arch/x86/vdso/vdso32/.gitignore +++ b/kernel/arch/x86/entry/vdso/vdso32/.gitignore diff --git a/kernel/arch/x86/vdso/vdso32/note.S b/kernel/arch/x86/entry/vdso/vdso32/note.S index c83f25734..c83f25734 100644 --- a/kernel/arch/x86/vdso/vdso32/note.S +++ b/kernel/arch/x86/entry/vdso/vdso32/note.S diff --git a/kernel/arch/x86/vdso/vdso32/sigreturn.S b/kernel/arch/x86/entry/vdso/vdso32/sigreturn.S index d7ec4e251..d7ec4e251 100644 --- a/kernel/arch/x86/vdso/vdso32/sigreturn.S +++ b/kernel/arch/x86/entry/vdso/vdso32/sigreturn.S diff --git a/kernel/arch/x86/entry/vdso/vdso32/system_call.S b/kernel/arch/x86/entry/vdso/vdso32/system_call.S new file mode 100644 index 000000000..3a1d92970 --- /dev/null +++ 
b/kernel/arch/x86/entry/vdso/vdso32/system_call.S @@ -0,0 +1,89 @@ +/* + * AT_SYSINFO entry point +*/ + +#include <asm/dwarf2.h> +#include <asm/cpufeature.h> +#include <asm/alternative-asm.h> + +/* + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#include "sigreturn.S" + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: + CFI_STARTPROC + /* + * Reshuffle regs so that any of the entry instructions + * will preserve enough state. + * + * A really nice entry sequence would be: + * pushl %edx + * pushl %ecx + * movl %esp, %ecx + * + * Unfortunately, naughty Android versions between July and December + * 2015 actually hardcode the traditional Linux SYSENTER entry + * sequence. That is severely broken for a number of reasons (ask + * anyone with an AMD CPU, for example). Nonetheless, we try to keep + * it working approximately as well as it ever worked. + * + * This link may elucidate some of the history: + * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7 + * Personally, I find it hard to understand what's going on there. + * + * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE. + * Execute an indirect call to the address in the AT_SYSINFO auxv + * entry. That is the ONLY correct way to make a fast 32-bit system + * call on Linux. (Open-coding int $0x80 is also fine, but it's + * slow.) + */ + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + + #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter" + #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall" + +#ifdef CONFIG_X86_64 + /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ + ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \ + SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32 +#else + ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP +#endif + + /* Enter using int $0x80 */ + int $0x80 +GLOBAL(int80_landing_pad) + + /* + * Restore EDX and ECX in case they were clobbered. EBP is not + * clobbered (the kernel restores it), but it's cleaner and + * probably faster to pop it than to adjust ESP using addl.
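+ * (The kernel's opportunistic SYSRET path comes back here with ECX pointing at this landing pad rather than holding the caller's value, as noted in entry_SYSCALL_compat above, so the only copy of the caller's ECX is the one saved on the stack at entry.)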
+ */ + popl %ebp + CFI_RESTORE ebp + CFI_ADJUST_CFA_OFFSET -4 + popl %edx + CFI_RESTORE edx + CFI_ADJUST_CFA_OFFSET -4 + popl %ecx + CFI_RESTORE ecx + CFI_ADJUST_CFA_OFFSET -4 + ret + CFI_ENDPROC + + .size __kernel_vsyscall,.-__kernel_vsyscall + .previous diff --git a/kernel/arch/x86/vdso/vdso32/vclock_gettime.c b/kernel/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 175cc72c0..87a86e017 100644 --- a/kernel/arch/x86/vdso/vdso32/vclock_gettime.c +++ b/kernel/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -14,11 +14,13 @@ */ #undef CONFIG_64BIT #undef CONFIG_X86_64 +#undef CONFIG_PGTABLE_LEVELS #undef CONFIG_ILLEGAL_POINTER_VALUE #undef CONFIG_SPARSEMEM_VMEMMAP #undef CONFIG_NR_CPUS #define CONFIG_X86_32 1 +#define CONFIG_PGTABLE_LEVELS 2 #define CONFIG_PAGE_OFFSET 0 #define CONFIG_ILLEGAL_POINTER_VALUE 0 #define CONFIG_NR_CPUS 1 diff --git a/kernel/arch/x86/vdso/vdso32/vdso-fakesections.c b/kernel/arch/x86/entry/vdso/vdso32/vdso-fakesections.c index 541468e25..541468e25 100644 --- a/kernel/arch/x86/vdso/vdso32/vdso-fakesections.c +++ b/kernel/arch/x86/entry/vdso/vdso32/vdso-fakesections.c diff --git a/kernel/arch/x86/vdso/vdso32/vdso32.lds.S b/kernel/arch/x86/entry/vdso/vdso32/vdso32.lds.S index 31056cf29..31056cf29 100644 --- a/kernel/arch/x86/vdso/vdso32/vdso32.lds.S +++ b/kernel/arch/x86/entry/vdso/vdso32/vdso32.lds.S diff --git a/kernel/arch/x86/vdso/vdsox32.lds.S b/kernel/arch/x86/entry/vdso/vdsox32.lds.S index 697c11ece..697c11ece 100644 --- a/kernel/arch/x86/vdso/vdsox32.lds.S +++ b/kernel/arch/x86/entry/vdso/vdsox32.lds.S diff --git a/kernel/arch/x86/vdso/vgetcpu.c b/kernel/arch/x86/entry/vdso/vgetcpu.c index 8ec3d1f4c..8ec3d1f4c 100644 --- a/kernel/arch/x86/vdso/vgetcpu.c +++ b/kernel/arch/x86/entry/vdso/vgetcpu.c diff --git a/kernel/arch/x86/vdso/vma.c b/kernel/arch/x86/entry/vdso/vma.c index 1c9f750c3..64df47148 100644 --- a/kernel/arch/x86/vdso/vma.c +++ b/kernel/arch/x86/entry/vdso/vma.c @@ -177,24 +177,13 @@ up_fail: return ret; } -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static int load_vdso32(void) { - int ret; - if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; - ret = map_vdso(selected_vdso32, false); - if (ret) - return ret; - - if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) - current_thread_info()->sysenter_return = - current->mm->context.vdso + - selected_vdso32->sym_VDSO32_SYSENTER_RETURN; - - return 0; + return map_vdso(&vdso_image_32, false); } #endif @@ -219,8 +208,11 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm, return map_vdso(&vdso_image_x32, true); } #endif - +#ifdef CONFIG_IA32_EMULATION return load_vdso32(); +#else + return 0; +#endif } #endif #else diff --git a/kernel/arch/x86/entry/vsyscall/Makefile b/kernel/arch/x86/entry/vsyscall/Makefile new file mode 100644 index 000000000..a9f4856f6 --- /dev/null +++ b/kernel/arch/x86/entry/vsyscall/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the x86 low level vsyscall code +# +obj-y := vsyscall_gtod.o + +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o + diff --git a/kernel/arch/x86/kernel/vsyscall_64.c b/kernel/arch/x86/entry/vsyscall/vsyscall_64.c index 2dcc6ff6f..174c25499 100644 --- a/kernel/arch/x86/kernel/vsyscall_64.c +++ b/kernel/arch/x86/entry/vsyscall/vsyscall_64.c @@ -38,7 +38,14 @@ #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = +#if 
defined(CONFIG_LEGACY_VSYSCALL_NATIVE) + NATIVE; +#elif defined(CONFIG_LEGACY_VSYSCALL_NONE) + NONE; +#else + EMULATE; +#endif static int __init vsyscall_setup(char *str) { @@ -277,7 +284,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma) { return "[vsyscall]"; } -static struct vm_operations_struct gate_vma_ops = { +static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; static struct vm_area_struct gate_vma = { @@ -290,7 +297,7 @@ static struct vm_area_struct gate_vma = { struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { -#ifdef CONFIG_IA32_EMULATION +#ifdef CONFIG_COMPAT if (!mm || mm->context.ia32_compat) return NULL; #endif diff --git a/kernel/arch/x86/kernel/vsyscall_emu_64.S b/kernel/arch/x86/entry/vsyscall/vsyscall_emu_64.S index c9596a9af..c9596a9af 100644 --- a/kernel/arch/x86/kernel/vsyscall_emu_64.S +++ b/kernel/arch/x86/entry/vsyscall/vsyscall_emu_64.S diff --git a/kernel/arch/x86/kernel/vsyscall_gtod.c b/kernel/arch/x86/entry/vsyscall/vsyscall_gtod.c index 51e330416..51e330416 100644 --- a/kernel/arch/x86/kernel/vsyscall_gtod.c +++ b/kernel/arch/x86/entry/vsyscall/vsyscall_gtod.c diff --git a/kernel/arch/x86/kernel/vsyscall_trace.h b/kernel/arch/x86/entry/vsyscall/vsyscall_trace.h index a8b2edec5..9dd7359a3 100644 --- a/kernel/arch/x86/kernel/vsyscall_trace.h +++ b/kernel/arch/x86/entry/vsyscall/vsyscall_trace.h @@ -24,6 +24,6 @@ TRACE_EVENT(emulate_vsyscall, #endif #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../arch/x86/kernel +#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ #define TRACE_INCLUDE_FILE vsyscall_trace #include <trace/define_trace.h> diff --git a/kernel/arch/x86/ia32/Makefile b/kernel/arch/x86/ia32/Makefile index bb635c641..cd4339bae 100644 --- a/kernel/arch/x86/ia32/Makefile +++ b/kernel/arch/x86/ia32/Makefile @@ -2,7 +2,7 @@ # Makefile for the ia32 kernel emulation subsystem. # -obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o +obj-$(CONFIG_IA32_EMULATION) := sys_ia32.o ia32_signal.o obj-$(CONFIG_IA32_AOUT) += ia32_aout.o diff --git a/kernel/arch/x86/ia32/ia32_signal.c b/kernel/arch/x86/ia32/ia32_signal.c index c81d35e6c..0552884da 100644 --- a/kernel/arch/x86/ia32/ia32_signal.c +++ b/kernel/arch/x86/ia32/ia32_signal.c @@ -21,12 +21,12 @@ #include <linux/binfmts.h> #include <asm/ucontext.h> #include <asm/uaccess.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> #include <asm/ptrace.h> #include <asm/ia32_unistd.h> #include <asm/user32.h> -#include <asm/sigcontext32.h> +#include <uapi/asm/sigcontext.h> #include <asm/proto.h> #include <asm/vdso.h> #include <asm/sigframe.h> @@ -34,99 +34,6 @@ #include <asm/sys_ia32.h> #include <asm/smap.h> -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ - int err = 0; - bool ia32 = test_thread_flag(TIF_IA32); - - if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) - return -EFAULT; - - put_user_try { - /* If you change siginfo_t structure, please make sure that - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. 
*/ - put_user_ex(from->si_signo, &to->si_signo); - put_user_ex(from->si_errno, &to->si_errno); - put_user_ex((short)from->si_code, &to->si_code); - - if (from->si_code < 0) { - put_user_ex(from->si_pid, &to->si_pid); - put_user_ex(from->si_uid, &to->si_uid); - put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); - } else { - /* - * First 32bits of unions are always present: - * si_pid === si_band === si_tid === si_addr(LS half) - */ - put_user_ex(from->_sifields._pad[0], - &to->_sifields._pad[0]); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: - break; - case __SI_SYS >> 16: - put_user_ex(from->si_syscall, &to->si_syscall); - put_user_ex(from->si_arch, &to->si_arch); - break; - case __SI_CHLD >> 16: - if (ia32) { - put_user_ex(from->si_utime, &to->si_utime); - put_user_ex(from->si_stime, &to->si_stime); - } else { - put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); - put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); - } - put_user_ex(from->si_status, &to->si_status); - /* FALL THROUGH */ - default: - case __SI_KILL >> 16: - put_user_ex(from->si_uid, &to->si_uid); - break; - case __SI_POLL >> 16: - put_user_ex(from->si_fd, &to->si_fd); - break; - case __SI_TIMER >> 16: - put_user_ex(from->si_overrun, &to->si_overrun); - put_user_ex(ptr_to_compat(from->si_ptr), - &to->si_ptr); - break; - /* This is not generated by the kernel as of now. */ - case __SI_RT >> 16: - case __SI_MESGQ >> 16: - put_user_ex(from->si_uid, &to->si_uid); - put_user_ex(from->si_int, &to->si_int); - break; - } - } - } put_user_catch(err); - - return err; -} - -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - int err = 0; - u32 ptr32; - - if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) - return -EFAULT; - - get_user_try { - get_user_ex(to->si_signo, &from->si_signo); - get_user_ex(to->si_errno, &from->si_errno); - get_user_ex(to->si_code, &from->si_code); - - get_user_ex(to->si_pid, &from->si_pid); - get_user_ex(to->si_uid, &from->si_uid); - get_user_ex(ptr32, &from->si_ptr); - to->si_ptr = compat_ptr(ptr32); - } get_user_catch(err); - - return err; -} - /* * Do a signal return; undo the signal stack. */ @@ -161,7 +68,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) } static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_ia32 __user *sc) + struct sigcontext_32 __user *sc) { unsigned int tmpflags, err = 0; void __user *buf; @@ -198,7 +105,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, buf = compat_ptr(tmp); } get_user_catch(err); - err |= restore_xstate_sig(buf, 1); + err |= fpu__restore_sig(buf, 1); force_iret(); @@ -263,7 +170,7 @@ badframe: * Set up a signal frame. 
*/ -static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, +static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned int mask) { @@ -308,6 +215,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { + struct fpu *fpu = ¤t->thread.fpu; unsigned long sp; /* Default to using normal stack */ @@ -322,12 +230,12 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; - if (used_math()) { + if (fpu->fpstate_active) { unsigned long fx_aligned, math_size; - sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size); - *fpstate = (struct _fpstate_ia32 __user *) sp; - if (save_xstate_sig(*fpstate, (void __user *)fx_aligned, + sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); + *fpstate = (struct _fpstate_32 __user *) sp; + if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, math_size) < 0) return (void __user *) -1L; } @@ -381,7 +289,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, /* Return stub is in 32bit vsyscall page */ if (current->mm->context.vdso) restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_sigreturn; + vdso_image_32.sym___kernel_sigreturn; else restorer = &frame->retcode; } @@ -460,7 +368,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, restorer = ksig->ka.sa.sa_restorer; else restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_rt_sigreturn; + vdso_image_32.sym___kernel_rt_sigreturn; put_user_ex(ptr_to_compat(restorer), &frame->pretcode); /* diff --git a/kernel/arch/x86/ia32/ia32entry.S b/kernel/arch/x86/ia32/ia32entry.S deleted file mode 100644 index 72bf2680f..000000000 --- a/kernel/arch/x86/ia32/ia32entry.S +++ /dev/null @@ -1,611 +0,0 @@ -/* - * Compatibility mode system call entry point for x86-64. - * - * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ - -#include <asm/dwarf2.h> -#include <asm/calling.h> -#include <asm/asm-offsets.h> -#include <asm/current.h> -#include <asm/errno.h> -#include <asm/ia32_unistd.h> -#include <asm/thread_info.h> -#include <asm/segment.h> -#include <asm/irqflags.h> -#include <asm/asm.h> -#include <asm/smap.h> -#include <linux/linkage.h> -#include <linux/err.h> - -/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ -#include <linux/elf-em.h> -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -#define sysexit_audit ia32_ret_from_sys_call -#define sysretl_audit ia32_ret_from_sys_call -#endif - - .section .entry.text, "ax" - - /* clobbers %rax */ - .macro CLEAR_RREGS _r9=rax - xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %\_r9,R9(%rsp) - movq %rax,R8(%rsp) - .endm - - /* - * Reload arg registers from stack in case ptrace changed them. - * We don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. Instead, we just truncate that - * value to 32 bits again as we did on entry from user mode. - * If it's a new value set by user_regset during entry tracing, - * this matches the normal truncation of the user-mode value. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. 
- */ - .macro LOAD_ARGS32 _r9=0 - .if \_r9 - movl R9(%rsp),%r9d - .endif - movl RCX(%rsp),%ecx - movl RDX(%rsp),%edx - movl RSI(%rsp),%esi - movl RDI(%rsp),%edi - movl %eax,%eax /* zero extension */ - .endm - - .macro CFI_STARTPROC32 simple - CFI_STARTPROC \simple - CFI_UNDEFINED r8 - CFI_UNDEFINED r9 - CFI_UNDEFINED r10 - CFI_UNDEFINED r11 - CFI_UNDEFINED r12 - CFI_UNDEFINED r13 - CFI_UNDEFINED r14 - CFI_UNDEFINED r15 - .endm - -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret32) - swapgs - sysretl -ENDPROC(native_usergs_sysret32) - -ENTRY(native_irq_enable_sysexit) - swapgs - sti - sysexit -ENDPROC(native_irq_enable_sysexit) -#endif - -/* - * 32bit SYSENTER instruction entry. - * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). - * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp user stack - * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_sysenter_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rsp,rbp - - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %ebp, %ebp - movl %eax, %eax - - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - CFI_REGISTER rip,r10 - - /* Construct struct pt_regs on stack */ - pushq_cfi $__USER32_DS /* pt_regs->ss */ - pushq_cfi %rbp /* pt_regs->sp */ - CFI_REL_OFFSET rsp,0 - pushfq_cfi /* pt_regs->flags */ - pushq_cfi $__USER32_CS /* pt_regs->cs */ - pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi_reg rax /* pt_regs->ax */ - cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 - - /* - * no need to do an access_ok check here because rbp has been - * 32bit zero extended - */ - ASM_STAC -1: movl (%rbp),%ebp - _ASM_EXTABLE(1b,ia32_badarg) - ASM_CLAC - - /* - * Sysenter doesn't filter flags, so we need to clear NT - * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. 
- */ - testl $X86_EFLAGS_NT,EFLAGS(%rsp) - jnz sysenter_fix_flags -sysenter_flags_fixed: - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - CFI_REMEMBER_STATE - jnz sysenter_tracesys - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys -sysenter_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ -sysenter_dispatch: - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysexit_audit -sysexit_from_sys_call: - /* - * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an - * NMI between STI and SYSEXIT has poorly specified behavior, - * and and NMI followed by an IRQ with usergs is fatal. So - * we just pretend we're using SYSEXIT but we really use - * SYSRETL instead. - * - * This code path is still called 'sysexit' because it pairs - * with 'sysenter' and it uses the SYSENTER calling convention. - */ - andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp),%ecx /* User %eip */ - CFI_REGISTER rip,rcx - RESTORE_RSI_RDI - xorl %edx,%edx /* avoid info leaks */ - xorq %r8,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - movl EFLAGS(%rsp),%r11d /* User eflags */ - /*CFI_RESTORE rflags*/ - TRACE_IRQS_ON - - /* - * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, - * since it avoids a dicey window with interrupts enabled. - */ - movl RSP(%rsp),%esp - - /* - * USERGS_SYSRET32 does: - * gsbase = user's gs base - * eip = ecx - * rflags = r11 - * cs = __USER32_CS - * ss = __USER_DS - * - * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: - * - * pop %ebp - * pop %edx - * pop %ecx - * - * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to - * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's - * address (already known to user code), and R12-R15 are - * callee-saved and therefore don't contain any interesting - * kernel data. - */ - USERGS_SYSRET32 - - CFI_RESTORE_STATE - -#ifdef CONFIG_AUDITSYSCALL - .macro auditsys_entry_common - movl %esi,%r8d /* 5th arg: 4th syscall arg */ - movl %ecx,%r9d /*swap with edx*/ - movl %edx,%ecx /* 4th arg: 3rd syscall arg */ - movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ - movl %ebx,%esi /* 2nd arg: 1st syscall arg */ - movl %eax,%edi /* 1st arg: syscall number */ - call __audit_syscall_entry - movl RAX(%rsp),%eax /* reload syscall number */ - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys - movl %ebx,%edi /* reload 1st syscall arg */ - movl RCX(%rsp),%esi /* reload 2nd syscall arg */ - movl RDX(%rsp),%edx /* reload 3rd syscall arg */ - movl RSI(%rsp),%ecx /* reload 4th syscall arg */ - movl RDI(%rsp),%r8d /* reload 5th syscall arg */ - .endm - - .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax,%esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? 
*/ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX(%rsp),%rax /* reload syscall return value */ - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - CLEAR_RREGS - jmp int_with_check - .endm - -sysenter_auditsys: - auditsys_entry_common - movl %ebp,%r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch - -sysexit_audit: - auditsys_exit sysexit_from_sys_call -#endif - -sysenter_fix_flags: - pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) - popfq_cfi - jmp sysenter_flags_fixed - -sysenter_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz sysenter_auditsys -#endif - SAVE_EXTRA_REGS - CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ - jmp sysenter_do_call - CFI_ENDPROC -ENDPROC(ia32_sysenter_target) - -/* - * 32bit SYSCALL instruction entry. - * - * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. - * - * Note: rflags saving+masking-with-MSR happens only in Long mode - * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit - * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes - * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). - * - * Arguments: - * eax system call number - * ecx return address - * ebx arg1 - * ebp arg2 (note: not saved in the stack frame, should not be touched) - * edx arg3 - * esi arg4 - * edi arg5 - * esp user stack - * 0(%esp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_cstar_target) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,0 - CFI_REGISTER rip,rcx - /*CFI_REGISTER rflags,r11*/ - - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. 
- */ - SWAPGS_UNSAFE_STACK - movl %esp,%r8d - CFI_REGISTER rsp,r8 - movq PER_CPU_VAR(kernel_stack),%rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax - - /* Construct struct pt_regs on stack */ - pushq_cfi $__USER32_DS /* pt_regs->ss */ - pushq_cfi %r8 /* pt_regs->sp */ - CFI_REL_OFFSET rsp,0 - pushq_cfi %r11 /* pt_regs->flags */ - pushq_cfi $__USER32_CS /* pt_regs->cs */ - pushq_cfi %rcx /* pt_regs->ip */ - CFI_REL_OFFSET rip,0 - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rbp /* pt_regs->cx */ - movl %ebp,%ecx - pushq_cfi_reg rax /* pt_regs->ax */ - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 - - /* - * no need to do an access_ok check here because r8 has been - * 32bit zero extended - */ - ASM_STAC -1: movl (%r8),%r9d - _ASM_EXTABLE(1b,ia32_badarg) - ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - CFI_REMEMBER_STATE - jnz cstar_tracesys - cmpq $IA32_NR_syscalls-1,%rax - ja ia32_badsys -cstar_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - /* r9 already loaded */ /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ -cstar_dispatch: - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit -sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - RESTORE_RSI_RDI_RDX - movl RIP(%rsp),%ecx - CFI_REGISTER rip,rcx - movl EFLAGS(%rsp),%r11d - /*CFI_REGISTER rflags,r11*/ - xorq %r10,%r10 - xorq %r9,%r9 - xorq %r8,%r8 - TRACE_IRQS_ON - movl RSP(%rsp),%esp - CFI_RESTORE rsp - /* - * 64bit->32bit SYSRET restores eip from ecx, - * eflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * (Note: 32bit->32bit SYSRET is different: since r11 - * does not exist, it merely sets eflags.IF=1). - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we must - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. 
- */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL -cstar_auditsys: - CFI_RESTORE_STATE - movl %r9d,R9(%rsp) /* register to be clobbered by call */ - auditsys_entry_common - movl R9(%rsp),%r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call -#endif - -cstar_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys -#endif - xchgl %r9d,%ebp - SAVE_EXTRA_REGS - CLEAR_RREGS r9 - movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - xchgl %ebp,%r9d - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ - jmp cstar_do_call -END(ia32_cstar_target) - -ia32_badarg: - ASM_CLAC - movq $-EFAULT,%rax - jmp ia32_sysret - CFI_ENDPROC - -/* - * Emulated IA32 system calls via int 0x80. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ - -ENTRY(ia32_syscall) - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,5*8 - /*CFI_REL_OFFSET ss,4*8 */ - CFI_REL_OFFSET rsp,3*8 - /*CFI_REL_OFFSET rflags,2*8 */ - /*CFI_REL_OFFSET cs,1*8 */ - CFI_REL_OFFSET rip,0*8 - - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. 
- */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - SWAPGS - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax - - /* Construct struct pt_regs on stack (iret frame is already on stack) */ - pushq_cfi_reg rax /* pt_regs->orig_ax */ - pushq_cfi_reg rdi /* pt_regs->di */ - pushq_cfi_reg rsi /* pt_regs->si */ - pushq_cfi_reg rdx /* pt_regs->dx */ - pushq_cfi_reg rcx /* pt_regs->cx */ - pushq_cfi_reg rax /* pt_regs->ax */ - cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - CFI_ADJUST_CFA_OFFSET 10*8 - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys - cmpq $(IA32_NR_syscalls-1),%rax - ja ia32_badsys -ia32_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ - call *ia32_sys_call_table(,%rax,8) # xxx: rip relative -ia32_sysret: - movq %rax,RAX(%rsp) -ia32_ret_from_sys_call: - CLEAR_RREGS - jmp int_ret_from_sys_call - -ia32_tracesys: - SAVE_EXTRA_REGS - CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - cmpq $(IA32_NR_syscalls-1),%rax - ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ - jmp ia32_do_call -END(ia32_syscall) - -ia32_badsys: - movq $0,ORIG_RAX(%rsp) - movq $-ENOSYS,%rax - jmp ia32_sysret - - CFI_ENDPROC - - .macro PTREGSCALL label, func - ALIGN -GLOBAL(\label) - leaq \func(%rip),%rax - jmp ia32_ptregs_common - .endm - - CFI_STARTPROC32 - - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork - - ALIGN -GLOBAL(stub32_clone) - leaq sys_clone(%rip),%rax - mov %r8, %rcx - jmp ia32_ptregs_common - - ALIGN -ia32_ptregs_common: - CFI_ENDPROC - CFI_STARTPROC32 simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SIZEOF_PTREGS - CFI_REL_OFFSET rax,RAX - CFI_REL_OFFSET rcx,RCX - CFI_REL_OFFSET rdx,RDX - CFI_REL_OFFSET rsi,RSI - CFI_REL_OFFSET rdi,RDI - CFI_REL_OFFSET rip,RIP -/* CFI_REL_OFFSET cs,CS*/ -/* CFI_REL_OFFSET rflags,EFLAGS*/ - CFI_REL_OFFSET rsp,RSP -/* CFI_REL_OFFSET ss,SS*/ - SAVE_EXTRA_REGS 8 - call *%rax - RESTORE_EXTRA_REGS 8 - ret - CFI_ENDPROC -END(ia32_ptregs_common) diff --git a/kernel/arch/x86/include/asm/Kbuild b/kernel/arch/x86/include/asm/Kbuild index d55a210a4..aeac434c9 100644 --- a/kernel/arch/x86/include/asm/Kbuild +++ b/kernel/arch/x86/include/asm/Kbuild @@ -9,4 +9,4 @@ generic-y += cputime.h generic-y += dma-contiguous.h generic-y += early_ioremap.h generic-y += mcs_spinlock.h -generic-y += scatterlist.h +generic-y += mm-arch-hooks.h diff --git a/kernel/arch/x86/include/asm/acpi.h b/kernel/arch/x86/include/asm/acpi.h index 3a45668f6..94c18ebfd 100644 --- a/kernel/arch/x86/include/asm/acpi.h +++ b/kernel/arch/x86/include/asm/acpi.h @@ -32,6 +32,10 @@ #include <asm/mpspec.h> #include <asm/realmode.h> +#ifdef CONFIG_ACPI_APEI +# include <asm/pgtable_types.h> +#endif + #ifdef CONFIG_ACPI extern int acpi_lapic; extern int acpi_ioapic; @@ -147,4 +151,23 @@ extern int x86_acpi_numa_init(void); #define acpi_unlazy_tlb(x) leave_mm(x) +#ifdef CONFIG_ACPI_APEI +static inline pgprot_t 
arch_apei_get_mem_attribute(phys_addr_t addr) +{ + /* + * We currently have no way to look up the EFI memory map + * attributes for a region in a consistent way, because the + * memmap is discarded after efi_free_boot_services(). So if + * you call efi_mem_attributes() during boot and at runtime, + * you could theoretically see different attributes. + * + * Since we are yet to see any x86 platforms that require + * anything other than PAGE_KERNEL (some arm64 platforms + * require the equivalent of PAGE_KERNEL_NOCACHE), return that + * until we know differently. + */ + return PAGE_KERNEL; +} +#endif + #endif /* _ASM_X86_ACPI_H */ diff --git a/kernel/arch/x86/include/asm/alternative-asm.h b/kernel/arch/x86/include/asm/alternative-asm.h index bdf02eeee..e7636bac7 100644 --- a/kernel/arch/x86/include/asm/alternative-asm.h +++ b/kernel/arch/x86/include/asm/alternative-asm.h @@ -18,6 +18,12 @@ .endm #endif +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ .macro altinstruction_entry orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . @@ -27,6 +33,12 @@ .byte \pad_len .endm +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. ".skip" directive takes care of proper instruction padding + * in case @newinstr is longer than @oldinstr. + */ .macro ALTERNATIVE oldinstr, newinstr, feature 140: \oldinstr @@ -55,6 +67,12 @@ */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + +/* + * Same as ALTERNATIVE macro above but for two alternatives. If CPU + * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has + * @feature2, it replaces @oldinstr with @feature2. + */ .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 140: \oldinstr diff --git a/kernel/arch/x86/include/asm/alternative.h b/kernel/arch/x86/include/asm/alternative.h index ba32af062..7bfc85bbb 100644 --- a/kernel/arch/x86/include/asm/alternative.h +++ b/kernel/arch/x86/include/asm/alternative.h @@ -52,6 +52,12 @@ struct alt_instr { u8 padlen; /* length of build-time padding */ } __packed; +/* + * Debug flag that can be tested to see whether alternative + * instructions were patched in already: + */ +extern int alternatives_patched; + extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); diff --git a/kernel/arch/x86/include/asm/amd_nb.h b/kernel/arch/x86/include/asm/amd_nb.h index aaac3b2fb..3c56ef1ae 100644 --- a/kernel/arch/x86/include/asm/amd_nb.h +++ b/kernel/arch/x86/include/asm/amd_nb.h @@ -81,7 +81,7 @@ static inline struct amd_northbridge *node_to_amd_nb(int node) return (node < amd_northbridges.num) ? 
&amd_northbridges.nb[node] : NULL; } -static inline u16 amd_get_node_id(struct pci_dev *pdev) +static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev) { struct pci_dev *misc; int i; @@ -98,11 +98,22 @@ static inline u16 amd_get_node_id(struct pci_dev *pdev) return 0; } +static inline bool amd_gart_present(void) +{ + /* GART present only on Fam15h, upto model 0fh */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) + return true; + + return false; +} + #else #define amd_nb_num(x) 0 #define amd_nb_has_feature(x) false #define node_to_amd_nb(x) NULL +#define amd_gart_present(x) false #endif diff --git a/kernel/arch/x86/include/asm/apic.h b/kernel/arch/x86/include/asm/apic.h index 976b86a32..a30316bf8 100644 --- a/kernel/arch/x86/include/asm/apic.h +++ b/kernel/arch/x86/include/asm/apic.h @@ -115,6 +115,59 @@ static inline bool apic_is_x2apic_enabled(void) return msr & X2APIC_ENABLE; } +extern void enable_IR_x2apic(void); + +extern int get_physical_broadcast(void); + +extern int lapic_get_maxlvt(void); +extern void clear_local_APIC(void); +extern void disconnect_bsp_APIC(int virt_wire_setup); +extern void disable_local_APIC(void); +extern void lapic_shutdown(void); +extern void sync_Arb_IDs(void); +extern void init_bsp_APIC(void); +extern void setup_local_APIC(void); +extern void init_apic_mappings(void); +void register_lapic_address(unsigned long address); +extern void setup_boot_APIC_clock(void); +extern void setup_secondary_APIC_clock(void); +extern int APIC_init_uniprocessor(void); + +#ifdef CONFIG_X86_64 +static inline int apic_force_enable(unsigned long addr) +{ + return -1; +} +#else +extern int apic_force_enable(unsigned long addr); +#endif + +extern int apic_bsp_setup(bool upmode); +extern void apic_ap_setup(void); + +/* + * On 32bit this is mach-xxx local + */ +#ifdef CONFIG_X86_64 +extern int apic_is_clustered_box(void); +#else +static inline int apic_is_clustered_box(void) +{ + return 0; +} +#endif + +extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); + +#else /* !CONFIG_X86_LOCAL_APIC */ +static inline void lapic_shutdown(void) { } +#define local_apic_timer_c2_ok 1 +static inline void init_apic_mappings(void) { } +static inline void disable_local_APIC(void) { } +# define setup_boot_APIC_clock x86_init_noop +# define setup_secondary_APIC_clock x86_init_noop +#endif /* !CONFIG_X86_LOCAL_APIC */ + #ifdef CONFIG_X86_X2APIC /* * Make previous memory operations globally visible before @@ -186,67 +239,14 @@ static inline int x2apic_enabled(void) } #define x2apic_supported() (cpu_has_x2apic) -#else +#else /* !CONFIG_X86_X2APIC */ static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } #define x2apic_mode (0) #define x2apic_supported() (0) -#endif - -extern void enable_IR_x2apic(void); - -extern int get_physical_broadcast(void); - -extern int lapic_get_maxlvt(void); -extern void clear_local_APIC(void); -extern void disconnect_bsp_APIC(int virt_wire_setup); -extern void disable_local_APIC(void); -extern void lapic_shutdown(void); -extern void sync_Arb_IDs(void); -extern void init_bsp_APIC(void); -extern void setup_local_APIC(void); -extern void init_apic_mappings(void); -void register_lapic_address(unsigned long address); -extern void setup_boot_APIC_clock(void); -extern void setup_secondary_APIC_clock(void); -extern int APIC_init_uniprocessor(void); - -#ifdef CONFIG_X86_64 -static inline int 
apic_force_enable(unsigned long addr) -{ - return -1; -} -#else -extern int apic_force_enable(unsigned long addr); -#endif - -extern int apic_bsp_setup(bool upmode); -extern void apic_ap_setup(void); - -/* - * On 32bit this is mach-xxx local - */ -#ifdef CONFIG_X86_64 -extern int apic_is_clustered_box(void); -#else -static inline int apic_is_clustered_box(void) -{ - return 0; -} -#endif - -extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); - -#else /* !CONFIG_X86_LOCAL_APIC */ -static inline void lapic_shutdown(void) { } -#define local_apic_timer_c2_ok 1 -static inline void init_apic_mappings(void) { } -static inline void disable_local_APIC(void) { } -# define setup_boot_APIC_clock x86_init_noop -# define setup_secondary_APIC_clock x86_init_noop -#endif /* !CONFIG_X86_LOCAL_APIC */ +#endif /* !CONFIG_X86_X2APIC */ #ifdef CONFIG_X86_64 #define SET_APIC_ID(x) (apic->set_apic_id(x)) @@ -313,7 +313,6 @@ struct apic { /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); - bool wait_for_init_deassert; void (*inquire_remote_apic)(int apicid); /* apic ops */ @@ -378,7 +377,6 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[]; * APIC functionality to boot other CPUs - only used on SMP: */ #ifdef CONFIG_SMP -extern atomic_t init_deasserted; extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); #endif @@ -644,6 +642,12 @@ static inline void entering_ack_irq(void) entering_irq(); } +static inline void ipi_entering_ack_irq(void) +{ + ack_APIC_irq(); + irq_enter(); +} + static inline void exiting_irq(void) { irq_exit(); diff --git a/kernel/arch/x86/include/asm/arch_hweight.h b/kernel/arch/x86/include/asm/arch_hweight.h index 9686c3d9f..259a7c1ef 100644 --- a/kernel/arch/x86/include/asm/arch_hweight.h +++ b/kernel/arch/x86/include/asm/arch_hweight.h @@ -21,7 +21,7 @@ * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective * compiler switches. 
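In the hunk below, the ALTERNATIVE() patches the call to the __sw_hweight64() fallback into a single POPCNT instruction on CPUs that advertise X86_FEATURE_POPCNT. As a rough sketch (illustrative only, not kernel code; the real fallbacks are the kernel's __sw_hweight*() helpers), the bit count being computed is:

        #include <linux/types.h>

        /* Count set bits one at a time; POPCNT does the same in one instruction. */
        static unsigned int hweight32_sketch(u32 w)
        {
                unsigned int res = 0;

                while (w) {
                        w &= w - 1;     /* clear the lowest set bit */
                        res++;
                }
                return res;
        }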
*/ -static inline unsigned int __arch_hweight32(unsigned int w) +static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res = 0; @@ -42,20 +42,23 @@ static inline unsigned int __arch_hweight8(unsigned int w) return __arch_hweight32(w & 0xff); } +#ifdef CONFIG_X86_32 static inline unsigned long __arch_hweight64(__u64 w) { - unsigned long res = 0; - -#ifdef CONFIG_X86_32 return __arch_hweight32((u32)w) + __arch_hweight32((u32)(w >> 32)); +} #else +static __always_inline unsigned long __arch_hweight64(__u64 w) +{ + unsigned long res = 0; + asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT) : "="REG_OUT (res) : REG_IN (w)); -#endif /* CONFIG_X86_32 */ return res; } +#endif /* CONFIG_X86_32 */ #endif diff --git a/kernel/arch/x86/include/asm/asm.h b/kernel/arch/x86/include/asm/asm.h index 7730c1c5c..189679aba 100644 --- a/kernel/arch/x86/include/asm/asm.h +++ b/kernel/arch/x86/include/asm/asm.h @@ -63,6 +63,31 @@ _ASM_ALIGN ; \ _ASM_PTR (entry); \ .popsection + +.macro ALIGN_DESTINATION + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE(100b,103b) + _ASM_EXTABLE(101b,103b) + .endm + #else # define _ASM_EXTABLE(from,to) \ " .pushsection \"__ex_table\",\"a\"\n" \ diff --git a/kernel/arch/x86/include/asm/atomic.h b/kernel/arch/x86/include/asm/atomic.h index 5e5cd123f..ae5fb83e6 100644 --- a/kernel/arch/x86/include/asm/atomic.h +++ b/kernel/arch/x86/include/asm/atomic.h @@ -22,9 +22,9 @@ * * Atomically reads the value of @v. */ -static inline int atomic_read(const atomic_t *v) +static __always_inline int atomic_read(const atomic_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** @@ -34,9 +34,9 @@ static inline int atomic_read(const atomic_t *v) * * Atomically sets the value of @v to @i. */ -static inline void atomic_set(atomic_t *v, int i) +static __always_inline void atomic_set(atomic_t *v, int i) { - v->counter = i; + WRITE_ONCE(v->counter, i); } /** @@ -46,7 +46,7 @@ static inline void atomic_set(atomic_t *v, int i) * * Atomically adds @i to @v. */ -static inline void atomic_add(int i, atomic_t *v) +static __always_inline void atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) @@ -60,7 +60,7 @@ static inline void atomic_add(int i, atomic_t *v) * * Atomically subtracts @i from @v. */ -static inline void atomic_sub(int i, atomic_t *v) +static __always_inline void atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) @@ -76,7 +76,7 @@ static inline void atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ -static inline int atomic_sub_and_test(int i, atomic_t *v) +static __always_inline int atomic_sub_and_test(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e"); } @@ -87,7 +87,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v) * * Atomically increments @v by 1. */ -static inline void atomic_inc(atomic_t *v) +static __always_inline void atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter)); @@ -99,7 +99,7 @@ static inline void atomic_inc(atomic_t *v) * * Atomically decrements @v by 1. 
*/ -static inline void atomic_dec(atomic_t *v) +static __always_inline void atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter)); @@ -113,7 +113,7 @@ static inline void atomic_dec(atomic_t *v) * returns true if the result is 0, or false for all other * cases. */ -static inline int atomic_dec_and_test(atomic_t *v) +static __always_inline int atomic_dec_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); } @@ -126,7 +126,7 @@ static inline int atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static inline int atomic_inc_and_test(atomic_t *v) +static __always_inline int atomic_inc_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); } @@ -140,7 +140,7 @@ static inline int atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int atomic_add_negative(int i, atomic_t *v) +static __always_inline int atomic_add_negative(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s"); } @@ -152,7 +152,7 @@ static inline int atomic_add_negative(int i, atomic_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static inline int atomic_add_return(int i, atomic_t *v) +static __always_inline int atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } @@ -164,7 +164,7 @@ static inline int atomic_add_return(int i, atomic_t *v) * * Atomically subtracts @i from @v and returns @v - @i */ -static inline int atomic_sub_return(int i, atomic_t *v) +static __always_inline int atomic_sub_return(int i, atomic_t *v) { return atomic_add_return(-i, v); } @@ -172,7 +172,7 @@ static inline int atomic_sub_return(int i, atomic_t *v) #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) -static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { return cmpxchg(&v->counter, old, new); } @@ -182,6 +182,21 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } +#define ATOMIC_OP(op) \ +static inline void atomic_##op(int i, atomic_t *v) \ +{ \ + asm volatile(LOCK_PREFIX #op"l %1,%0" \ + : "+m" (v->counter) \ + : "ir" (i) \ + : "memory"); \ +} + +ATOMIC_OP(and) +ATOMIC_OP(or) +ATOMIC_OP(xor) + +#undef ATOMIC_OP + /** * __atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t @@ -191,7 +206,7 @@ static inline int atomic_xchg(atomic_t *v, int new) * Atomically adds @a to @v, so long as @v was not already @u. * Returns the old value of @v. 
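A typical consumer of this primitive is the generic atomic_add_unless()/atomic_inc_not_zero() pattern used for reference counting: take a reference only if the count has not already dropped to zero. A minimal usage sketch (the helper name is invented for illustration):

        #include <linux/types.h>
        #include <linux/atomic.h>

        /* Succeeds only while the refcount is still non-zero. */
        static bool tryget_sketch(atomic_t *refcount)
        {
                /*
                 * atomic_add_unless() returns non-zero iff the add happened,
                 * i.e. __atomic_add_unless() did not see the forbidden value 0.
                 */
                return atomic_add_unless(refcount, 1, 0);
        }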
*/ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); @@ -213,22 +228,12 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) * Atomically adds 1 to @v * Returns the new value of @u */ -static inline short int atomic_inc_short(short int *v) +static __always_inline short int atomic_inc_short(short int *v) { asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); return *v; } -/* These are x86-specific, used by some header files */ -#define atomic_clear_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "andl %0,%1" \ - : : "r" (~(mask)), "m" (*(addr)) : "memory") - -#define atomic_set_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "orl %0,%1" \ - : : "r" ((unsigned)(mask)), "m" (*(addr)) \ - : "memory") - #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else diff --git a/kernel/arch/x86/include/asm/atomic64_32.h b/kernel/arch/x86/include/asm/atomic64_32.h index b154de75c..a11c30b77 100644 --- a/kernel/arch/x86/include/asm/atomic64_32.h +++ b/kernel/arch/x86/include/asm/atomic64_32.h @@ -313,4 +313,18 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 +#define ATOMIC64_OP(op, c_op) \ +static inline void atomic64_##op(long long i, atomic64_t *v) \ +{ \ + long long old, c = 0; \ + while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ + c = old; \ +} + +ATOMIC64_OP(and, &) +ATOMIC64_OP(or, |) +ATOMIC64_OP(xor, ^) + +#undef ATOMIC64_OP + #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/kernel/arch/x86/include/asm/atomic64_64.h b/kernel/arch/x86/include/asm/atomic64_64.h index f8d273e18..037351022 100644 --- a/kernel/arch/x86/include/asm/atomic64_64.h +++ b/kernel/arch/x86/include/asm/atomic64_64.h @@ -18,7 +18,7 @@ */ static inline long atomic64_read(const atomic64_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** @@ -30,7 +30,7 @@ static inline long atomic64_read(const atomic64_t *v) */ static inline void atomic64_set(atomic64_t *v, long i) { - v->counter = i; + WRITE_ONCE(v->counter, i); } /** @@ -40,7 +40,7 @@ static inline void atomic64_set(atomic64_t *v, long i) * * Atomically adds @i to @v. */ -static inline void atomic64_add(long i, atomic64_t *v) +static __always_inline void atomic64_add(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) @@ -81,7 +81,7 @@ static inline int atomic64_sub_and_test(long i, atomic64_t *v) * * Atomically increments @v by 1. */ -static inline void atomic64_inc(atomic64_t *v) +static __always_inline void atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) @@ -94,7 +94,7 @@ static inline void atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. 
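The atomic_{and,or,xor}() operations generated above by ATOMIC_OP() are what callers of the removed atomic_set_mask()/atomic_clear_mask() helpers migrate to; a sketch of the mapping (flag values and function name invented for illustration):

        #include <linux/atomic.h>

        #define FLAG_PENDING    0x01
        #define FLAG_DONE       0x02

        static void flags_update_sketch(atomic_t *flags)
        {
                /* was: atomic_set_mask(FLAG_PENDING, ...) */
                atomic_or(FLAG_PENDING, flags);
                /* was: atomic_clear_mask(FLAG_DONE, ...) */
                atomic_and(~FLAG_DONE, flags);
        }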
*/ -static inline void atomic64_dec(atomic64_t *v) +static __always_inline void atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) @@ -148,7 +148,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static inline long atomic64_add_return(long i, atomic64_t *v) +static __always_inline long atomic64_add_return(long i, atomic64_t *v) { return i + xadd(&v->counter, i); } @@ -220,4 +220,19 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } +#define ATOMIC64_OP(op) \ +static inline void atomic64_##op(long i, atomic64_t *v) \ +{ \ + asm volatile(LOCK_PREFIX #op"q %1,%0" \ + : "+m" (v->counter) \ + : "er" (i) \ + : "memory"); \ +} + +ATOMIC64_OP(and) +ATOMIC64_OP(or) +ATOMIC64_OP(xor) + +#undef ATOMIC64_OP + #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/kernel/arch/x86/include/asm/barrier.h b/kernel/arch/x86/include/asm/barrier.h index 959e45b81..0681d2532 100644 --- a/kernel/arch/x86/include/asm/barrier.h +++ b/kernel/arch/x86/include/asm/barrier.h @@ -35,12 +35,12 @@ #define smp_mb() mb() #define smp_rmb() dma_rmb() #define smp_wmb() barrier() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #endif /* SMP */ #define read_barrier_depends() do { } while (0) @@ -57,12 +57,12 @@ do { \ compiletime_assert_atomic_type(*p); \ smp_mb(); \ - ACCESS_ONCE(*p) = (v); \ + WRITE_ONCE(*p, v); \ } while (0) #define smp_load_acquire(p) \ ({ \ - typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ smp_mb(); \ ___p1; \ @@ -74,12 +74,12 @@ do { \ do { \ compiletime_assert_atomic_type(*p); \ barrier(); \ - ACCESS_ONCE(*p) = (v); \ + WRITE_ONCE(*p, v); \ } while (0) #define smp_load_acquire(p) \ ({ \ - typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ barrier(); \ ___p1; \ @@ -91,15 +91,4 @@ do { \ #define smp_mb__before_atomic() barrier() #define smp_mb__after_atomic() barrier() -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - */ -static __always_inline void rdtsc_barrier(void) -{ - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, - "lfence", X86_FEATURE_LFENCE_RDTSC); -} - #endif /* _ASM_X86_BARRIER_H */ diff --git a/kernel/arch/x86/include/asm/boot.h b/kernel/arch/x86/include/asm/boot.h index 4fa687a47..6b8d6e8cd 100644 --- a/kernel/arch/x86/include/asm/boot.h +++ b/kernel/arch/x86/include/asm/boot.h @@ -27,7 +27,7 @@ #define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ -#define BOOT_HEAP_SIZE 0x8000 +#define BOOT_HEAP_SIZE 0x10000 #endif /* !CONFIG_KERNEL_BZIP2 */ diff --git a/kernel/arch/x86/include/asm/cacheflush.h b/kernel/arch/x86/include/asm/cacheflush.h index 47c8e32f6..e63aa38e8 100644 --- a/kernel/arch/x86/include/asm/cacheflush.h +++ b/kernel/arch/x86/include/asm/cacheflush.h @@ -4,11 +4,12 @@ /* Caches aren't brain-dead on the intel. 
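The smp_store_release()/smp_load_acquire() pair reworked above (now built on WRITE_ONCE()/READ_ONCE() instead of ACCESS_ONCE()) is used in the usual publish/consume pattern; a minimal sketch, with the flag and payload invented for illustration:

        #include <asm/barrier.h>

        static int payload;
        static int ready;

        /* Publisher: the payload store is ordered before the flag store. */
        static void publish_sketch(int val)
        {
                payload = val;
                smp_store_release(&ready, 1);
        }

        /* Consumer: seeing ready != 0 guarantees the payload store is visible. */
        static int consume_sketch(void)
        {
                if (smp_load_acquire(&ready))
                        return payload;
                return -1;      /* nothing published yet */
        }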
*/ #include <asm-generic/cacheflush.h> #include <asm/special_insns.h> +#include <asm/uaccess.h> /* * The set_memory_* API can be used to change various attributes of a virtual * address range. The attributes include: - * Cachability : UnCached, WriteCombining, WriteBack + * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack * Executability : eXeutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent @@ -35,9 +36,11 @@ int _set_memory_uc(unsigned long addr, int numpages); int _set_memory_wc(unsigned long addr, int numpages); +int _set_memory_wt(unsigned long addr, int numpages); int _set_memory_wb(unsigned long addr, int numpages); int set_memory_uc(unsigned long addr, int numpages); int set_memory_wc(unsigned long addr, int numpages); +int set_memory_wt(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); @@ -48,10 +51,12 @@ int set_memory_4k(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); +int set_memory_array_wt(unsigned long *addr, int addrinarray); int set_memory_array_wb(unsigned long *addr, int addrinarray); int set_pages_array_uc(struct page **pages, int addrinarray); int set_pages_array_wc(struct page **pages, int addrinarray); +int set_pages_array_wt(struct page **pages, int addrinarray); int set_pages_array_wb(struct page **pages, int addrinarray); /* @@ -84,6 +89,8 @@ int set_pages_rw(struct page *page, int numpages); void clflush_cache_range(void *addr, unsigned int size); +#define mmio_flush_range(addr, size) clflush_cache_range(addr, size) + #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; diff --git a/kernel/arch/x86/include/asm/cmpxchg.h b/kernel/arch/x86/include/asm/cmpxchg.h index 99c105d78..ad19841ed 100644 --- a/kernel/arch/x86/include/asm/cmpxchg.h +++ b/kernel/arch/x86/include/asm/cmpxchg.h @@ -4,8 +4,6 @@ #include <linux/compiler.h> #include <asm/alternative.h> /* Provides LOCK_PREFIX */ -#define __HAVE_ARCH_CMPXCHG 1 - /* * Non-existant functions to indicate usage errors at link time * (or compile-time if the compiler implements __compiletime_error(). 
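The set_memory_*() family extended above with the write-through (_wt) variants changes the caching attributes of an existing kernel mapping in place; a hedged usage sketch for the pre-existing WC/WB pair (function name invented, error handling kept minimal):

        #include <linux/gfp.h>
        #include <linux/errno.h>
        #include <asm/cacheflush.h>

        static int wc_page_sketch(void)
        {
                unsigned long addr = __get_free_page(GFP_KERNEL);
                int ret;

                if (!addr)
                        return -ENOMEM;

                ret = set_memory_wc(addr, 1);   /* request write-combining for one page */
                if (!ret) {
                        /* ... streaming writes through (void *)addr ... */
                        set_memory_wb(addr, 1); /* restore write-back before freeing */
                }
                free_page(addr);
                return ret;
        }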
diff --git a/kernel/arch/x86/include/asm/context_tracking.h b/kernel/arch/x86/include/asm/context_tracking.h deleted file mode 100644 index 1fe49704b..000000000 --- a/kernel/arch/x86/include/asm/context_tracking.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _ASM_X86_CONTEXT_TRACKING_H -#define _ASM_X86_CONTEXT_TRACKING_H - -#ifdef CONFIG_CONTEXT_TRACKING -# define SCHEDULE_USER call schedule_user -#else -# define SCHEDULE_USER call schedule -#endif - -#endif diff --git a/kernel/arch/x86/include/asm/cpufeature.h b/kernel/arch/x86/include/asm/cpufeature.h index 3d6606fb9..f7ba9fbf1 100644 --- a/kernel/arch/x86/include/asm/cpufeature.h +++ b/kernel/arch/x86/include/asm/cpufeature.h @@ -12,7 +12,7 @@ #include <asm/disabled-features.h> #endif -#define NCAPINTS 13 /* N 32-bit words worth of info */ +#define NCAPINTS 14 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -119,6 +119,7 @@ #define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ #define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ #define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ #define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ #define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ @@ -176,6 +177,7 @@ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -191,7 +193,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_HWP ( 7*32+ 10) /* "hwp" Intel HWP */ -#define X86_FEATURE_HWP_NOITFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ +#define X86_FEATURE_HWP_NOTIFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ #define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ @@ -214,6 +216,7 @@ #define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ @@ -239,6 +242,7 @@ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ #define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ @@ -252,6 +256,9 @@ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ + /* * BUG word(s) */ diff --git a/kernel/arch/x86/include/asm/crypto/glue_helper.h 
b/kernel/arch/x86/include/asm/crypto/glue_helper.h index 1eef55596..03bb1065c 100644 --- a/kernel/arch/x86/include/asm/crypto/glue_helper.h +++ b/kernel/arch/x86/include/asm/crypto/glue_helper.h @@ -7,7 +7,7 @@ #include <linux/kernel.h> #include <linux/crypto.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <crypto/b128ops.h> typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); diff --git a/kernel/arch/x86/include/asm/delay.h b/kernel/arch/x86/include/asm/delay.h index 9b3b4f275..36a760bda 100644 --- a/kernel/arch/x86/include/asm/delay.h +++ b/kernel/arch/x86/include/asm/delay.h @@ -4,5 +4,6 @@ #include <asm-generic/delay.h> void use_tsc_delay(void); +void use_mwaitx_delay(void); #endif /* _ASM_X86_DELAY_H */ diff --git a/kernel/arch/x86/include/asm/dma-mapping.h b/kernel/arch/x86/include/asm/dma-mapping.h index 808dae63e..953b7263f 100644 --- a/kernel/arch/x86/include/asm/dma-mapping.h +++ b/kernel/arch/x86/include/asm/dma-mapping.h @@ -12,7 +12,6 @@ #include <linux/dma-attrs.h> #include <asm/io.h> #include <asm/swiotlb.h> -#include <asm-generic/dma-coherent.h> #include <linux/dma-contiguous.h> #ifdef CONFIG_ISA @@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #endif } -#include <asm-generic/dma-mapping-common.h> - -/* Make sure we keep the same behaviour */ -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - debug_dma_mapping_error(dev, dma_addr); - if (ops->mapping_error) - return ops->mapping_error(dev, dma_addr); - - return (dma_addr == DMA_ERROR_CODE); -} - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) +bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); +#define arch_dma_alloc_attrs arch_dma_alloc_attrs +#define HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *hwdev, u64 mask); -extern int dma_set_mask(struct device *dev, u64 mask); + +#include <asm-generic/dma-mapping-common.h> extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, @@ -125,52 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp) return gfp; } -#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) - -static inline void * -dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - void *memory; - - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - - if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) - return memory; - - if (!dev) - dev = &x86_dma_fallback_dev; - - if (!is_device_dma_capable(dev)) - return NULL; - - if (!ops->alloc) - return NULL; - - memory = ops->alloc(dev, size, dma_handle, - dma_alloc_coherent_gfp_flags(dev, gfp), attrs); - debug_dma_alloc_coherent(dev, size, *dma_handle, memory); - - return memory; -} - -#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) - -static inline void dma_free_attrs(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - WARN_ON(irqs_disabled()); /* for portability */ - - if (dma_release_from_coherent(dev, get_order(size), vaddr)) - return; - - debug_dma_free_coherent(dev, size, vaddr, bus); - if (ops->free) - ops->free(dev, size, vaddr, bus, attrs); -} - #endif diff --git 
a/kernel/arch/x86/include/asm/dwarf2.h b/kernel/arch/x86/include/asm/dwarf2.h index de1cdaf4d..b7a1ab865 100644 --- a/kernel/arch/x86/include/asm/dwarf2.h +++ b/kernel/arch/x86/include/asm/dwarf2.h @@ -36,15 +36,22 @@ #endif #if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) +#ifndef BUILD_VDSO /* * Emit CFI data in .debug_frame sections, not .eh_frame sections. * The latter we currently just discard since we don't do DWARF * unwinding at runtime. So only the offline DWARF information is - * useful to anyone. Note we should not use this directive if this - * file is used in the vDSO assembly, or if vmlinux.lds.S gets - * changed so it doesn't discard .eh_frame. + * useful to anyone. Note we should not use this directive if + * vmlinux.lds.S gets changed so it doesn't discard .eh_frame. */ .cfi_sections .debug_frame +#else + /* + * For the vDSO, emit both runtime unwind information and debug + * symbols for the .dbg file. + */ + .cfi_sections .eh_frame, .debug_frame +#endif #endif #else @@ -74,97 +81,4 @@ #endif -/* - * An attempt to make CFI annotations more or less - * correct and shorter. It is implied that you know - * what you're doing if you use them. - */ -#ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_64 - .macro pushq_cfi reg - pushq \reg - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro pushq_cfi_reg reg - pushq %\reg - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popq_cfi reg - popq \reg - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro popq_cfi_reg reg - popq %\reg - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE \reg - .endm - - .macro pushfq_cfi - pushfq - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro popfq_cfi - popfq - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro movq_cfi reg offset=0 - movq %\reg, \offset(%rsp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movq_cfi_restore offset reg - movq \offset(%rsp), %\reg - CFI_RESTORE \reg - .endm -#else /*!CONFIG_X86_64*/ - .macro pushl_cfi reg - pushl \reg - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro pushl_cfi_reg reg - pushl %\reg - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popl_cfi reg - popl \reg - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro popl_cfi_reg reg - popl %\reg - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE \reg - .endm - - .macro pushfl_cfi - pushfl - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro popfl_cfi - popfl - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro movl_cfi reg offset=0 - movl %\reg, \offset(%esp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movl_cfi_restore offset reg - movl \offset(%esp), %\reg - CFI_RESTORE \reg - .endm -#endif /*!CONFIG_X86_64*/ -#endif /*__ASSEMBLY__*/ - #endif /* _ASM_X86_DWARF2_H */ diff --git a/kernel/arch/x86/include/asm/edac.h b/kernel/arch/x86/include/asm/edac.h index e9b57ecc7..cf8fdf83b 100644 --- a/kernel/arch/x86/include/asm/edac.h +++ b/kernel/arch/x86/include/asm/edac.h @@ -3,7 +3,7 @@ /* ECC atomic, DMA, SMP and interrupt safe scrub function */ -static inline void atomic_scrub(void *va, u32 size) +static inline void edac_atomic_scrub(void *va, u32 size) { u32 i, *virt_addr = va; diff --git a/kernel/arch/x86/include/asm/efi.h b/kernel/arch/x86/include/asm/efi.h index 3738b138b..0010c78c4 100644 --- a/kernel/arch/x86/include/asm/efi.h +++ b/kernel/arch/x86/include/asm/efi.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <asm/pgtable.h> /* @@ -86,6 +86,18 @@ extern u64 asmlinkage efi_call(void *fp, ...); extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned 
long size, u32 type, u64 attribute); +#ifdef CONFIG_KASAN +/* + * CONFIG_KASAN may redefine memset to __memset. __memset function is present + * only in kernel binary. Since the EFI stub linked into a separate binary it + * doesn't have __memset(). So we should use standard memset from + * arch/x86/boot/compressed/string.c. The same applies to memcpy and memmove. + */ +#undef memcpy +#undef memset +#undef memmove +#endif + #endif /* CONFIG_X86_32 */ extern struct efi_scratch efi_scratch; @@ -93,6 +105,7 @@ extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); extern int __init efi_memblock_x86_reserve_range(void); extern pgd_t * __init efi_call_phys_prolog(void); extern void __init efi_call_phys_epilog(pgd_t *save_pgd); +extern void __init efi_print_memmap(void); extern void __init efi_unmap_memmap(void); extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); diff --git a/kernel/arch/x86/include/asm/elf.h b/kernel/arch/x86/include/asm/elf.h index f161c189c..1514753fd 100644 --- a/kernel/arch/x86/include/asm/elf.h +++ b/kernel/arch/x86/include/asm/elf.h @@ -78,7 +78,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #ifdef CONFIG_X86_64 extern unsigned int vdso64_enabled; #endif -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) extern unsigned int vdso32_enabled; #endif @@ -171,11 +171,11 @@ do { \ static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { - /* Commented-out registers are cleared in stub_execve */ - /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0; - regs->si = regs->di /*= regs->bp*/ = 0; + /* ax gets execve's return value. */ + /*regs->ax = */ regs->bx = regs->cx = regs->dx = 0; + regs->si = regs->di = regs->bp = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; - /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/ + regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; t->fs = t->gs = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; @@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t, #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) -void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); -#define compat_start_thread start_thread_ia32 +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp); +#define compat_start_thread compat_start_thread void set_personality_ia32(bool); #define COMPAT_SET_PERSONALITY(ex) \ @@ -328,7 +328,7 @@ else \ #define VDSO_ENTRY \ ((unsigned long)current->mm->context.vdso + \ - selected_vdso32->sym___kernel_vsyscall) + vdso_image_32.sym___kernel_vsyscall) struct linux_binprm; @@ -344,14 +344,9 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, */ static inline int mmap_is_ia32(void) { -#ifdef CONFIG_X86_32 - return 1; -#endif -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_ADDR32)) - return 1; -#endif - return 0; + return config_enabled(CONFIG_X86_32) || + (config_enabled(CONFIG_COMPAT) && + test_thread_flag(TIF_ADDR32)); } /* Do not change the values. 
See get_align_mask() */ diff --git a/kernel/arch/x86/include/asm/entry_arch.h b/kernel/arch/x86/include/asm/entry_arch.h index dc5fa6614..df002992d 100644 --- a/kernel/arch/x86/include/asm/entry_arch.h +++ b/kernel/arch/x86/include/asm/entry_arch.h @@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_HAVE_KVM BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, smp_kvm_posted_intr_ipi) +BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, + smp_kvm_posted_intr_wakeup_ipi) #endif /* @@ -50,4 +52,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif +#ifdef CONFIG_X86_MCE_AMD +BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR) +#endif #endif diff --git a/kernel/arch/x86/include/asm/espfix.h b/kernel/arch/x86/include/asm/espfix.h index 99efebb2f..ca3ce9ab9 100644 --- a/kernel/arch/x86/include/asm/espfix.h +++ b/kernel/arch/x86/include/asm/espfix.h @@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); extern void init_espfix_bsp(void); -extern void init_espfix_ap(void); +extern void init_espfix_ap(int cpu); #endif /* CONFIG_X86_64 */ diff --git a/kernel/arch/x86/include/asm/fpu-internal.h b/kernel/arch/x86/include/asm/fpu-internal.h deleted file mode 100644 index da5e96756..000000000 --- a/kernel/arch/x86/include/asm/fpu-internal.h +++ /dev/null @@ -1,626 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _FPU_INTERNAL_H -#define _FPU_INTERNAL_H - -#include <linux/kernel_stat.h> -#include <linux/regset.h> -#include <linux/compat.h> -#include <linux/slab.h> -#include <asm/asm.h> -#include <asm/cpufeature.h> -#include <asm/processor.h> -#include <asm/sigcontext.h> -#include <asm/user.h> -#include <asm/uaccess.h> -#include <asm/xsave.h> -#include <asm/smap.h> - -#ifdef CONFIG_X86_64 -# include <asm/sigcontext32.h> -# include <asm/user32.h> -struct ksignal; -int ia32_setup_rt_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -#else -# define user_i387_ia32_struct user_i387_struct -# define user32_fxsr_struct user_fxsr_struct -# define ia32_setup_frame __setup_frame -# define ia32_setup_rt_frame __setup_rt_frame -#endif - -extern unsigned int mxcsr_feature_mask; -extern void fpu_init(void); -extern void eager_fpu_init(void); - -DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); - -extern void convert_from_fxsr(struct user_i387_ia32_struct *env, - struct task_struct *tsk); -extern void convert_to_fxsr(struct task_struct *tsk, - const struct user_i387_ia32_struct *env); - -extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; - -/* - * xstateregs_active == fpregs_active. Please refer to the comment - * at the definition of fpregs_active. 
- */ -#define xstateregs_active fpregs_active - -#ifdef CONFIG_MATH_EMULATION -extern void finit_soft_fpu(struct i387_soft_struct *soft); -#else -static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} -#endif - -/* - * Must be run with preemption disabled: this clears the fpu_owner_task, - * on this CPU. - * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. - */ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) -{ - per_cpu(fpu_owner_task, cpu) = NULL; -} - -/* - * Used to indicate that the FPU state in memory is newer than the FPU - * state in registers, and the FPU state should be reloaded next time the - * task is run. Only safe on the current task, or non-running tasks. - */ -static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk) -{ - tsk->thread.fpu.last_cpu = ~0; -} - -static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) -{ - return new == this_cpu_read_stable(fpu_owner_task) && - cpu == new->thread.fpu.last_cpu; -} - -static inline int is_ia32_compat_frame(void) -{ - return config_enabled(CONFIG_IA32_EMULATION) && - test_thread_flag(TIF_IA32); -} - -static inline int is_ia32_frame(void) -{ - return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); -} - -static inline int is_x32_frame(void) -{ - return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); -} - -#define X87_FSW_ES (1 << 7) /* Exception Summary */ - -static __always_inline __pure bool use_eager_fpu(void) -{ - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); -} - -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has_safe(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has_safe(X86_FEATURE_FXSR); -} - -static inline void fx_finit(struct i387_fxsave_struct *fx) -{ - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; -} - -extern void __sanitize_i387_state(struct task_struct *); - -static inline void sanitize_i387_state(struct task_struct *tsk) -{ - if (!use_xsaveopt()) - return; - __sanitize_i387_state(tsk); -} - -#define user_insn(insn, output, input...) \ -({ \ - int err; \ - asm volatile(ASM_STAC "\n" \ - "1:" #insn "\n\t" \ - "2: " ASM_CLAC "\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define check_insn(insn, output, input...) \ -({ \ - int err; \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -static inline int fsave_user(struct i387_fsave_struct __user *fx) -{ - return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); -} - -static inline int fxsave_user(struct i387_fxsave_struct __user *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - - /* See comment in fpu_fxsave() below. 
*/ - return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); -} - -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - - /* See comment in fpu_fxsave() below. */ - return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), - "m" (*fx)); -} - -static inline int fxrstor_user(struct i387_fxsave_struct __user *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - - /* See comment in fpu_fxsave() below. */ - return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), - "m" (*fx)); -} - -static inline int frstor_checking(struct i387_fsave_struct *fx) -{ - return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_user(struct i387_fsave_struct __user *fx) -{ - return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void fpu_fxsave(struct fpu *fpu) -{ - if (config_enabled(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave)); - else { - /* Using "rex64; fxsave %0" is broken because, if the memory - * operand uses any extended registers for addressing, a second - * REX prefix will be generated (to the assembler, rex64 - * followed by semicolon is a separate instruction), and hence - * the 64-bitness is lost. - * - * Using "fxsaveq %0" would be the ideal choice, but is only - * supported starting with gas 2.16. - * - * Using, as a workaround, the properly prefixed form below - * isn't accepted by any binutils version so far released, - * complaining that the same type of prefix is used twice if - * an extended register is needed for addressing (fix submitted - * to mainline 2005-11-21). - * - * asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave)); - * - * This, however, we can work around by forcing the compiler to - * select an addressing mode that doesn't require extended - * registers. - */ - asm volatile( "rex64/fxsave (%[fx])" - : "=m" (fpu->state->fxsave) - : [fx] "R" (&fpu->state->fxsave)); - } -} - -/* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact. - */ -static inline int fpu_save_init(struct fpu *fpu) -{ - if (use_xsave()) { - fpu_xsave(fpu); - - /* - * xsave header may indicate the init state of the FP. - */ - if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) - return 1; - } else if (use_fxsr()) { - fpu_fxsave(fpu); - } else { - asm volatile("fnsave %[fx]; fwait" - : [fx] "=m" (fpu->state->fsave)); - return 0; - } - - /* - * If exceptions are pending, we need to clear them so - * that we don't randomly get exceptions later. - * - * FIXME! Is this perhaps only true for the old-style - * irq13 case? Maybe we could leave the x87 state - * intact otherwise? 
- */ - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { - asm volatile("fnclex"); - return 0; - } - return 1; -} - -static inline int __save_init_fpu(struct task_struct *tsk) -{ - return fpu_save_init(&tsk->thread.fpu); -} - -static inline int fpu_restore_checking(struct fpu *fpu) -{ - if (use_xsave()) - return fpu_xrstor_checking(&fpu->state->xsave); - else if (use_fxsr()) - return fxrstor_checking(&fpu->state->fxsave); - else - return frstor_checking(&fpu->state->fsave); -} - -static inline int restore_fpu_checking(struct task_struct *tsk) -{ - /* - * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is - * pending. Clear the x87 state here by setting it to fixed values. - * "m" is a random variable that should be in L1. - */ - if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { - asm volatile( - "fnclex\n\t" - "emms\n\t" - "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (tsk->thread.fpu.has_fpu)); - } - - return fpu_restore_checking(&tsk->thread.fpu); -} - -/* - * Software FPU state helpers. Careful: these need to - * be preemption protection *and* they need to be - * properly paired with the CR0.TS changes! - */ -static inline int __thread_has_fpu(struct task_struct *tsk) -{ - return tsk->thread.fpu.has_fpu; -} - -/* Must be paired with an 'stts' after! */ -static inline void __thread_clear_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 0; - this_cpu_write(fpu_owner_task, NULL); -} - -/* Must be paired with a 'clts' before! */ -static inline void __thread_set_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 1; - this_cpu_write(fpu_owner_task, tsk); -} - -/* - * Encapsulate the CR0.TS handling together with the - * software flag. - * - * These generally need preemption protection to work, - * do try to avoid using these on their own. - */ -static inline void __thread_fpu_end(struct task_struct *tsk) -{ - __thread_clear_has_fpu(tsk); - if (!use_eager_fpu()) - stts(); -} - -static inline void __thread_fpu_begin(struct task_struct *tsk) -{ - if (!use_eager_fpu()) - clts(); - __thread_set_has_fpu(tsk); -} - -static inline void drop_fpu(struct task_struct *tsk) -{ - /* - * Forget coprocessor state.. - */ - preempt_disable(); - tsk->thread.fpu_counter = 0; - - if (__thread_has_fpu(tsk)) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(tsk); - } - - clear_stopped_child_used_math(tsk); - preempt_enable(); -} - -static inline void restore_init_xstate(void) -{ - if (use_xsave()) - xrstor_state(init_xstate_buf, -1); - else - fxrstor_checking(&init_xstate_buf->i387); -} - -/* - * Reset the FPU state in the eager case and drop it in the lazy case (later use - * will reinit it). - */ -static inline void fpu_reset_state(struct task_struct *tsk) -{ - if (!use_eager_fpu()) - drop_fpu(tsk); - else - restore_init_xstate(); -} - -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state and - * sets the new state of the CR0.TS bit. This is - * done within the context of the old process. - * - * - switch_fpu_finish() restores the new state as - * necessary. - */ -typedef struct { int preload; } fpu_switch_t; - -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) -{ - fpu_switch_t fpu; - - /* - * If the task has used the math, pre-load the FPU on xsave processors - * or if the past 5 consecutive context-switches used math. 
- */ - fpu.preload = tsk_used_math(new) && - (use_eager_fpu() || new->thread.fpu_counter > 5); - - if (__thread_has_fpu(old)) { - if (!__save_init_fpu(old)) - task_disable_lazy_fpu_restore(old); - else - old->thread.fpu.last_cpu = cpu; - - /* But leave fpu_owner_task! */ - old->thread.fpu.has_fpu = 0; - - /* Don't change CR0.TS if we just switch! */ - if (fpu.preload) { - new->thread.fpu_counter++; - __thread_set_has_fpu(new); - prefetch(new->thread.fpu.state); - } else if (!use_eager_fpu()) - stts(); - } else { - old->thread.fpu_counter = 0; - task_disable_lazy_fpu_restore(old); - if (fpu.preload) { - new->thread.fpu_counter++; - if (fpu_lazy_restore(new, cpu)) - fpu.preload = 0; - else - prefetch(new->thread.fpu.state); - __thread_fpu_begin(new); - } - } - return fpu; -} - -/* - * By the time this gets called, we've already cleared CR0.TS and - * given the process the FPU if we are going to preload the FPU - * state - all we need to do is to conditionally restore the register - * state itself. - */ -static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) -{ - if (fpu.preload) { - if (unlikely(restore_fpu_checking(new))) - fpu_reset_state(new); - } -} - -/* - * Signal frame handlers... - */ -extern int save_xstate_sig(void __user *buf, void __user *fx, int size); -extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size); - -static inline int xstate_sigframe_size(void) -{ - return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; -} - -static inline int restore_xstate_sig(void __user *buf, int ia32_frame) -{ - void __user *buf_fx = buf; - int size = xstate_sigframe_size(); - - if (ia32_frame && use_fxsr()) { - buf_fx = buf + sizeof(struct i387_fsave_struct); - size += sizeof(struct i387_fsave_struct); - } - - return __restore_xstate_sig(buf, buf_fx, size); -} - -/* - * Needs to be preemption-safe. - * - * NOTE! user_fpu_begin() must be used only immediately before restoring - * the save state. It does not do any saving/restoring on its own. In - * lazy FPU mode, it is just an optimization to avoid a #NM exception, - * the task can lose the FPU right after preempt_enable(). 
- */ -static inline void user_fpu_begin(void) -{ - preempt_disable(); - if (!user_has_fpu()) - __thread_fpu_begin(current); - preempt_enable(); -} - -static inline void __save_fpu(struct task_struct *tsk) -{ - if (use_xsave()) { - if (unlikely(system_state == SYSTEM_BOOTING)) - xsave_state_booting(&tsk->thread.fpu.state->xsave, -1); - else - xsave_state(&tsk->thread.fpu.state->xsave, -1); - } else - fpu_fxsave(&tsk->thread.fpu); -} - -/* - * i387 state interaction - */ -static inline unsigned short get_fpu_cwd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.cwd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.swd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) -{ - if (cpu_has_xmm) { - return tsk->thread.fpu.state->fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - -static bool fpu_allocated(struct fpu *fpu) -{ - return fpu->state != NULL; -} - -static inline int fpu_alloc(struct fpu *fpu) -{ - if (fpu_allocated(fpu)) - return 0; - fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); - if (!fpu->state) - return -ENOMEM; - WARN_ON((unsigned long)fpu->state & 15); - return 0; -} - -static inline void fpu_free(struct fpu *fpu) -{ - if (fpu->state) { - kmem_cache_free(task_xstate_cachep, fpu->state); - fpu->state = NULL; - } -} - -static inline void fpu_copy(struct task_struct *dst, struct task_struct *src) -{ - if (use_eager_fpu()) { - memset(&dst->thread.fpu.state->xsave, 0, xstate_size); - __save_fpu(dst); - } else { - struct fpu *dfpu = &dst->thread.fpu; - struct fpu *sfpu = &src->thread.fpu; - - unlazy_fpu(src); - memcpy(dfpu->state, sfpu->state, xstate_size); - } -} - -static inline unsigned long -alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, - unsigned long *size) -{ - unsigned long frame_size = xstate_sigframe_size(); - - *buf_fx = sp = round_down(sp - frame_size, 64); - if (ia32_frame && use_fxsr()) { - frame_size += sizeof(struct i387_fsave_struct); - sp -= sizeof(struct i387_fsave_struct); - } - - *size = frame_size; - return sp; -} - -#endif diff --git a/kernel/arch/x86/include/asm/fpu/api.h b/kernel/arch/x86/include/asm/fpu/api.h new file mode 100644 index 000000000..1429a7c73 --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/api.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + * x86-64 work by Andi Kleen 2002 + */ + +#ifndef _ASM_X86_FPU_API_H +#define _ASM_X86_FPU_API_H + +/* + * Careful: __kernel_fpu_begin/end() must be called with preempt disabled + * and they don't touch the preempt state on their own. + * If you enable preemption after __kernel_fpu_begin(), preempt notifier + * should call the __kernel_fpu_end() to prevent the kernel/user FPU + * state from getting corrupted. KVM for example uses this model. + * + * All other cases use kernel_fpu_begin/end() which disable preemption + * during kernel FPU usage. 
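In practice the rules described in this comment reduce to the usual pattern in SIMD-accelerated kernel code: check irq_fpu_usable(), then keep all FPU/SIMD work inside a kernel_fpu_begin()/kernel_fpu_end() section. A minimal sketch (helper name invented; the actual vector code is elided):

        #include <linux/types.h>
        #include <asm/fpu/api.h>

        static bool simd_work_sketch(void *dst, size_t len)
        {
                if (!irq_fpu_usable())
                        return false;   /* caller falls back to a scalar path */

                kernel_fpu_begin();     /* disables preemption, saves user FPU state as needed */
                /* ... SSE/AVX work on dst/len ... */
                kernel_fpu_end();
                return true;
        }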
+ */ +extern void __kernel_fpu_begin(void); +extern void __kernel_fpu_end(void); +extern void kernel_fpu_begin(void); +extern void kernel_fpu_end(void); +extern bool irq_fpu_usable(void); + +/* + * Some instructions like VIA's padlock instructions generate a spurious + * DNA fault but don't modify SSE registers. And these instructions + * get used from interrupt context as well. To prevent these kernel instructions + * in interrupt context interacting wrongly with other user/kernel fpu usage, we + * should use them only in the context of irq_ts_save/restore() + */ +extern int irq_ts_save(void); +extern void irq_ts_restore(int TS_state); + +/* + * Query the presence of one or more xfeatures. Works on any legacy CPU as well. + * + * If 'feature_name' is set then put a human-readable description of + * the feature there as well - this can be used to print error (or success) + * messages. + */ +extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); + +#endif /* _ASM_X86_FPU_API_H */ diff --git a/kernel/arch/x86/include/asm/fpu/internal.h b/kernel/arch/x86/include/asm/fpu/internal.h new file mode 100644 index 000000000..3c3550c3a --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/internal.h @@ -0,0 +1,694 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + * x86-64 work by Andi Kleen 2002 + */ + +#ifndef _ASM_X86_FPU_INTERNAL_H +#define _ASM_X86_FPU_INTERNAL_H + +#include <linux/compat.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/user.h> +#include <asm/fpu/api.h> +#include <asm/fpu/xstate.h> + +/* + * High level FPU state handling functions: + */ +extern void fpu__activate_curr(struct fpu *fpu); +extern void fpu__activate_fpstate_read(struct fpu *fpu); +extern void fpu__activate_fpstate_write(struct fpu *fpu); +extern void fpu__save(struct fpu *fpu); +extern void fpu__restore(struct fpu *fpu); +extern int fpu__restore_sig(void __user *buf, int ia32_frame); +extern void fpu__drop(struct fpu *fpu); +extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); +extern void fpu__clear(struct fpu *fpu); +extern int fpu__exception_code(struct fpu *fpu, int trap_nr); +extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); + +/* + * Boot time FPU initialization functions: + */ +extern void fpu__init_cpu(void); +extern void fpu__init_system_xstate(void); +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system(struct cpuinfo_x86 *c); +extern void fpu__init_check_bugs(void); +extern void fpu__resume_cpu(void); + +/* + * Debugging facility: + */ +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +#endif + +/* + * FPU related CPU feature flag helper routines: + */ +static __always_inline __pure bool use_eager_fpu(void) +{ + return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); +} + +static __always_inline __pure bool use_xsaveopt(void) +{ + return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); +} + +static __always_inline __pure bool use_xsave(void) +{ + return static_cpu_has_safe(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return static_cpu_has_safe(X86_FEATURE_FXSR); +} + +/* + * fpstate handling functions: + */ + +extern union fpregs_state init_fpstate; + +extern void fpstate_init(union fpregs_state *state); +#ifdef CONFIG_MATH_EMULATION +extern void fpstate_init_soft(struct swregs_state *soft); 
+#else +static inline void fpstate_init_soft(struct swregs_state *soft) {} +#endif +static inline void fpstate_init_fxstate(struct fxregs_state *fx) +{ + fx->cwd = 0x37f; + fx->mxcsr = MXCSR_DEFAULT; +} +extern void fpstate_sanitize_xstate(struct fpu *fpu); + +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + asm volatile(ASM_STAC "\n" \ + "1:" #insn "\n\t" \ + "2: " ASM_CLAC "\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define check_insn(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +static inline int copy_fregs_to_user(struct fregs_state __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + + /* See comment in copy_fxregs_to_kernel() below. */ + return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); +} + +static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) +{ + int err; + + if (config_enabled(CONFIG_X86_32)) { + err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + } else { + if (config_enabled(CONFIG_AS_FXSAVEQ)) { + err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + } else { + /* See comment in copy_fxregs_to_kernel() below. */ + err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); + } + } + /* Copying from a kernel buffer to FPU registers should never fail: */ + WARN_ON_FPU(err); +} + +static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + + /* See comment in copy_fxregs_to_kernel() below. */ + return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), + "m" (*fx)); +} + +static inline void copy_kernel_to_fregs(struct fregs_state *fx) +{ + int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + + WARN_ON_FPU(err); +} + +static inline int copy_user_to_fregs(struct fregs_state __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void copy_fxregs_to_kernel(struct fpu *fpu) +{ + if (config_enabled(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); + else { + /* Using "rex64; fxsave %0" is broken because, if the memory + * operand uses any extended registers for addressing, a second + * REX prefix will be generated (to the assembler, rex64 + * followed by semicolon is a separate instruction), and hence + * the 64-bitness is lost. + * + * Using "fxsaveq %0" would be the ideal choice, but is only + * supported starting with gas 2.16. 
+ * + * Using, as a workaround, the properly prefixed form below + * isn't accepted by any binutils version so far released, + * complaining that the same type of prefix is used twice if + * an extended register is needed for addressing (fix submitted + * to mainline 2005-11-21). + * + * asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave)); + * + * This, however, we can work around by forcing the compiler to + * select an addressing mode that doesn't require extended + * registers. + */ + asm volatile( "rex64/fxsave (%[fx])" + : "=m" (fpu->state.fxsave) + : [fx] "R" (&fpu->state.fxsave)); + } +} + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* xstate instruction fault handler: */ +#define xstate_fault(__err) \ + \ + ".section .fixup,\"ax\"\n" \ + \ + "3: movl $-2,%[_err]\n" \ + " jmp 2b\n" \ + \ + ".previous\n" \ + \ + _ASM_EXTABLE(1b, 3b) \ + : [_err] "=r" (__err) + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XSAVES"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + else + asm volatile("1:"XSAVE"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XRSTORS"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + else + asm volatile("1:"XRSTOR"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + + /* We should never fault when copying from a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Save processor xstate to xsave area. + */ +static inline void copy_xregs_to_kernel(struct xregs_state *xstate) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(!alternatives_patched); + + /* + * If xsaves is enabled, xsaves replaces xsaveopt because + * it supports compact format and supervisor states in addition to + * modified optimization in xsaveopt. + * + * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave + * because xsaveopt supports modified optimization which is not + * supported by xsave. + * + * If none of xsaves and xsaveopt is enabled, use xsave. 
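Spelled out as ordinary branches, the preference order this comment describes (XSAVES, then XSAVEOPT, then XSAVE) would look roughly like the sketch below; the real code instead bakes the choice into the instruction stream with the alternative_input_2() that follows, using the hand-assembled byte sequences defined above. The sketch assumes a 64-byte-aligned xsave area and an assembler new enough to accept the plain mnemonics:

        #include <linux/types.h>
        #include <asm/cpufeature.h>
        #include <asm/fpu/types.h>

        static void xsave_choice_sketch(struct xregs_state *xs, u32 lmask, u32 hmask)
        {
                if (boot_cpu_has(X86_FEATURE_XSAVES))
                        /* compacted format, supervisor states, init/modified optimizations */
                        asm volatile("xsaves %0" : "+m" (*xs) : "a" (lmask), "d" (hmask) : "memory");
                else if (boot_cpu_has(X86_FEATURE_XSAVEOPT))
                        /* standard format plus the modified-state optimization */
                        asm volatile("xsaveopt %0" : "+m" (*xs) : "a" (lmask), "d" (hmask) : "memory");
                else
                        /* baseline XSAVE */
                        asm volatile("xsave %0" : "+m" (*xs) : "a" (lmask), "d" (hmask) : "memory");
        }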
+ */ + alternative_input_2( + "1:"XSAVE, + XSAVEOPT, + X86_FEATURE_XSAVEOPT, + XSAVES, + X86_FEATURE_XSAVES, + [xstate] "D" (xstate), "a" (lmask), "d" (hmask) : + "memory"); + asm volatile("2:\n\t" + xstate_fault(err) + : "0" (err) + : "memory"); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Restore processor xstate from xsave area. + */ +static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + /* + * Use xrstors to restore context if it is enabled. xrstors supports + * compacted format of xsave area which is not supported by xrstor. + */ + alternative_input( + "1: " XRSTOR, + XRSTORS, + X86_FEATURE_XSAVES, + "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile("2:\n" + xstate_fault(err) + : "0" (err) + : "memory"); + + /* We should never fault when copying from a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for + * backward compatibility for old applications which don't understand + * compacted format of xsave area. + */ +static inline int copy_xregs_to_user(struct xregs_state __user *buf) +{ + int err; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + err = __clear_user(&buf->header, sizeof(buf->header)); + if (unlikely(err)) + return -EFAULT; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XSAVE"\n" + "2: " ASM_CLAC "\n" + xstate_fault(err) + : "D" (buf), "a" (-1), "d" (-1), "0" (err) + : "memory"); + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) +{ + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XRSTOR"\n" + "2: " ASM_CLAC "\n" + xstate_fault(err) + : "D" (xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); /* memory required? */ + return err; +} + +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact and we can + * keep registers active. + * + * The legacy FNSAVE instruction cleared all FPU state + * unconditionally, so registers are essentially destroyed. + * Modern FPU state can be kept in registers, if there are + * no pending FP exceptions. + */ +static inline int copy_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + copy_xregs_to_kernel(&fpu->state.xsave); + return 1; + } + + if (likely(use_fxsr())) { + copy_fxregs_to_kernel(fpu); + return 1; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to mark them inactive: + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); + + return 0; +} + +static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate) +{ + if (use_xsave()) { + copy_kernel_to_xregs(&fpstate->xsave, -1); + } else { + if (use_fxsr()) + copy_kernel_to_fxregs(&fpstate->fxsave); + else + copy_kernel_to_fregs(&fpstate->fsave); + } +} + +static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) +{ + /* + * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is + * pending. Clear the x87 state here by setting it to fixed values. 
+ * "m" is a random variable that should be in L1. + */ + if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %P[addr]" /* set F?P to defined value */ + : : [addr] "m" (fpstate)); + } + + __copy_kernel_to_fpregs(fpstate); +} + +extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); + +/* + * FPU context switch related helper methods: + */ + +DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +/* + * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, + * on this CPU. + * + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. + */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; +} + +static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + + +/* + * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' + * idiom, which is then paired with the sw-flag (fpregs_active) later on: + */ + +static inline void __fpregs_activate_hw(void) +{ + if (!use_eager_fpu()) + clts(); +} + +static inline void __fpregs_deactivate_hw(void) +{ + if (!use_eager_fpu()) + stts(); +} + +/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ +static inline void __fpregs_deactivate(struct fpu *fpu) +{ + WARN_ON_FPU(!fpu->fpregs_active); + + fpu->fpregs_active = 0; + this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +/* Must be paired with a 'clts' (fpregs_activate_hw()) before! */ +static inline void __fpregs_activate(struct fpu *fpu) +{ + WARN_ON_FPU(fpu->fpregs_active); + + fpu->fpregs_active = 1; + this_cpu_write(fpu_fpregs_owner_ctx, fpu); +} + +/* + * The question "does this thread have fpu access?" + * is slightly racy, since preemption could come in + * and revoke it immediately after the test. + * + * However, even in that very unlikely scenario, + * we can just assume we have FPU access - typically + * to save the FP state - we'll just take a #NM + * fault and get the FPU access back. + */ +static inline int fpregs_active(void) +{ + return current->thread.fpu.fpregs_active; +} + +/* + * Encapsulate the CR0.TS handling together with the + * software flag. + * + * These generally need preemption protection to work, + * do try to avoid using these on their own. + */ +static inline void fpregs_activate(struct fpu *fpu) +{ + __fpregs_activate_hw(); + __fpregs_activate(fpu); +} + +static inline void fpregs_deactivate(struct fpu *fpu) +{ + __fpregs_deactivate(fpu); + __fpregs_deactivate_hw(); +} + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state and + * sets the new state of the CR0.TS bit. This is + * done within the context of the old process. + * + * - switch_fpu_finish() restores the new state as + * necessary. + */ +typedef struct { int preload; } fpu_switch_t; + +static inline fpu_switch_t +switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) +{ + fpu_switch_t fpu; + + /* + * If the task has used the math, pre-load the FPU on xsave processors + * or if the past 5 consecutive context-switches used math. 
+ */ + fpu.preload = new_fpu->fpstate_active && + (use_eager_fpu() || new_fpu->counter > 5); + + if (old_fpu->fpregs_active) { + if (!copy_fpregs_to_fpstate(old_fpu)) + old_fpu->last_cpu = -1; + else + old_fpu->last_cpu = cpu; + + /* But leave fpu_fpregs_owner_ctx! */ + old_fpu->fpregs_active = 0; + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + new_fpu->counter++; + __fpregs_activate(new_fpu); + prefetch(&new_fpu->state); + } else { + __fpregs_deactivate_hw(); + } + } else { + old_fpu->counter = 0; + old_fpu->last_cpu = -1; + if (fpu.preload) { + new_fpu->counter++; + if (fpu_want_lazy_restore(new_fpu, cpu)) + fpu.preload = 0; + else + prefetch(&new_fpu->state); + fpregs_activate(new_fpu); + } + } + return fpu; +} + +/* + * Misc helper functions: + */ + +/* + * By the time this gets called, we've already cleared CR0.TS and + * given the process the FPU if we are going to preload the FPU + * state - all we need to do is to conditionally restore the register + * state itself. + */ +static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) +{ + if (fpu_switch.preload) + copy_kernel_to_fpregs(&new_fpu->state); +} + +/* + * Needs to be preemption-safe. + * + * NOTE! user_fpu_begin() must be used only immediately before restoring + * the save state. It does not do any saving/restoring on its own. In + * lazy FPU mode, it is just an optimization to avoid a #NM exception, + * the task can lose the FPU right after preempt_enable(). + */ +static inline void user_fpu_begin(void) +{ + struct fpu *fpu = &current->thread.fpu; + + preempt_disable(); + if (!fpregs_active()) + fpregs_activate(fpu); + preempt_enable(); +} + +/* + * MXCSR and XCR definitions: + */ + +extern unsigned int mxcsr_feature_mask; + +#define XCR_XFEATURE_ENABLED_MASK 0x00000000 + +static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ + : : "a" (eax), "d" (edx), "c" (index)); +} + +#endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/kernel/arch/x86/include/asm/fpu/regset.h b/kernel/arch/x86/include/asm/fpu/regset.h new file mode 100644 index 000000000..39d3107ac --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/regset.h @@ -0,0 +1,21 @@ +/* + * FPU regset handling methods: + */ +#ifndef _ASM_X86_FPU_REGSET_H +#define _ASM_X86_FPU_REGSET_H + +#include <linux/regset.h> + +extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + +/* + * xstateregs_active == regset_fpregs_active. Please refer to the comment + * at the definition of regset_fpregs_active.
+ */ +#define xstateregs_active regset_fpregs_active + +#endif /* _ASM_X86_FPU_REGSET_H */ diff --git a/kernel/arch/x86/include/asm/fpu/signal.h b/kernel/arch/x86/include/asm/fpu/signal.h new file mode 100644 index 000000000..0e970d00d --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/signal.h @@ -0,0 +1,33 @@ +/* + * x86 FPU signal frame handling methods: + */ +#ifndef _ASM_X86_FPU_SIGNAL_H +#define _ASM_X86_FPU_SIGNAL_H + +#ifdef CONFIG_X86_64 +# include <uapi/asm/sigcontext.h> +# include <asm/user32.h> +struct ksignal; +int ia32_setup_rt_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs); +#else +# define user_i387_ia32_struct user_i387_struct +# define user32_fxsr_struct user_fxsr_struct +# define ia32_setup_frame __setup_frame +# define ia32_setup_rt_frame __setup_rt_frame +#endif + +extern void convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk); +extern void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env); + +unsigned long +fpu__alloc_mathframe(unsigned long sp, int ia32_frame, + unsigned long *buf_fx, unsigned long *size); + +extern void fpu__init_prepare_fx_sw_frame(void); + +#endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/kernel/arch/x86/include/asm/fpu/types.h b/kernel/arch/x86/include/asm/fpu/types.h new file mode 100644 index 000000000..1c6f6ac52 --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/types.h @@ -0,0 +1,355 @@ +/* + * FPU data structures: + */ +#ifndef _ASM_X86_FPU_H +#define _ASM_X86_FPU_H + +/* + * The legacy x87 FPU state format, as saved by FSAVE and + * restored by the FRSTOR instructions: + */ +struct fregs_state { + u32 cwd; /* FPU Control Word */ + u32 swd; /* FPU Status Word */ + u32 twd; /* FPU Tag Word */ + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Pointer Offset */ + u32 fos; /* FPU Operand Pointer Selector */ + + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + + /* Software status information [not touched by FSAVE]: */ + u32 status; +}; + +/* + * The legacy fx SSE/MMX FPU state format, as saved by FXSAVE and + * restored by the FXRSTOR instructions. It's similar to the FSAVE + * format, but differs in some areas, plus has extensions at + * the end for the XMM registers. + */ +struct fxregs_state { + u16 cwd; /* Control Word */ + u16 swd; /* Status Word */ + u16 twd; /* Tag Word */ + u16 fop; /* Last Instruction Opcode */ + union { + struct { + u64 rip; /* Instruction Pointer */ + u64 rdp; /* Data Pointer */ + }; + struct { + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Offset */ + u32 fos; /* FPU Operand Selector */ + }; + }; + u32 mxcsr; /* MXCSR Register State */ + u32 mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes: */ + u32 st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes: */ + u32 xmm_space[64]; + + u32 padding[12]; + + union { + u32 padding1[12]; + u32 sw_reserved[12]; + }; + +} __attribute__((aligned(16))); + +/* Default value for fxregs_state.mxcsr: */ +#define MXCSR_DEFAULT 0x1f80 + +/* + * Software based FPU emulation state. 
This is arbitrary really, + * it matches the x87 format to make it easier to understand: + */ +struct swregs_state { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + u8 ftop; + u8 changed; + u8 lookahead; + u8 no_update; + u8 rm; + u8 alimit; + struct math_emu_info *info; + u32 entry_eip; +}; + +/* + * List of XSAVE features Linux knows about: + */ +enum xfeature { + XFEATURE_FP, + XFEATURE_SSE, + /* + * Values above here are "legacy states". + * Those below are "extended states". + */ + XFEATURE_YMM, + XFEATURE_BNDREGS, + XFEATURE_BNDCSR, + XFEATURE_OPMASK, + XFEATURE_ZMM_Hi256, + XFEATURE_Hi16_ZMM, + + XFEATURE_MAX, +}; + +#define XFEATURE_MASK_FP (1 << XFEATURE_FP) +#define XFEATURE_MASK_SSE (1 << XFEATURE_SSE) +#define XFEATURE_MASK_YMM (1 << XFEATURE_YMM) +#define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS) +#define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR) +#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) +#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) +#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) + +#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ + | XFEATURE_MASK_ZMM_Hi256 \ + | XFEATURE_MASK_Hi16_ZMM) + +#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM + +struct reg_128_bit { + u8 regbytes[128/8]; +}; +struct reg_256_bit { + u8 regbytes[256/8]; +}; +struct reg_512_bit { + u8 regbytes[512/8]; +}; + +/* + * State component 2: + * + * There are 16x 256-bit AVX registers named YMM0-YMM15. + * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) + * and are stored in 'struct fxregs_state::xmm_space[]' in the + * "legacy" area. + * + * The high 128 bits are stored here. + */ +struct ymmh_struct { + struct reg_128_bit hi_ymm[16]; +} __packed; + +/* Intel MPX support: */ + +struct mpx_bndreg { + u64 lower_bound; + u64 upper_bound; +} __packed; +/* + * State component 3 is used for the 4 128-bit bounds registers + */ +struct mpx_bndreg_state { + struct mpx_bndreg bndreg[4]; +} __packed; + +/* + * State component 4 is used for the 64-bit user-mode MPX + * configuration register BNDCFGU and the 64-bit MPX status + * register BNDSTATUS. We call the pair "BNDCSR". + */ +struct mpx_bndcsr { + u64 bndcfgu; + u64 bndstatus; +} __packed; + +/* + * The BNDCSR state is padded out to be 64-bytes in size. + */ +struct mpx_bndcsr_state { + union { + struct mpx_bndcsr bndcsr; + u8 pad_to_64_bytes[64]; + }; +} __packed; + +/* AVX-512 Components: */ + +/* + * State component 5 is used for the 8 64-bit opmask registers + * k0-k7 (opmask state). + */ +struct avx_512_opmask_state { + u64 opmask_reg[8]; +} __packed; + +/* + * State component 6 is used for the upper 256 bits of the + * registers ZMM0-ZMM15. These 16 256-bit values are denoted + * ZMM0_H-ZMM15_H (ZMM_Hi256 state). + */ +struct avx_512_zmm_uppers_state { + struct reg_256_bit zmm_upper[16]; +} __packed; + +/* + * State component 7 is used for the 16 512-bit registers + * ZMM16-ZMM31 (Hi16_ZMM state). + */ +struct avx_512_hi16_state { + struct reg_512_bit hi16_zmm[16]; +} __packed; + +struct xstate_header { + u64 xfeatures; + u64 xcomp_bv; + u64 reserved[6]; +} __attribute__((packed)); + +/* + * This is our most modern FPU state format, as saved by the XSAVE + * and restored by the XRSTOR instructions. + * + * It consists of a legacy fxregs portion, an xstate header and + * subsequent areas as defined by the xstate header. 
Not all CPUs + * support all the extensions, so the size of the extended area + * can vary quite a bit between CPUs. + */ +struct xregs_state { + struct fxregs_state i387; + struct xstate_header header; + u8 extended_state_area[0]; +} __attribute__ ((packed, aligned (64))); + +/* + * This is a union of all the possible FPU state formats + * put together, so that we can pick the right one runtime. + * + * The size of the structure is determined by the largest + * member - which is the xsave area. The padding is there + * to ensure that statically-allocated task_structs (just + * the init_task today) have enough space. + */ +union fpregs_state { + struct fregs_state fsave; + struct fxregs_state fxsave; + struct swregs_state soft; + struct xregs_state xsave; + u8 __padding[PAGE_SIZE]; +}; + +/* + * Highest level per task FPU state data structure that + * contains the FPU register state plus various FPU + * state fields: + */ +struct fpu { + /* + * @last_cpu: + * + * Records the last CPU on which this context was loaded into + * FPU registers. (In the lazy-restore case we might be + * able to reuse FPU registers across multiple context switches + * this way, if no intermediate task used the FPU.) + * + * A value of -1 is used to indicate that the FPU state in context + * memory is newer than the FPU state in registers, and that the + * FPU state should be reloaded next time the task is run. + */ + unsigned int last_cpu; + + /* + * @fpstate_active: + * + * This flag indicates whether this context is active: if the task + * is not running then we can restore from this context, if the task + * is running then we should save into this context. + */ + unsigned char fpstate_active; + + /* + * @fpregs_active: + * + * This flag determines whether a given context is actively + * loaded into the FPU's registers and that those registers + * represent the task's current FPU state. + * + * Note the interaction with fpstate_active: + * + * # task does not use the FPU: + * fpstate_active == 0 + * + * # task uses the FPU and regs are active: + * fpstate_active == 1 && fpregs_active == 1 + * + * # the regs are inactive but still match fpstate: + * fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu + * + * The third state is what we use for the lazy restore optimization + * on lazy-switching CPUs. + */ + unsigned char fpregs_active; + + /* + * @counter: + * + * This counter contains the number of consecutive context switches + * during which the FPU stays used. If this is over a threshold, the + * lazy FPU restore logic becomes eager, to save the trap overhead. + * This is an unsigned char so that after 256 iterations the counter + * wraps and the context switch behavior turns lazy again; this is to + * deal with bursty apps that only use the FPU for a short time: + */ + unsigned char counter; + /* + * @state: + * + * In-memory copy of all FPU registers that we save/restore + * over context switches. If the task is using the FPU then + * the registers in the FPU are more recent than this state + * copy. If the task context-switches away then they get + * saved here and represent the FPU state. + * + * After context switches there may be a (short) time period + * during which the in-FPU hardware registers are unchanged + * and still perfectly match this state, if the tasks + * scheduled afterwards are not using the FPU. + * + * This is the 'lazy restore' window of optimization, which + * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. 
+ * + * We detect whether a subsequent task uses the FPU via setting + * CR0::TS to 1, which causes any FPU use to raise a #NM fault. + * + * During this window, if the task gets scheduled again, we + * might be able to skip having to do a restore from this + * memory buffer to the hardware registers - at the cost of + * incurring the overhead of #NM fault traps. + * + * Note that on modern CPUs that support the XSAVEOPT (or other + * optimized XSAVE instructions), we don't use #NM traps anymore, + * as the hardware can track whether FPU registers need saving + * or not. On such CPUs we activate the non-lazy ('eagerfpu') + * logic, which unconditionally saves/restores all FPU state + * across context switches. (if FPU state exists.) + */ + union fpregs_state state; + /* + * WARNING: 'state' is dynamically-sized. Do not put + * anything after it here. + */ +}; + +#endif /* _ASM_X86_FPU_H */ diff --git a/kernel/arch/x86/include/asm/fpu/xstate.h b/kernel/arch/x86/include/asm/fpu/xstate.h new file mode 100644 index 000000000..3a6c89b70 --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/xstate.h @@ -0,0 +1,51 @@ +#ifndef __ASM_X86_XSAVE_H +#define __ASM_X86_XSAVE_H + +#include <linux/types.h> +#include <asm/processor.h> +#include <linux/uaccess.h> + +/* Bit 63 of XCR0 is reserved for future expansion */ +#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) + +#define XSTATE_CPUID 0x0000000d + +#define FXSAVE_SIZE 512 + +#define XSAVE_HDR_SIZE 64 +#define XSAVE_HDR_OFFSET FXSAVE_SIZE + +#define XSAVE_YMM_SIZE 256 +#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) + +/* Supported features which support lazy state saving */ +#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM) + +/* Supported features which require eager state saving */ +#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR) + +/* All currently supported features */ +#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +extern unsigned int xstate_size; +extern u64 xfeatures_mask; +extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); + +void fpu__xstate_clear_all_cpu_caps(void); +void *get_xsave_addr(struct xregs_state *xsave, int xstate); +const void *get_xsave_field_ptr(int xstate_field); + +#endif diff --git a/kernel/arch/x86/include/asm/frame.h b/kernel/arch/x86/include/asm/frame.h index 3b629f47e..793179cf8 100644 --- a/kernel/arch/x86/include/asm/frame.h +++ b/kernel/arch/x86/include/asm/frame.h @@ -1,20 +1,17 @@ #ifdef __ASSEMBLY__ #include <asm/asm.h> -#include <asm/dwarf2.h> /* The annotation hides the frame from the unwinder and makes it look like a ordinary ebp save/restore. 
This avoids some special cases for frame pointer later */ #ifdef CONFIG_FRAME_POINTER .macro FRAME - __ASM_SIZE(push,_cfi) %__ASM_REG(bp) - CFI_REL_OFFSET __ASM_REG(bp), 0 + __ASM_SIZE(push,) %__ASM_REG(bp) __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) .endm .macro ENDFRAME - __ASM_SIZE(pop,_cfi) %__ASM_REG(bp) - CFI_RESTORE __ASM_REG(bp) + __ASM_SIZE(pop,) %__ASM_REG(bp) .endm #else .macro FRAME diff --git a/kernel/arch/x86/include/asm/ftrace.h b/kernel/arch/x86/include/asm/ftrace.h index f45acad3c..24938852d 100644 --- a/kernel/arch/x86/include/asm/ftrace.h +++ b/kernel/arch/x86/include/asm/ftrace.h @@ -3,9 +3,9 @@ #ifdef CONFIG_FUNCTION_TRACER #ifdef CC_USING_FENTRY -# define MCOUNT_ADDR ((long)(__fentry__)) +# define MCOUNT_ADDR ((unsigned long)(__fentry__)) #else -# define MCOUNT_ADDR ((long)(mcount)) +# define MCOUNT_ADDR ((unsigned long)(mcount)) #endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ diff --git a/kernel/arch/x86/include/asm/hardirq.h b/kernel/arch/x86/include/asm/hardirq.h index 0f5fb6b65..7178043b0 100644 --- a/kernel/arch/x86/include/asm/hardirq.h +++ b/kernel/arch/x86/include/asm/hardirq.h @@ -14,6 +14,7 @@ typedef struct { #endif #ifdef CONFIG_HAVE_KVM unsigned int kvm_posted_intr_ipis; + unsigned int kvm_posted_intr_wakeup_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; @@ -33,6 +34,9 @@ typedef struct { #ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; #endif +#ifdef CONFIG_X86_MCE_AMD + unsigned int irq_deferred_error_count; +#endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) unsigned int irq_hv_callback_count; #endif diff --git a/kernel/arch/x86/include/asm/highmem.h b/kernel/arch/x86/include/asm/highmem.h index 04e9d0231..1c0b43724 100644 --- a/kernel/arch/x86/include/asm/highmem.h +++ b/kernel/arch/x86/include/asm/highmem.h @@ -68,7 +68,6 @@ void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); -struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) diff --git a/kernel/arch/x86/include/asm/hpet.h b/kernel/arch/x86/include/asm/hpet.h index 36f712594..cc285ec4b 100644 --- a/kernel/arch/x86/include/asm/hpet.h +++ b/kernel/arch/x86/include/asm/hpet.h @@ -63,10 +63,10 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; -extern int boot_hpet_disable; +extern bool boot_hpet_disable; extern u8 hpet_blockid; -extern int hpet_force_user; -extern u8 hpet_msi_disable; +extern bool hpet_force_user; +extern bool hpet_msi_disable; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); @@ -74,20 +74,16 @@ extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); struct irq_data; +struct hpet_dev; +struct irq_domain; + extern void hpet_msi_unmask(struct irq_data *data); extern void hpet_msi_mask(struct irq_data *data); -struct hpet_dev; extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg); extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg); - -#ifdef CONFIG_PCI_MSI -extern int default_setup_hpet_msi(unsigned int irq, unsigned int id); -#else -static inline int default_setup_hpet_msi(unsigned int irq, unsigned int id) -{ - return -EINVAL; -} -#endif +extern struct irq_domain *hpet_create_irq_domain(int hpet_id); +extern int hpet_assign_irq(struct irq_domain *domain, + struct 
hpet_dev *dev, int dev_num); #ifdef CONFIG_HPET_EMULATE_RTC diff --git a/kernel/arch/x86/include/asm/hugetlb.h b/kernel/arch/x86/include/asm/hugetlb.h index 68c05398b..f8a29d2c9 100644 --- a/kernel/arch/x86/include/asm/hugetlb.h +++ b/kernel/arch/x86/include/asm/hugetlb.h @@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, @@ -83,15 +80,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return *ptep; } -static inline int arch_prepare_hugepage(struct page *page) -{ - return 0; -} - -static inline void arch_release_hugepage(struct page *page) -{ -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/kernel/arch/x86/include/asm/hw_irq.h b/kernel/arch/x86/include/asm/hw_irq.h index e9571ddab..1e3408e88 100644 --- a/kernel/arch/x86/include/asm/hw_irq.h +++ b/kernel/arch/x86/include/asm/hw_irq.h @@ -29,6 +29,7 @@ extern asmlinkage void apic_timer_interrupt(void); extern asmlinkage void x86_platform_ipi(void); extern asmlinkage void kvm_posted_intr_ipi(void); +extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); @@ -36,43 +37,10 @@ extern asmlinkage void spurious_interrupt(void); extern asmlinkage void thermal_interrupt(void); extern asmlinkage void reschedule_interrupt(void); -extern asmlinkage void invalidate_interrupt(void); -extern asmlinkage void invalidate_interrupt0(void); -extern asmlinkage void invalidate_interrupt1(void); -extern asmlinkage void invalidate_interrupt2(void); -extern asmlinkage void invalidate_interrupt3(void); -extern asmlinkage void invalidate_interrupt4(void); -extern asmlinkage void invalidate_interrupt5(void); -extern asmlinkage void invalidate_interrupt6(void); -extern asmlinkage void invalidate_interrupt7(void); -extern asmlinkage void invalidate_interrupt8(void); -extern asmlinkage void invalidate_interrupt9(void); -extern asmlinkage void invalidate_interrupt10(void); -extern asmlinkage void invalidate_interrupt11(void); -extern asmlinkage void invalidate_interrupt12(void); -extern asmlinkage void invalidate_interrupt13(void); -extern asmlinkage void invalidate_interrupt14(void); -extern asmlinkage void invalidate_interrupt15(void); -extern asmlinkage void invalidate_interrupt16(void); -extern asmlinkage void invalidate_interrupt17(void); -extern asmlinkage void invalidate_interrupt18(void); -extern asmlinkage void invalidate_interrupt19(void); -extern asmlinkage void invalidate_interrupt20(void); -extern asmlinkage void invalidate_interrupt21(void); -extern asmlinkage void invalidate_interrupt22(void); -extern asmlinkage void invalidate_interrupt23(void); -extern asmlinkage void invalidate_interrupt24(void); -extern asmlinkage void invalidate_interrupt25(void); -extern asmlinkage void invalidate_interrupt26(void); -extern asmlinkage void invalidate_interrupt27(void); -extern asmlinkage void invalidate_interrupt28(void); -extern asmlinkage void invalidate_interrupt29(void); -extern asmlinkage void invalidate_interrupt30(void); -extern asmlinkage void invalidate_interrupt31(void); - extern asmlinkage void irq_move_cleanup_interrupt(void); extern asmlinkage void reboot_interrupt(void); extern asmlinkage void threshold_interrupt(void); +extern asmlinkage void deferred_error_interrupt(void); extern 
asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); @@ -87,60 +55,93 @@ extern void trace_spurious_interrupt(void); extern void trace_thermal_interrupt(void); extern void trace_reschedule_interrupt(void); extern void trace_threshold_interrupt(void); +extern void trace_deferred_error_interrupt(void); extern void trace_call_function_interrupt(void); extern void trace_call_function_single_interrupt(void); #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt #define trace_reboot_interrupt reboot_interrupt #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi +#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi #endif /* CONFIG_TRACING */ -#ifdef CONFIG_IRQ_REMAP -/* Intel specific interrupt remapping information */ -struct irq_2_iommu { - struct intel_iommu *iommu; - u16 irte_index; - u16 sub_handle; - u8 irte_mask; -}; - -/* AMD specific interrupt remapping information */ -struct irq_2_irte { - u16 devid; /* Device ID for IRTE table */ - u16 index; /* Index into IRTE table*/ -}; -#endif /* CONFIG_IRQ_REMAP */ - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; +struct pci_dev; +struct msi_desc; + +enum irq_alloc_type { + X86_IRQ_ALLOC_TYPE_IOAPIC = 1, + X86_IRQ_ALLOC_TYPE_HPET, + X86_IRQ_ALLOC_TYPE_MSI, + X86_IRQ_ALLOC_TYPE_MSIX, + X86_IRQ_ALLOC_TYPE_DMAR, + X86_IRQ_ALLOC_TYPE_UV, +}; -struct irq_cfg { - cpumask_var_t domain; - cpumask_var_t old_domain; - u8 vector; - u8 move_in_progress : 1; -#ifdef CONFIG_IRQ_REMAP - u8 remapped : 1; +struct irq_alloc_info { + enum irq_alloc_type type; + u32 flags; + const struct cpumask *mask; /* CPU mask for vector allocation */ union { - struct irq_2_iommu irq_2_iommu; - struct irq_2_irte irq_2_irte; - }; + int unused; +#ifdef CONFIG_HPET_TIMER + struct { + int hpet_id; + int hpet_index; + void *hpet_data; + }; #endif - union { -#ifdef CONFIG_X86_IO_APIC +#ifdef CONFIG_PCI_MSI struct { - struct list_head irq_2_pin; + struct pci_dev *msi_dev; + irq_hw_number_t msi_hwirq; + }; +#endif +#ifdef CONFIG_X86_IO_APIC + struct { + int ioapic_id; + int ioapic_pin; + int ioapic_node; + u32 ioapic_trigger : 1; + u32 ioapic_polarity : 1; + u32 ioapic_valid : 1; + struct IO_APIC_route_entry *ioapic_entry; + }; +#endif +#ifdef CONFIG_DMAR_TABLE + struct { + int dmar_id; + void *dmar_data; + }; +#endif +#ifdef CONFIG_HT_IRQ + struct { + int ht_pos; + int ht_idx; + struct pci_dev *ht_dev; + void *ht_update; + }; +#endif +#ifdef CONFIG_X86_UV + struct { + int uv_limit; + int uv_blade; + unsigned long uv_offset; + char *uv_name; }; #endif }; }; +struct irq_cfg { + unsigned int dest_apicid; + u8 vector; +}; + extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); -extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); -extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); -extern void clear_irq_vector(int irq, struct irq_cfg *cfg); extern void setup_vector_irq(int cpu); #ifdef CONFIG_SMP extern void send_cleanup_vector(struct irq_cfg *); @@ -150,10 +151,7 @@ static inline void send_cleanup_vector(struct irq_cfg *c) { } static inline void irq_complete_move(struct irq_cfg *c) { } #endif -extern int apic_retrigger_irq(struct irq_data *data); extern void apic_ack_edge(struct irq_data *data); -extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id); #else /* CONFIG_X86_LOCAL_APIC */ 
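The hw_irq.h hunk above drops the old irq_2_iommu/irq_2_irte and io_apic_irq_attr bookkeeping in favour of a single struct irq_alloc_info whose union is keyed by enum irq_alloc_type, so one descriptor can be handed down the new hierarchical irqdomains. As a rough sketch of how a caller is expected to feed it (an editor's illustration, not taken from this patch: the GSI value, trigger and polarity are made up, and the includes are approximate; init_irq_alloc_info(), ioapic_set_alloc_attr() and the three-argument mp_map_gsi_to_irq() are declared in the irqdomain.h and io_apic.h hunks further down):

	#include <linux/numa.h>
	#include <asm/io_apic.h>
	#include <asm/irqdomain.h>

	/* Map a firmware-reported GSI to a Linux IRQ via the new allocation info. */
	static int example_map_gsi(u32 gsi)
	{
		struct irq_alloc_info info;

		init_irq_alloc_info(&info, NULL);
		/* level triggered, active low - illustrative values only */
		ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 1);

		return mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
	}

The design point is that the type-tagged union travels unchanged from the caller through the vector domain to the I/O APIC, HPET or MSI domain, replacing the per-flavour setup hooks removed elsewhere in this patch.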
static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} @@ -163,8 +161,7 @@ static inline void unlock_vector_lock(void) {} extern atomic_t irq_err_count; extern atomic_t irq_mis_count; -/* EISA */ -extern void eisa_set_level_irq(unsigned int irq); +extern void elcr_set_level_irq(unsigned int irq); /* SMP */ extern __visible void smp_apic_timer_interrupt(struct pt_regs *); @@ -178,7 +175,6 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void); extern __visible void smp_reschedule_interrupt(struct pt_regs *); extern __visible void smp_call_function_interrupt(struct pt_regs *); extern __visible void smp_call_function_single_interrupt(struct pt_regs *); -extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif extern char irq_entries_start[]; @@ -186,10 +182,10 @@ extern char irq_entries_start[]; #define trace_irq_entries_start irq_entries_start #endif -#define VECTOR_UNDEFINED (-1) -#define VECTOR_RETRIGGERED (-2) +#define VECTOR_UNUSED NULL +#define VECTOR_RETRIGGERED ((void *)~0UL) -typedef int vector_irq_t[NR_VECTORS]; +typedef struct irq_desc* vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); #endif /* !ASSEMBLY_ */ diff --git a/kernel/arch/x86/include/asm/i387.h b/kernel/arch/x86/include/asm/i387.h deleted file mode 100644 index 6eb6fcb83..000000000 --- a/kernel/arch/x86/include/asm/i387.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _ASM_X86_I387_H -#define _ASM_X86_I387_H - -#ifndef __ASSEMBLY__ - -#include <linux/sched.h> -#include <linux/hardirq.h> - -struct pt_regs; -struct user_i387_struct; - -extern int init_fpu(struct task_struct *child); -extern void fpu_finit(struct fpu *fpu); -extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); -extern void math_state_restore(void); - -extern bool irq_fpu_usable(void); - -/* - * Careful: __kernel_fpu_begin/end() must be called with preempt disabled - * and they don't touch the preempt state on their own. - * If you enable preemption after __kernel_fpu_begin(), preempt notifier - * should call the __kernel_fpu_end() to prevent the kernel/user FPU - * state from getting corrupted. KVM for example uses this model. - * - * All other cases use kernel_fpu_begin/end() which disable preemption - * during kernel FPU usage. - */ -extern void __kernel_fpu_begin(void); -extern void __kernel_fpu_end(void); - -static inline void kernel_fpu_begin(void) -{ - preempt_disable(); - WARN_ON_ONCE(!irq_fpu_usable()); - __kernel_fpu_begin(); -} - -static inline void kernel_fpu_end(void) -{ - __kernel_fpu_end(); - preempt_enable(); -} - -/* Must be called with preempt disabled */ -extern void kernel_fpu_disable(void); -extern void kernel_fpu_enable(void); - -/* - * Some instructions like VIA's padlock instructions generate a spurious - * DNA fault but don't modify SSE registers. And these instructions - * get used from interrupt context as well. To prevent these kernel instructions - * in interrupt context interacting wrongly with other user/kernel fpu usage, we - * should use them only in the context of irq_ts_save/restore() - */ -static inline int irq_ts_save(void) -{ - /* - * If in process context and not atomic, we can take a spurious DNA fault. 
- * Otherwise, doing clts() in process context requires disabling preemption - * or some heavy lifting like kernel_fpu_begin() - */ - if (!in_atomic()) - return 0; - - if (read_cr0() & X86_CR0_TS) { - clts(); - return 1; - } - - return 0; -} - -static inline void irq_ts_restore(int TS_state) -{ - if (TS_state) - stts(); -} - -/* - * The question "does this thread have fpu access?" - * is slightly racy, since preemption could come in - * and revoke it immediately after the test. - * - * However, even in that very unlikely scenario, - * we can just assume we have FPU access - typically - * to save the FP state - we'll just take a #NM - * fault and get the FPU access back. - */ -static inline int user_has_fpu(void) -{ - return current->thread.fpu.has_fpu; -} - -extern void unlazy_fpu(struct task_struct *tsk); - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_I387_H */ diff --git a/kernel/arch/x86/include/asm/i8259.h b/kernel/arch/x86/include/asm/i8259.h index ccffa5375..39bcefc20 100644 --- a/kernel/arch/x86/include/asm/i8259.h +++ b/kernel/arch/x86/include/asm/i8259.h @@ -60,6 +60,7 @@ struct legacy_pic { void (*mask_all)(void); void (*restore_mask)(void); void (*init)(int auto_eoi); + int (*probe)(void); int (*irq_pending)(unsigned int irq); void (*make_irq)(unsigned int irq); }; diff --git a/kernel/arch/x86/include/asm/ia32.h b/kernel/arch/x86/include/asm/ia32.h index d0e8e0141..a9bdf5569 100644 --- a/kernel/arch/x86/include/asm/ia32.h +++ b/kernel/arch/x86/include/asm/ia32.h @@ -10,7 +10,7 @@ * 32 bit structures for IA32 support. */ -#include <asm/sigcontext32.h> +#include <uapi/asm/sigcontext.h> /* signal.h */ @@ -18,16 +18,7 @@ struct ucontext_ia32 { unsigned int uc_flags; unsigned int uc_link; compat_stack_t uc_stack; - struct sigcontext_ia32 uc_mcontext; - compat_sigset_t uc_sigmask; /* mask last for extensibility */ -}; - -struct ucontext_x32 { - unsigned int uc_flags; - unsigned int uc_link; - compat_stack_t uc_stack; - unsigned int uc__pad0; /* needed for alignment */ - struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ + struct sigcontext_32 uc_mcontext; compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; diff --git a/kernel/arch/x86/include/asm/intel_pmc_ipc.h b/kernel/arch/x86/include/asm/intel_pmc_ipc.h new file mode 100644 index 000000000..cd0310e18 --- /dev/null +++ b/kernel/arch/x86/include/asm/intel_pmc_ipc.h @@ -0,0 +1,55 @@ +#ifndef _ASM_X86_INTEL_PMC_IPC_H_ +#define _ASM_X86_INTEL_PMC_IPC_H_ + +/* Commands */ +#define PMC_IPC_PMIC_ACCESS 0xFF +#define PMC_IPC_PMIC_ACCESS_READ 0x0 +#define PMC_IPC_PMIC_ACCESS_WRITE 0x1 +#define PMC_IPC_USB_PWR_CTRL 0xF0 +#define PMC_IPC_PMIC_BLACKLIST_SEL 0xEF +#define PMC_IPC_PHY_CONFIG 0xEE +#define PMC_IPC_NORTHPEAK_CTRL 0xED +#define PMC_IPC_PM_DEBUG 0xEC +#define PMC_IPC_PMC_TELEMTRY 0xEB +#define PMC_IPC_PMC_FW_MSG_CTRL 0xEA + +/* IPC return code */ +#define IPC_ERR_NONE 0 +#define IPC_ERR_CMD_NOT_SUPPORTED 1 +#define IPC_ERR_CMD_NOT_SERVICED 2 +#define IPC_ERR_UNABLE_TO_SERVICE 3 +#define IPC_ERR_CMD_INVALID 4 +#define IPC_ERR_CMD_FAILED 5 +#define IPC_ERR_EMSECURITY 6 +#define IPC_ERR_UNSIGNEDKERNEL 7 + +#if IS_ENABLED(CONFIG_INTEL_PMC_IPC) + +int intel_pmc_ipc_simple_command(int cmd, int sub); +int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen, + u32 *out, u32 outlen, u32 dptr, u32 sptr); +int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, + u32 *out, u32 outlen); + +#else + +static inline int intel_pmc_ipc_simple_command(int cmd, int sub) +{ + return -EINVAL; +} + +static inline 
int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen, + u32 *out, u32 outlen, u32 dptr, u32 sptr) +{ + return -EINVAL; +} + +static inline int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, + u32 *out, u32 outlen) +{ + return -EINVAL; +} + +#endif /*CONFIG_INTEL_PMC_IPC*/ + +#endif diff --git a/kernel/arch/x86/include/asm/io.h b/kernel/arch/x86/include/asm/io.h index 34a5b9370..de25aad07 100644 --- a/kernel/arch/x86/include/asm/io.h +++ b/kernel/arch/x86/include/asm/io.h @@ -35,11 +35,13 @@ */ #define ARCH_HAS_IOREMAP_WC +#define ARCH_HAS_IOREMAP_WT #include <linux/string.h> #include <linux/compiler.h> #include <asm/page.h> #include <asm/early_ioremap.h> +#include <asm/pgtable_types.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -177,6 +179,9 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * look at pci_iomap(). */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); +#define ioremap_uc ioremap_uc + extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); @@ -197,8 +202,6 @@ extern void set_iounmap_nonlazy(void); #include <asm-generic/iomap.h> -#include <linux/vmalloc.h> - /* * Convert a virtual cached pointer to an uncached pointer */ @@ -320,6 +323,7 @@ extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); extern bool is_early_ioremap_ptep(pte_t *ptep); @@ -338,6 +342,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff #ifdef CONFIG_MTRR +extern int __must_check arch_phys_wc_index(int handle); +#define arch_phys_wc_index arch_phys_wc_index + extern int __must_check arch_phys_wc_add(unsigned long base, unsigned long size); extern void arch_phys_wc_del(int handle); diff --git a/kernel/arch/x86/include/asm/io_apic.h b/kernel/arch/x86/include/asm/io_apic.h index 2f91685fe..6cbf2cfb3 100644 --- a/kernel/arch/x86/include/asm/io_apic.h +++ b/kernel/arch/x86/include/asm/io_apic.h @@ -95,9 +95,22 @@ struct IR_IO_APIC_route_entry { index : 15; } __attribute__ ((packed)); -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 +struct irq_alloc_info; +struct ioapic_domain_cfg; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#define IOAPIC_MASKED 1 +#define IOAPIC_UNMASKED 0 + +#define IOAPIC_POL_HIGH 0 +#define IOAPIC_POL_LOW 1 + +#define IOAPIC_DEST_MODE_PHYSICAL 0 +#define IOAPIC_DEST_MODE_LOGICAL 1 + #define IOAPIC_MAP_ALLOC 0x1 #define IOAPIC_MAP_CHECK 0x2 @@ -110,9 +123,6 @@ extern int nr_ioapics; extern int mpc_ioapic_id(int ioapic); extern unsigned int mpc_ioapic_addr(int ioapic); -extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic); - -#define MP_MAX_IOAPIC_PIN 127 /* # of MP IRQ source entries */ extern int mp_irq_entries; @@ -120,9 +130,6 @@ extern int mp_irq_entries; /* MP IRQ source entries */ extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; -/* Older SiS APIC requires we rewrite the index register */ -extern int sis_apic_bug; - /* 1 if "noapic" boot option passed */ extern int 
skip_ioapic_setup; @@ -132,6 +139,8 @@ extern int noioapicquirk; /* -1 if "noapic" boot option passed */ extern int noioapicreroute; +extern u32 gsi_top; + extern unsigned long io_apic_irqs; #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs)) @@ -147,13 +156,6 @@ struct irq_cfg; extern void ioapic_insert_resources(void); extern int arch_early_ioapic_init(void); -extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, - unsigned int, int, - struct io_apic_irq_attr *); -extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); - -extern void native_eoi_ioapic_pin(int apic, int pin, int vector); - extern int save_ioapic_entries(void); extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); @@ -161,82 +163,32 @@ extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); -struct io_apic_irq_attr { - int ioapic; - int ioapic_pin; - int trigger; - int polarity; -}; - -enum ioapic_domain_type { - IOAPIC_DOMAIN_INVALID, - IOAPIC_DOMAIN_LEGACY, - IOAPIC_DOMAIN_STRICT, - IOAPIC_DOMAIN_DYNAMIC, -}; - -struct device_node; -struct irq_domain; -struct irq_domain_ops; - -struct ioapic_domain_cfg { - enum ioapic_domain_type type; - const struct irq_domain_ops *ops; - struct device_node *dev; -}; - -struct mp_ioapic_gsi{ - u32 gsi_base; - u32 gsi_end; -}; -extern u32 gsi_top; - extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); -extern u32 mp_pin_to_gsi(int ioapic, int pin); -extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags); +extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info); extern void mp_unmap_irq(int irq); extern int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg); extern int mp_unregister_ioapic(u32 gsi_base); extern int mp_ioapic_registered(u32 gsi_base); -extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq); -extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); -extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); -extern void __init pre_init_apic_IRQ0(void); + +extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, + int node, int trigger, int polarity); extern void mp_save_irq(struct mpc_intsrc *m); extern void disable_ioapic_support(void); -extern void __init native_io_apic_init_mappings(void); +extern void __init io_apic_init_mappings(void); extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); -extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); -extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); extern void native_disable_io_apic(void); -extern void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); -extern void intel_ir_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); -extern int native_ioapic_set_affinity(struct irq_data *, - const struct cpumask *, - bool); static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { return x86_io_apic_ops.read(apic, reg); } -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - x86_io_apic_ops.write(apic, reg, value); -} -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - x86_io_apic_ops.modify(apic, reg, value); -} - -extern void 
io_apic_eoi(unsigned int apic, unsigned int vector); - extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); extern void disable_IO_APIC(void); @@ -253,8 +205,12 @@ static inline int arch_early_ioapic_init(void) { return 0; } static inline void print_IO_APICs(void) {} #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } -static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; } -static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; } +static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info) +{ + return gsi; +} + static inline void mp_unmap_irq(int irq) { } static inline int save_ioapic_entries(void) @@ -268,17 +224,11 @@ static inline int restore_ioapic_entries(void) return -ENOMEM; } -static inline void mp_save_irq(struct mpc_intsrc *m) { }; +static inline void mp_save_irq(struct mpc_intsrc *m) { } static inline void disable_ioapic_support(void) { } -#define native_io_apic_init_mappings NULL +static inline void io_apic_init_mappings(void) { } #define native_io_apic_read NULL -#define native_io_apic_write NULL -#define native_io_apic_modify NULL #define native_disable_io_apic NULL -#define native_io_apic_print_entries NULL -#define native_ioapic_set_affinity NULL -#define native_setup_ioapic_entry NULL -#define native_eoi_ioapic_pin NULL static inline void setup_IO_APIC(void) { } static inline void enable_IO_APIC(void) { } diff --git a/kernel/arch/x86/include/asm/iosf_mbi.h b/kernel/arch/x86/include/asm/iosf_mbi.h index 57995f059..b72ad0faa 100644 --- a/kernel/arch/x86/include/asm/iosf_mbi.h +++ b/kernel/arch/x86/include/asm/iosf_mbi.h @@ -52,20 +52,20 @@ /* Quark available units */ #define QRK_MBI_UNIT_HBA 0x00 -#define QRK_MBI_UNIT_HB 0x03 +#define QRK_MBI_UNIT_HB 0x03 #define QRK_MBI_UNIT_RMU 0x04 -#define QRK_MBI_UNIT_MM 0x05 +#define QRK_MBI_UNIT_MM 0x05 #define QRK_MBI_UNIT_MMESRAM 0x05 #define QRK_MBI_UNIT_SOC 0x31 /* Quark read/write opcodes */ #define QRK_MBI_HBA_READ 0x10 #define QRK_MBI_HBA_WRITE 0x11 -#define QRK_MBI_HB_READ 0x10 +#define QRK_MBI_HB_READ 0x10 #define QRK_MBI_HB_WRITE 0x11 #define QRK_MBI_RMU_READ 0x10 #define QRK_MBI_RMU_WRITE 0x11 -#define QRK_MBI_MM_READ 0x10 +#define QRK_MBI_MM_READ 0x10 #define QRK_MBI_MM_WRITE 0x11 #define QRK_MBI_MMESRAM_READ 0x12 #define QRK_MBI_MMESRAM_WRITE 0x13 diff --git a/kernel/arch/x86/include/asm/irq.h b/kernel/arch/x86/include/asm/irq.h index a80cbb88e..e7de5c9a4 100644 --- a/kernel/arch/x86/include/asm/irq.h +++ b/kernel/arch/x86/include/asm/irq.h @@ -23,16 +23,23 @@ extern void irq_ctx_init(int cpu); #define __ARCH_HAS_DO_SOFTIRQ +struct irq_desc; + #ifdef CONFIG_HOTPLUG_CPU #include <linux/cpumask.h> extern int check_irq_vectors_for_cpu_disable(void); extern void fixup_irqs(void); -extern void irq_force_complete_move(int); +extern void irq_force_complete_move(struct irq_desc *desc); +#endif + +#ifdef CONFIG_HAVE_KVM +extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); #endif extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); -extern bool handle_irq(unsigned irq, struct pt_regs *regs); + +extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); diff --git a/kernel/arch/x86/include/asm/irq_remapping.h b/kernel/arch/x86/include/asm/irq_remapping.h index 6224d316c..a210eba27 100644 --- a/kernel/arch/x86/include/asm/irq_remapping.h +++ b/kernel/arch/x86/include/asm/irq_remapping.h @@ 
-22,84 +22,72 @@ #ifndef __X86_IRQ_REMAPPING_H #define __X86_IRQ_REMAPPING_H +#include <asm/irqdomain.h> +#include <asm/hw_irq.h> #include <asm/io_apic.h> -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_chip; struct msi_msg; -struct pci_dev; -struct irq_cfg; +struct irq_alloc_info; + +enum irq_remap_cap { + IRQ_POSTING_CAP = 0, +}; + +struct vcpu_data { + u64 pi_desc_addr; /* Physical address of PI Descriptor */ + u32 vector; /* Guest vector of the interrupt */ +}; #ifdef CONFIG_IRQ_REMAP +extern bool irq_remapping_cap(enum irq_remap_cap cap); extern void set_irq_remapping_broken(void); extern int irq_remapping_prepare(void); extern int irq_remapping_enable(void); extern void irq_remapping_disable(void); extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); -extern int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr); -extern void free_remapped_irq(int irq); -extern void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id); -extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); extern void panic_if_irq_remap(const char *msg); -extern bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip); -void irq_remap_modify_chip_defaults(struct irq_chip *chip); +extern struct irq_domain * +irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info); +extern struct irq_domain * +irq_remapping_get_irq_domain(struct irq_alloc_info *info); + +/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ +extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent); + +/* Get parent irqdomain for interrupt remapping irqdomain */ +static inline struct irq_domain *arch_get_ir_parent_domain(void) +{ + return x86_vector_domain; +} #else /* CONFIG_IRQ_REMAP */ +static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } static inline void set_irq_remapping_broken(void) { } static inline int irq_remapping_prepare(void) { return -ENODEV; } static inline int irq_remapping_enable(void) { return -ENODEV; } static inline void irq_remapping_disable(void) { } static inline int irq_remapping_reenable(int eim) { return -ENODEV; } static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } -static inline int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr) -{ - return -ENODEV; -} -static inline void free_remapped_irq(int irq) { } -static inline void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ -} -static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) -{ - return -ENODEV; -} static inline void panic_if_irq_remap(const char *msg) { } -static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) +static inline struct irq_domain * +irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) { + return NULL; } -static inline bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip) +static inline struct irq_domain * +irq_remapping_get_irq_domain(struct irq_alloc_info *info) { - return false; + return NULL; } -#endif /* CONFIG_IRQ_REMAP */ - -#define dmar_alloc_hwirq() irq_alloc_hwirq(-1) -#define dmar_free_hwirq irq_free_hwirq +#endif /* CONFIG_IRQ_REMAP */ #endif /* 
__X86_IRQ_REMAPPING_H */ diff --git a/kernel/arch/x86/include/asm/irq_vectors.h b/kernel/arch/x86/include/asm/irq_vectors.h index 666c89ec4..6ca9fd623 100644 --- a/kernel/arch/x86/include/asm/irq_vectors.h +++ b/kernel/arch/x86/include/asm/irq_vectors.h @@ -47,31 +47,12 @@ #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR #define IA32_SYSCALL_VECTOR 0x80 -#ifdef CONFIG_X86_32 -# define SYSCALL_VECTOR 0x80 -#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. * round up to the next 16-vector boundary */ -#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15) - -#define IRQ1_VECTOR (IRQ0_VECTOR + 1) -#define IRQ2_VECTOR (IRQ0_VECTOR + 2) -#define IRQ3_VECTOR (IRQ0_VECTOR + 3) -#define IRQ4_VECTOR (IRQ0_VECTOR + 4) -#define IRQ5_VECTOR (IRQ0_VECTOR + 5) -#define IRQ6_VECTOR (IRQ0_VECTOR + 6) -#define IRQ7_VECTOR (IRQ0_VECTOR + 7) -#define IRQ8_VECTOR (IRQ0_VECTOR + 8) -#define IRQ9_VECTOR (IRQ0_VECTOR + 9) -#define IRQ10_VECTOR (IRQ0_VECTOR + 10) -#define IRQ11_VECTOR (IRQ0_VECTOR + 11) -#define IRQ12_VECTOR (IRQ0_VECTOR + 12) -#define IRQ13_VECTOR (IRQ0_VECTOR + 13) -#define IRQ14_VECTOR (IRQ0_VECTOR + 14) -#define IRQ15_VECTOR (IRQ0_VECTOR + 15) +#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq) /* * Special IRQ vectors used by the SMP architecture, 0xf0-0xff @@ -102,21 +83,23 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 -/* Vector for KVM to deliver posted interrupt IPI */ -#ifdef CONFIG_HAVE_KVM -#define POSTED_INTR_VECTOR 0xf2 -#endif - +#define POSTED_INTR_WAKEUP_VECTOR 0xf1 /* * IRQ work vector: */ #define IRQ_WORK_VECTOR 0xf6 #define UV_BAU_MESSAGE 0xf5 +#define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 +/* Vector for KVM to deliver posted interrupt IPI */ +#ifdef CONFIG_HAVE_KVM +#define POSTED_INTR_VECTOR 0xf2 +#endif + /* * Local APIC timer IRQ vector is on a different priority level, * to work around the 'lost local interrupt if more than 2 IRQ @@ -134,16 +117,6 @@ #define FPU_IRQ 13 -#define FIRST_VM86_IRQ 3 -#define LAST_VM86_IRQ 15 - -#ifndef __ASSEMBLY__ -static inline int invalid_vm86_irq(int irq) -{ - return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; -} -#endif - /* * Size the maximum number of interrupts. * @@ -155,18 +128,22 @@ static inline int invalid_vm86_irq(int irq) * static arrays. */ -#define NR_IRQS_LEGACY 16 +#define NR_IRQS_LEGACY 16 -#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) +#define CPU_VECTOR_LIMIT (64 * NR_CPUS) +#define IO_APIC_VECTOR_LIMIT (32 * MAX_IO_APICS) -#ifdef CONFIG_X86_IO_APIC -# define CPU_VECTOR_LIMIT (64 * NR_CPUS) -# define NR_IRQS \ +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI) +#define NR_IRQS \ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? 
\ (NR_VECTORS + CPU_VECTOR_LIMIT) : \ (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) -#else /* !CONFIG_X86_IO_APIC: */ -# define NR_IRQS NR_IRQS_LEGACY +#elif defined(CONFIG_X86_IO_APIC) +#define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) +#elif defined(CONFIG_PCI_MSI) +#define NR_IRQS (NR_VECTORS + CPU_VECTOR_LIMIT) +#else +#define NR_IRQS NR_IRQS_LEGACY #endif #endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/kernel/arch/x86/include/asm/irqdomain.h b/kernel/arch/x86/include/asm/irqdomain.h new file mode 100644 index 000000000..d26075b52 --- /dev/null +++ b/kernel/arch/x86/include/asm/irqdomain.h @@ -0,0 +1,63 @@ +#ifndef _ASM_IRQDOMAIN_H +#define _ASM_IRQDOMAIN_H + +#include <linux/irqdomain.h> +#include <asm/hw_irq.h> + +#ifdef CONFIG_X86_LOCAL_APIC +enum { + /* Allocate contiguous CPU vectors */ + X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, +}; + +extern struct irq_domain *x86_vector_domain; + +extern void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask); +extern void copy_irq_alloc_info(struct irq_alloc_info *dst, + struct irq_alloc_info *src); +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +struct device_node; +struct irq_data; + +enum ioapic_domain_type { + IOAPIC_DOMAIN_INVALID, + IOAPIC_DOMAIN_LEGACY, + IOAPIC_DOMAIN_STRICT, + IOAPIC_DOMAIN_DYNAMIC, +}; + +struct ioapic_domain_cfg { + enum ioapic_domain_type type; + const struct irq_domain_ops *ops; + struct device_node *dev; +}; + +extern const struct irq_domain_ops mp_ioapic_irqdomain_ops; + +extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg); +extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs); +extern void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data); +extern void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data); +extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); +#endif /* CONFIG_X86_IO_APIC */ + +#ifdef CONFIG_PCI_MSI +extern void arch_init_msi_domain(struct irq_domain *domain); +#else +static inline void arch_init_msi_domain(struct irq_domain *domain) { } +#endif + +#ifdef CONFIG_HT_IRQ +extern void arch_init_htirq_domain(struct irq_domain *domain); +#else +static inline void arch_init_htirq_domain(struct irq_domain *domain) { } +#endif + +#endif diff --git a/kernel/arch/x86/include/asm/jump_label.h b/kernel/arch/x86/include/asm/jump_label.h index a4c1cf7e9..5daeca3d0 100644 --- a/kernel/arch/x86/include/asm/jump_label.h +++ b/kernel/arch/x86/include/asm/jump_label.h @@ -16,15 +16,32 @@ # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC #endif -static __always_inline bool arch_static_branch(struct static_key *key) +static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:" ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" - _ASM_PTR "1b, %l[l_yes], %c0 \n\t" + _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" ".popsection \n\t" - : : "i" (key) : : l_yes); + : : "i" (key), "i" (branch) : : l_yes); + + return false; +l_yes: + return true; +} + +static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) +{ + asm_volatile_goto("1:" + ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" + "2:\n\t" + ".pushsection __jump_table, \"aw\" \n\t" + _ASM_ALIGN "\n\t" + _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" + ".popsection \n\t" + : : "i" (key), "i" (branch) : : l_yes); + return false; l_yes: return true; 
diff --git a/kernel/arch/x86/include/asm/kasan.h b/kernel/arch/x86/include/asm/kasan.h index 74a2a8dc9..1410b567e 100644 --- a/kernel/arch/x86/include/asm/kasan.h +++ b/kernel/arch/x86/include/asm/kasan.h @@ -1,6 +1,9 @@ #ifndef _ASM_X86_KASAN_H #define _ASM_X86_KASAN_H +#include <linux/const.h> +#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) + /* * Compiler uses shadow offset assuming that addresses start * from 0. Kernel addresses don't start from 0, so shadow diff --git a/kernel/arch/x86/include/asm/kdebug.h b/kernel/arch/x86/include/asm/kdebug.h index 32ce71375..e5f5dc978 100644 --- a/kernel/arch/x86/include/asm/kdebug.h +++ b/kernel/arch/x86/include/asm/kdebug.h @@ -29,11 +29,5 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs, extern void __show_regs(struct pt_regs *regs, int all); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -#ifdef CONFIG_KEXEC -extern int in_crash_kexec; -#else -/* no crash dump is ever in progress if no crash kernel can be kexec'd */ -#define in_crash_kexec 0 -#endif #endif /* _ASM_X86_KDEBUG_H */ diff --git a/kernel/arch/x86/include/asm/kvm_emulate.h b/kernel/arch/x86/include/asm/kvm_emulate.h index 57a9d94fe..e9cd7befc 100644 --- a/kernel/arch/x86/include/asm/kvm_emulate.h +++ b/kernel/arch/x86/include/asm/kvm_emulate.h @@ -112,6 +112,16 @@ struct x86_emulate_ops { struct x86_exception *fault); /* + * read_phys: Read bytes of standard (non-emulated/special) memory. + * Used for descriptor reading. + * @addr: [IN ] Physical address from which to read. + * @val: [OUT] Value read from memory. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes); + + /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. * @addr: [IN ] Linear address to which to write. @@ -193,6 +203,8 @@ struct x86_emulate_ops { int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); + u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt); + void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); @@ -262,6 +274,11 @@ enum x86emul_mode { X86EMUL_MODE_PROT64, /* 64-bit (long) mode. 
*/ }; +/* These match some of the HF_* flags defined in kvm_host.h */ +#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ +#define X86EMUL_SMM_MASK (1 << 6) +#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7) + struct x86_emulate_ctxt { const struct x86_emulate_ops *ops; @@ -273,8 +290,8 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; + int emul_flags; - bool guest_mode; /* guest running a nested guest */ bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ diff --git a/kernel/arch/x86/include/asm/kvm_host.h b/kernel/arch/x86/include/asm/kvm_host.h index 41b06fca3..30cfd6429 100644 --- a/kernel/arch/x86/include/asm/kvm_host.h +++ b/kernel/arch/x86/include/asm/kvm_host.h @@ -24,6 +24,7 @@ #include <linux/perf_event.h> #include <linux/pvclock_gtod.h> #include <linux/clocksource.h> +#include <linux/irqbypass.h> #include <asm/pvclock-abi.h> #include <asm/desc.h> @@ -40,6 +41,7 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define KVM_HALT_POLL_NS_DEFAULT 500000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS @@ -175,6 +177,8 @@ enum { */ #define KVM_APIC_PV_EOI_PENDING 1 +struct kvm_kernel_irq_routing_entry; + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -184,23 +188,12 @@ struct kvm_mmu_memory_cache { void *objects[KVM_NR_MEM_OBJS]; }; -/* - * kvm_mmu_page_role, below, is defined as: - * - * bits 0:3 - total guest paging levels (2-4, or zero for real mode) - * bits 4:7 - page table level for this shadow (1-4) - * bits 8:9 - page table quadrant for 2-level guests - * bit 16 - direct mapping of virtual to physical mapping at gfn - * used for real mode and two-dimensional paging - * bits 17:19 - common access permissions for all ptes in this shadow page - */ union kvm_mmu_page_role { unsigned word; struct { unsigned level:4; unsigned cr4_pae:1; unsigned quadrant:2; - unsigned pad_for_nice_hex_output:6; unsigned direct:1; unsigned access:3; unsigned invalid:1; @@ -208,6 +201,15 @@ union kvm_mmu_page_role { unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; + unsigned :8; + + /* + * This is left at the top of the word so that + * kvm_memslots_for_spte_role can extract it with a + * simple shift. While there is room, give it a whole + * byte so it is also faster to load it from memory. + */ + unsigned smm:8; }; }; @@ -254,6 +256,11 @@ struct kvm_pio_request { int size; }; +struct rsvd_bits_validate { + u64 rsvd_bits_mask[2][4]; + u64 bad_mt_xwr; +}; + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). The kvm_mmu structure abstracts the details of the current mmu @@ -291,8 +298,15 @@ struct kvm_mmu { u64 *pae_root; u64 *lm_root; - u64 rsvd_bits_mask[2][4]; - u64 bad_mt_xwr; + + /* + * check zero bits on shadow page table entries, these + * bits include not only hardware reserved bits but also + * the bits spte never used. 
+ */ + struct rsvd_bits_validate shadow_zero_check; + + struct rsvd_bits_validate guest_rsvd_check; /* * Bitmap: bit set = last pte in walk @@ -338,12 +352,34 @@ struct kvm_pmu { u64 reprogram_pmi; }; +struct kvm_pmu_ops; + enum { KVM_DEBUGREG_BP_ENABLED = 1, KVM_DEBUGREG_WONT_EXIT = 2, KVM_DEBUGREG_RELOAD = 4, }; +struct kvm_mtrr_range { + u64 base; + u64 mask; + struct list_head node; +}; + +struct kvm_mtrr { + struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR]; + mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; + u64 deftype; + + struct list_head head; +}; + +/* Hyper-V per vcpu emulation context */ +struct kvm_vcpu_hv { + u64 hv_vapic; + s64 runtime_offset; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -364,10 +400,12 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + u64 eoi_exit_bitmap[4]; unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; u64 ia32_misc_enable_msr; + u64 smbase; bool tpr_access_reporting; u64 ia32_xss; @@ -467,12 +505,14 @@ struct kvm_vcpu_arch { u32 virtual_tsc_mult; u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; + u64 tsc_scaling_ratio; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ bool nmi_injected; /* Trying to inject an NMI this entry */ + bool smi_pending; /* SMI queued after currently running handler */ - struct mtrr_state_type mtrr_state; + struct kvm_mtrr mtrr_state; u64 pat; unsigned switch_db_regs; @@ -498,8 +538,7 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; - /* fields used by HYPER-V emulation */ - u64 hv_vapic; + struct kvm_vcpu_hv hyperv; cpumask_var_t wbinvd_dirty_mask; @@ -540,6 +579,9 @@ struct kvm_vcpu_arch { struct { bool pv_unhalted; } pv; + + int pending_ioapic_eoi; + int pending_external_vector; }; struct kvm_lpage_info { @@ -570,6 +612,17 @@ struct kvm_apic_map { struct kvm_lapic *logical_map[16][16]; }; +/* Hyper-V emulation context */ +struct kvm_hv { + u64 hv_guest_os_id; + u64 hv_hypercall; + u64 hv_tsc_page; + + /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ + u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; + u64 hv_crash_ctl; +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -588,6 +641,8 @@ struct kvm_arch { bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA atomic_t noncoherent_dma_count; +#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE + atomic_t assigned_device_count; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; @@ -627,16 +682,19 @@ struct kvm_arch { /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; - /* fields used by HYPER-V emulation */ - u64 hv_guest_os_id; - u64 hv_hypercall; - u64 hv_tsc_page; + struct kvm_hv hyperv; #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif bool boot_vcpu_runs_old_kvmclock; + u32 bsp_vcpu_id; + + u64 disabled_quirks; + + bool irqchip_split; + u8 nr_reserved_ioapic_pins; }; struct kvm_vm_stat { @@ -666,6 +724,7 @@ struct kvm_vcpu_stat { u32 nmi_window_exits; u32 halt_exits; u32 halt_successful_poll; + u32 halt_attempted_poll; u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; @@ -689,12 +748,13 @@ struct msr_data { struct kvm_lapic_irq { u32 vector; - u32 delivery_mode; - u32 dest_mode; - u32 level; - u32 trig_mode; + u16 delivery_mode; + u16 dest_mode; + bool level; + u16 trig_mode; u32 shorthand; u32 dest_id; + 
bool msi_redir_hint; }; struct kvm_x86_ops { @@ -706,19 +766,20 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); + bool (*cpu_has_high_real_mode_segbase)(void); void (*cpuid_update)(struct kvm_vcpu *vcpu); /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); - void (*vcpu_reset)(struct kvm_vcpu *vcpu); + void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + void (*update_bp_intercept)(struct kvm_vcpu *vcpu); + int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); void (*get_segment)(struct kvm_vcpu *vcpu, @@ -770,10 +831,10 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*vm_has_apicv)(struct kvm *kvm); + int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); - void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); @@ -784,7 +845,7 @@ struct kvm_x86_ops { int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); bool (*invpcid_supported)(void); - void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); + void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -792,11 +853,9 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); @@ -836,6 +895,22 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); + /* pmu operations of sub-arch */ + const struct kvm_pmu_ops *pmu_ops; + + /* + * Architecture specific hooks for vCPU blocking due to + * HLT instruction. + * Returns for .pre_block(): + * - 0 means continue to block the vCPU. + * - 1 means we cannot block the vCPU since some event + * happens during this period, such as, 'ON' bit in + * posted-interrupts descriptor is set. 
+ */ + int (*pre_block)(struct kvm_vcpu *vcpu); + void (*post_block)(struct kvm_vcpu *vcpu); + int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); }; struct kvm_arch_async_pf { @@ -847,17 +922,6 @@ struct kvm_arch_async_pf { extern struct kvm_x86_ops *kvm_x86_ops; -static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, - s64 adjustment) -{ - kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false); -} - -static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) -{ - kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true); -} - int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -871,7 +935,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *memslot); + const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, @@ -882,7 +946,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); @@ -890,7 +954,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); -u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); struct kvm_irq_mask_notifier { void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); @@ -911,10 +974,12 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); /* control of guest tsc rate supported? 
*/ extern bool kvm_has_tsc_control; -/* minimum supported tsc_khz for guests */ -extern u32 kvm_min_guest_tsc_khz; /* maximum supported tsc_khz for guests */ extern u32 kvm_max_guest_tsc_khz; +/* number of bits of the fractional part of the TSC scaling ratio */ +extern u8 kvm_tsc_scaling_ratio_frac_bits; +/* maximum allowed value of TSC scaling ratio */ +extern u64 kvm_max_tsc_scaling_ratio; enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -938,7 +1003,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); +int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; @@ -967,7 +1032,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); @@ -1002,8 +1067,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -int fx_init(struct kvm_vcpu *vcpu); - void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); @@ -1112,6 +1175,14 @@ enum { #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ +#define HF_SMM_MASK (1 << 6) +#define HF_SMM_INSIDE_NMI_MASK (1 << 7) + +#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +#define KVM_ADDRESS_SPACE_NUM 2 + +#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 
1 : 0) +#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) /* * Hardware virtualization extension instructions may fault if a @@ -1146,7 +1217,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); -void kvm_vcpu_reset(struct kvm_vcpu *vcpu); +void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, unsigned long address); @@ -1154,6 +1225,9 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); +u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); + unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); @@ -1170,16 +1244,18 @@ void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); -void kvm_pmu_init(struct kvm_vcpu *vcpu); -void kvm_pmu_destroy(struct kvm_vcpu *vcpu); -void kvm_pmu_reset(struct kvm_vcpu *vcpu); -void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); -bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); -int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); -int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc); -int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); -void kvm_deliver_pmi(struct kvm_vcpu *vcpu); +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); +int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); +bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); +bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); + +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); + +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq); + +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/kernel/arch/x86/include/asm/livepatch.h b/kernel/arch/x86/include/asm/livepatch.h index 2d29197bd..19c099afa 100644 --- a/kernel/arch/x86/include/asm/livepatch.h +++ b/kernel/arch/x86/include/asm/livepatch.h @@ -21,6 +21,7 @@ #ifndef _ASM_X86_LIVEPATCH_H #define _ASM_X86_LIVEPATCH_H +#include <asm/setup.h> #include <linux/module.h> #include <linux/ftrace.h> diff --git a/kernel/arch/x86/include/asm/math_emu.h b/kernel/arch/x86/include/asm/math_emu.h index 031f6266f..0d9b14f60 100644 --- a/kernel/arch/x86/include/asm/math_emu.h +++ b/kernel/arch/x86/include/asm/math_emu.h @@ -2,7 +2,6 @@ #define _ASM_X86_MATH_EMU_H #include <asm/ptrace.h> -#include <asm/vm86.h> /* This structure matches the layout of the data saved to the stack following a device-not-present interrupt, part of it saved @@ -10,9 +9,6 @@ */ struct math_emu_info { long ___orig_eip; - union { - struct pt_regs *regs; - struct kernel_vm86_regs *vm86; - }; + struct pt_regs *regs; }; #endif /* _ASM_X86_MATH_EMU_H */ diff --git a/kernel/arch/x86/include/asm/mce.h b/kernel/arch/x86/include/asm/mce.h index 1f5a86d51..2ea4527e4 100644 --- 
a/kernel/arch/x86/include/asm/mce.h +++ b/kernel/arch/x86/include/asm/mce.h @@ -17,11 +17,16 @@ #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) #define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ #define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */ +#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */ /* MCG_STATUS register defines */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ +#define MCG_STATUS_LMCES (1ULL<<3) /* LMCE signaled */ + +/* MCG_EXT_CTL register defines */ +#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */ /* MCi_STATUS register defines */ #define MCI_STATUS_VAL (1ULL<<63) /* valid error */ @@ -104,6 +109,7 @@ struct mce_log { struct mca_config { bool dont_log_ce; bool cmci_disabled; + bool lmce_disabled; bool ignore_ce; bool disabled; bool ser; @@ -117,8 +123,27 @@ struct mca_config { }; struct mce_vendor_flags { - __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ - __reserved_0 : 63; + /* + * Indicates that overflow conditions are not fatal, when set. + */ + __u64 overflow_recov : 1, + + /* + * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and + * Recovery. It indicates support for data poisoning in HW and deferred + * error interrupts. + */ + succor : 1, + + /* + * (AMD) SMCA: This bit indicates support for Scalable MCA which expands + * the register space for each MCA bank and also increases number of + * banks. Also, to accommodate the new banks and registers, the MCA + * register space is moved to a new MSR range. + */ + smca : 1, + + __reserved_0 : 61; }; extern struct mce_vendor_flags mce_flags; @@ -134,10 +159,12 @@ extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); +void mcheck_cpu_clear(struct cpuinfo_x86 *c); void mcheck_vendor_init_severity(void); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} +static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} static inline void mcheck_vendor_init_severity(void) {} #endif @@ -164,12 +191,14 @@ DECLARE_PER_CPU(struct device *, mce_device); #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); +void mce_intel_feature_clear(struct cpuinfo_x86 *c); void cmci_clear(void); void cmci_reenable(void); void cmci_rediscover(void); void cmci_recheck(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } +static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { } static inline void cmci_clear(void) {} static inline void cmci_reenable(void) {} static inline void cmci_rediscover(void) {} @@ -223,6 +252,9 @@ void do_machine_check(struct pt_regs *, long); extern void (*mce_threshold_vector)(void); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +/* Deferred error interrupt handler */ +extern void (*deferred_error_int_vector)(void); + /* * Thermal handler */ diff --git a/kernel/arch/x86/include/asm/microcode.h b/kernel/arch/x86/include/asm/microcode.h index 2fb20d6f7..34e62b1dc 100644 --- a/kernel/arch/x86/include/asm/microcode.h +++ b/kernel/arch/x86/include/asm/microcode.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H +#include <linux/earlycpio.h> + #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = native_read_msr((msr)); \ @@ -25,7 +27,6 @@ 
struct cpu_signature { struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; -extern bool dis_ucode_ldr; struct microcode_ops { enum ucode_state (*request_microcode_user) (int cpu, @@ -53,6 +54,12 @@ struct ucode_cpu_info { }; extern struct ucode_cpu_info ucode_cpu_info[]; +#ifdef CONFIG_MICROCODE +int __init microcode_init(void); +#else +static inline int __init microcode_init(void) { return 0; }; +#endif + #ifdef CONFIG_MICROCODE_INTEL extern struct microcode_ops * __init init_intel_microcode(void); #else @@ -73,7 +80,6 @@ static inline struct microcode_ops * __init init_amd_microcode(void) static inline void __exit exit_amd_microcode(void) {} #endif -#ifdef CONFIG_MICROCODE_EARLY #define MAX_UCODE_COUNT 128 #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) @@ -148,18 +154,18 @@ static inline unsigned int x86_model(unsigned int sig) return model; } +#ifdef CONFIG_MICROCODE extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); extern int __init save_microcode_in_initrd(void); void reload_early_microcode(void); +extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); #else -static inline void __init load_ucode_bsp(void) {} -static inline void load_ucode_ap(void) {} -static inline int __init save_microcode_in_initrd(void) -{ - return 0; -} -static inline void reload_early_microcode(void) {} +static inline void __init load_ucode_bsp(void) { } +static inline void load_ucode_ap(void) { } +static inline int __init save_microcode_in_initrd(void) { return 0; } +static inline void reload_early_microcode(void) { } +static inline bool +get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif - #endif /* _ASM_X86_MICROCODE_H */ diff --git a/kernel/arch/x86/include/asm/microcode_amd.h b/kernel/arch/x86/include/asm/microcode_amd.h index af935397e..adfc847a3 100644 --- a/kernel/arch/x86/include/asm/microcode_amd.h +++ b/kernel/arch/x86/include/asm/microcode_amd.h @@ -64,16 +64,17 @@ extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, s #define PATCH_MAX_SIZE PAGE_SIZE extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; -#ifdef CONFIG_MICROCODE_AMD_EARLY -extern void __init load_ucode_amd_bsp(void); +#ifdef CONFIG_MICROCODE_AMD +extern void __init load_ucode_amd_bsp(unsigned int family); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); void reload_ucode_amd(void); #else -static inline void __init load_ucode_amd_bsp(void) {} +static inline void __init load_ucode_amd_bsp(unsigned int family) {} static inline void load_ucode_amd_ap(void) {} static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } void reload_ucode_amd(void) {} #endif +extern bool check_current_patch_level(u32 *rev, bool early); #endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/kernel/arch/x86/include/asm/microcode_intel.h b/kernel/arch/x86/include/asm/microcode_intel.h index 2b9209c46..8559b0102 100644 --- a/kernel/arch/x86/include/asm/microcode_intel.h +++ b/kernel/arch/x86/include/asm/microcode_intel.h @@ -51,22 +51,13 @@ struct extended_sigtable { (((struct microcode_intel *)mc)->hdr.datasize ? 
\ ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) -#define sigmatch(s1, s2, p1, p2) \ - (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) - #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc); +extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); extern int microcode_sanity_check(void *mc, int print_err); -extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc); - -static inline int -revision_is_newer(struct microcode_header_intel *mc_header, int rev) -{ - return (mc_header->rev <= rev) ? 0 : 1; -} +extern int find_matching_signature(void *mc, unsigned int csig, int cpf); -#ifdef CONFIG_MICROCODE_INTEL_EARLY +#ifdef CONFIG_MICROCODE_INTEL extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); extern void show_ucode_info_early(void); @@ -80,13 +71,9 @@ static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; static inline void reload_ucode_intel(void) {} #endif -#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) +#ifdef CONFIG_HOTPLUG_CPU extern int save_mc_for_early(u8 *mc); #else -static inline int save_mc_for_early(u8 *mc) -{ - return 0; -} +static inline int save_mc_for_early(u8 *mc) { return 0; } #endif - #endif /* _ASM_X86_MICROCODE_INTEL_H */ diff --git a/kernel/arch/x86/include/asm/mmu.h b/kernel/arch/x86/include/asm/mmu.h index 364d27481..55234d5e7 100644 --- a/kernel/arch/x86/include/asm/mmu.h +++ b/kernel/arch/x86/include/asm/mmu.h @@ -9,7 +9,9 @@ * we put the segment information here. */ typedef struct { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; +#endif #ifdef CONFIG_X86_64 /* True if mm supports a task running in 32 bit compatibility mode. */ diff --git a/kernel/arch/x86/include/asm/mmu_context.h b/kernel/arch/x86/include/asm/mmu_context.h index 80d67dd80..bfd9b2a35 100644 --- a/kernel/arch/x86/include/asm/mmu_context.h +++ b/kernel/arch/x86/include/asm/mmu_context.h @@ -33,6 +33,7 @@ static inline void load_mm_cr4(struct mm_struct *mm) static inline void load_mm_cr4(struct mm_struct *mm) {} #endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL /* * ldt_structs can be allocated, used, and freed, but they are never * modified while live. @@ -48,8 +49,23 @@ struct ldt_struct { int size; }; +/* + * Used for LDT copy/destruction. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); +#else /* CONFIG_MODIFY_LDT_SYSCALL */ +static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) +{ + return 0; +} +static inline void destroy_context(struct mm_struct *mm) {} +#endif + static inline void load_mm_ldt(struct mm_struct *mm) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* lockless_dereference synchronizes with smp_store_release */ @@ -73,17 +89,13 @@ static inline void load_mm_ldt(struct mm_struct *mm) set_ldt(ldt->entries, ldt->size); else clear_LDT(); +#else + clear_LDT(); +#endif DEBUG_LOCKS_WARN_ON(preemptible()); } -/* - * Used for LDT copy/destruction. 
- */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void destroy_context(struct mm_struct *mm); - - static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP @@ -104,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, #endif cpumask_set_cpu(cpu, mm_cpumask(next)); - /* Re-load page tables */ + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); /* Stop flush ipis for the previous mm */ @@ -114,6 +154,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Load per-mm CR4 state */ load_mm_cr4(next); +#ifdef CONFIG_MODIFY_LDT_SYSCALL /* * Load the LDT, if the LDT is different. * @@ -128,6 +169,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, */ if (unlikely(prev->context.ldt != next->context.ldt)) load_mm_ldt(next); +#endif } #ifdef CONFIG_SMP else { @@ -142,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, * schedule, protecting us from simultaneous changes. */ cpumask_set_cpu(cpu, mm_cpumask(next)); + /* * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. */ load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); @@ -186,6 +232,19 @@ static inline void arch_exit_mmap(struct mm_struct *mm) paravirt_arch_exit_mmap(mm); } +#ifdef CONFIG_X86_64 +static inline bool is_64bit_mm(struct mm_struct *mm) +{ + return !config_enabled(CONFIG_IA32_EMULATION) || + !(mm->context.ia32_compat == TIF_IA32); +} +#else +static inline bool is_64bit_mm(struct mm_struct *mm) +{ + return false; +} +#endif + static inline void arch_bprm_mm_init(struct mm_struct *mm, struct vm_area_struct *vma) { diff --git a/kernel/arch/x86/include/asm/mpx.h b/kernel/arch/x86/include/asm/mpx.h index a952a13d5..7a3549527 100644 --- a/kernel/arch/x86/include/asm/mpx.h +++ b/kernel/arch/x86/include/asm/mpx.h @@ -13,55 +13,50 @@ #define MPX_BNDCFG_ENABLE_FLAG 0x1 #define MPX_BD_ENTRY_VALID_FLAG 0x1 -#ifdef CONFIG_X86_64 - -/* upper 28 bits [47:20] of the virtual address in 64-bit used to - * index into bounds directory (BD). 
- */ -#define MPX_BD_ENTRY_OFFSET 28 -#define MPX_BD_ENTRY_SHIFT 3 -/* bits [19:3] of the virtual address in 64-bit used to index into - * bounds table (BT). +/* + * The upper 28 bits [47:20] of the virtual address in 64-bit + * are used to index into bounds directory (BD). + * + * The directory is 2G (2^31) in size, and with 8-byte entries + * it has 2^28 entries. */ -#define MPX_BT_ENTRY_OFFSET 17 -#define MPX_BT_ENTRY_SHIFT 5 -#define MPX_IGN_BITS 3 -#define MPX_BD_ENTRY_TAIL 3 +#define MPX_BD_SIZE_BYTES_64 (1UL<<31) +#define MPX_BD_ENTRY_BYTES_64 8 +#define MPX_BD_NR_ENTRIES_64 (MPX_BD_SIZE_BYTES_64/MPX_BD_ENTRY_BYTES_64) -#else - -#define MPX_BD_ENTRY_OFFSET 20 -#define MPX_BD_ENTRY_SHIFT 2 -#define MPX_BT_ENTRY_OFFSET 10 -#define MPX_BT_ENTRY_SHIFT 4 -#define MPX_IGN_BITS 2 -#define MPX_BD_ENTRY_TAIL 2 +/* + * The 32-bit directory is 4MB (2^22) in size, and with 4-byte + * entries it has 2^20 entries. + */ +#define MPX_BD_SIZE_BYTES_32 (1UL<<22) +#define MPX_BD_ENTRY_BYTES_32 4 +#define MPX_BD_NR_ENTRIES_32 (MPX_BD_SIZE_BYTES_32/MPX_BD_ENTRY_BYTES_32) -#endif +/* + * A 64-bit table is 4MB total in size, and an entry is + * 4 64-bit pointers in size. + */ +#define MPX_BT_SIZE_BYTES_64 (1UL<<22) +#define MPX_BT_ENTRY_BYTES_64 32 +#define MPX_BT_NR_ENTRIES_64 (MPX_BT_SIZE_BYTES_64/MPX_BT_ENTRY_BYTES_64) -#define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT)) -#define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT)) +/* + * A 32-bit table is 16kB total in size, and an entry is + * 4 32-bit pointers in size. + */ +#define MPX_BT_SIZE_BYTES_32 (1UL<<14) +#define MPX_BT_ENTRY_BYTES_32 16 +#define MPX_BT_NR_ENTRIES_32 (MPX_BT_SIZE_BYTES_32/MPX_BT_ENTRY_BYTES_32) #define MPX_BNDSTA_TAIL 2 #define MPX_BNDCFG_TAIL 12 #define MPX_BNDSTA_ADDR_MASK (~((1UL<<MPX_BNDSTA_TAIL)-1)) #define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) -#define MPX_BT_ADDR_MASK (~((1UL<<MPX_BD_ENTRY_TAIL)-1)) - -#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) #define MPX_BNDSTA_ERROR_CODE 0x3 -#define MPX_BD_ENTRY_MASK ((1<<MPX_BD_ENTRY_OFFSET)-1) -#define MPX_BT_ENTRY_MASK ((1<<MPX_BT_ENTRY_OFFSET)-1) -#define MPX_GET_BD_ENTRY_OFFSET(addr) ((((addr)>>(MPX_BT_ENTRY_OFFSET+ \ - MPX_IGN_BITS)) & MPX_BD_ENTRY_MASK) << MPX_BD_ENTRY_SHIFT) -#define MPX_GET_BT_ENTRY_OFFSET(addr) ((((addr)>>MPX_IGN_BITS) & \ - MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT) - #ifdef CONFIG_X86_INTEL_MPX -siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xsave_struct *xsave_buf); -int mpx_handle_bd_fault(struct xsave_struct *xsave_buf); +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs); +int mpx_handle_bd_fault(void); static inline int kernel_managing_mpx_tables(struct mm_struct *mm) { return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); @@ -77,12 +72,11 @@ static inline void mpx_mm_init(struct mm_struct *mm) void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); #else -static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xsave_struct *xsave_buf) +static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { return NULL; } -static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +static inline int mpx_handle_bd_fault(void) { return -EINVAL; } diff --git a/kernel/arch/x86/include/asm/mshyperv.h b/kernel/arch/x86/include/asm/mshyperv.h index c163215ab..aaf59b7da 100644 --- a/kernel/arch/x86/include/asm/mshyperv.h +++ b/kernel/arch/x86/include/asm/mshyperv.h @@ -7,6 +7,7 @@ struct 
ms_hyperv_info { u32 features; + u32 misc_features; u32 hints; }; @@ -20,4 +21,8 @@ void hyperv_vector_handler(struct pt_regs *regs); void hv_setup_vmbus_irq(void (*handler)(void)); void hv_remove_vmbus_irq(void); +void hv_setup_kexec_handler(void (*handler)(void)); +void hv_remove_kexec_handler(void); +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); +void hv_remove_crash_handler(void); #endif diff --git a/kernel/arch/x86/include/asm/msi.h b/kernel/arch/x86/include/asm/msi.h new file mode 100644 index 000000000..93724cc62 --- /dev/null +++ b/kernel/arch/x86/include/asm/msi.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_MSI_H +#define _ASM_X86_MSI_H +#include <asm/hw_irq.h> + +typedef struct irq_alloc_info msi_alloc_info_t; + +#endif /* _ASM_X86_MSI_H */ diff --git a/kernel/arch/x86/include/uapi/asm/msr-index.h b/kernel/arch/x86/include/asm/msr-index.h index 3c6bb342a..690b4027e 100644 --- a/kernel/arch/x86/include/uapi/asm/msr-index.h +++ b/kernel/arch/x86/include/asm/msr-index.h @@ -35,7 +35,7 @@ #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd -#define MSR_NHM_PLATFORM_INFO 0x000000ce +#define MSR_PLATFORM_INFO 0x000000ce #define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) @@ -44,7 +44,6 @@ #define SNB_C1_AUTO_UNDEMOTE (1UL << 27) #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) -#define MSR_PLATFORM_INFO 0x000000ce #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e @@ -56,6 +55,7 @@ #define MSR_IA32_MCG_CAP 0x00000179 #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_IA32_MCG_EXT_CTL 0x000004d0 #define MSR_OFFCORE_RSP_0 0x000001a6 #define MSR_OFFCORE_RSP_1 0x000001a7 @@ -72,6 +72,12 @@ #define MSR_LBR_CORE_FROM 0x00000040 #define MSR_LBR_CORE_TO 0x00000060 +#define MSR_LBR_INFO_0 0x00000dc0 /* ... 
0xddf for _31 */ +#define LBR_INFO_MISPRED BIT_ULL(63) +#define LBR_INFO_IN_TX BIT_ULL(62) +#define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYCLES 0xffff + #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 @@ -79,13 +85,21 @@ #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) #define RTIT_CTL_OS BIT(2) #define RTIT_CTL_USR BIT(3) #define RTIT_CTL_CR3EN BIT(7) #define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_EN BIT(9) #define RTIT_CTL_TSC_EN BIT(10) #define RTIT_CTL_DISRETC BIT(11) #define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) #define MSR_IA32_RTIT_STATUS 0x00000571 #define RTIT_STATUS_CONTEXTEN BIT(1) #define RTIT_STATUS_TRIGGEREN BIT(2) @@ -126,6 +140,8 @@ #define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define MSR_PEBS_FRONTEND 0x000003f7 + #define MSR_IA32_POWER_CTL 0x000001fc #define MSR_IA32_MC0_CTL 0x00000400 @@ -169,6 +185,12 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL_1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + #define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 #define MSR_PKG_ANY_CORE_C0_RES 0x00000659 #define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A @@ -183,6 +205,13 @@ #define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 #define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 +/* Config TDP MSRs */ +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + /* Hardware P state interface */ #define MSR_PPERF 0x0000064e #define MSR_PERF_LIMIT_REASONS 0x0000064f @@ -310,6 +339,7 @@ /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 #define MSR_K8_TSEG_ADDR 0xc0010112 +#define MSR_K8_TSEG_MASK 0xc0010113 #define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */ #define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */ #define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */ @@ -380,6 +410,7 @@ #define FEATURE_CONTROL_LOCKED (1<<0) #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) +#define FEATURE_CONTROL_LMCE (1<<20) #define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE_BSP (1<<8) diff --git a/kernel/arch/x86/include/asm/msr.h b/kernel/arch/x86/include/asm/msr.h index de36f22eb..77d8b284e 100644 --- a/kernel/arch/x86/include/asm/msr.h +++ b/kernel/arch/x86/include/asm/msr.h @@ -1,13 +1,14 @@ #ifndef _ASM_X86_MSR_H #define _ASM_X86_MSR_H -#include <uapi/asm/msr.h> +#include "msr-index.h" #ifndef __ASSEMBLY__ #include <asm/asm.h> #include <asm/errno.h> #include <asm/cpumask.h> +#include <uapi/asm/msr.h> struct msr { union { @@ -46,14 +47,13 @@ static inline unsigned long long native_read_tscp(unsigned int *aux) * it means rax *or* rdx. 
*/ #ifdef CONFIG_X86_64 -#define DECLARE_ARGS(val, low, high) unsigned low, high -#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32)) -#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high) +/* Using 64-bit values saves one instruction clearing the high half of low */ +#define DECLARE_ARGS(val, low, high) unsigned long low, high +#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) #define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) #else #define DECLARE_ARGS(val, low, high) unsigned long long val #define EAX_EDX_VAL(val, low, high) (val) -#define EAX_EDX_ARGS(val, low, high) "A" (val) #define EAX_EDX_RET(val, low, high) "=A" (val) #endif @@ -105,12 +105,19 @@ notrace static inline int native_write_msr_safe(unsigned int msr, return err; } -extern unsigned long long native_read_tsc(void); - extern int rdmsr_safe_regs(u32 regs[8]); extern int wrmsr_safe_regs(u32 regs[8]); -static __always_inline unsigned long long __native_read_tsc(void) +/** + * rdtsc() - returns the current TSC without ordering constraints + * + * rdtsc() returns the result of RDTSC as a 64-bit integer. The + * only ordering constraint it supplies is the ordering implied by + * "asm volatile": it will put the RDTSC in the place you expect. The + * CPU can and will speculatively execute that RDTSC, though, so the + * results can be non-monotonic if compared on different CPUs. + */ +static __always_inline unsigned long long rdtsc(void) { DECLARE_ARGS(val, low, high); @@ -119,6 +126,35 @@ static __always_inline unsigned long long __native_read_tsc(void) return EAX_EDX_VAL(val, low, high); } +/** + * rdtsc_ordered() - read the current TSC in program order + * + * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. + * It is ordered like a load to a global in-memory counter. It should + * be impossible to observe non-monotonic rdtsc_unordered() behavior + * across multiple CPUs as long as the TSC is synced. + */ +static __always_inline unsigned long long rdtsc_ordered(void) +{ + /* + * The RDTSC instruction is not ordered relative to memory + * access. The Intel SDM and the AMD APM are both vague on this + * point, but empirically an RDTSC instruction can be + * speculatively executed before prior loads. An RDTSC + * immediately after an appropriate barrier appears to be + * ordered as a normal load, that is, it provides the same + * ordering guarantees as reading from a global memory location + * that some other imaginary CPU is updating continuously with a + * time stamp. 
+ */ + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); + return rdtsc(); +} + +/* Deprecated, keep it for a cycle for easier merging: */ +#define rdtscll(now) do { (now) = rdtsc_ordered(); } while (0) + static inline unsigned long long native_read_pmc(int counter) { DECLARE_ARGS(val, low, high); @@ -152,8 +188,10 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high) #define rdmsrl(msr, val) \ ((val) = native_read_msr((msr))) -#define wrmsrl(msr, val) \ - native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32)) +static inline void wrmsrl(unsigned msr, u64 val) +{ + native_write_msr(msr, (u32)val, (u32)(val >> 32)); +} /* wrmsr with exception handling */ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) @@ -179,12 +217,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -#define rdtscl(low) \ - ((low) = (u32)__native_read_tsc()) - -#define rdtscll(val) \ - ((val) = __native_read_tsc()) - #define rdpmc(counter, low, high) \ do { \ u64 _l = native_read_pmc((counter)); \ @@ -194,19 +226,15 @@ do { \ #define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) -#define rdtscp(low, high, aux) \ -do { \ - unsigned long long _val = native_read_tscp(&(aux)); \ - (low) = (u32)_val; \ - (high) = (u32)(_val >> 32); \ -} while (0) - -#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux)) - #endif /* !CONFIG_PARAVIRT */ -#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \ - (u32)((val) >> 32)) +/* + * 64-bit version of wrmsr_safe(): + */ +static inline int wrmsrl_safe(u32 msr, u64 val) +{ + return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); +} #define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high)) diff --git a/kernel/arch/x86/include/asm/mtrr.h b/kernel/arch/x86/include/asm/mtrr.h index f768f6298..b94f6f64e 100644 --- a/kernel/arch/x86/include/asm/mtrr.h +++ b/kernel/arch/x86/include/asm/mtrr.h @@ -31,7 +31,7 @@ * arch_phys_wc_add and arch_phys_wc_del. 
*/ # ifdef CONFIG_MTRR -extern u8 mtrr_type_lookup(u64 addr, u64 end); +extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); extern int mtrr_add(unsigned long base, unsigned long size, @@ -48,14 +48,13 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); -extern int phys_wc_to_mtrr_index(int handle); # else -static inline u8 mtrr_type_lookup(u64 addr, u64 end) +static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { /* * Return no-MTRRs: */ - return 0xff; + return MTRR_TYPE_INVALID; } #define mtrr_save_fixed_ranges(arg) do {} while (0) #define mtrr_save_state() do {} while (0) @@ -84,10 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } -static inline int phys_wc_to_mtrr_index(int handle) -{ - return -1; -} #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) @@ -127,4 +122,8 @@ struct mtrr_gentry32 { _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32) #endif /* CONFIG_COMPAT */ +/* Bit fields for enabled in struct mtrr_state_type */ +#define MTRR_STATE_MTRR_FIXED_ENABLED 0x01 +#define MTRR_STATE_MTRR_ENABLED 0x02 + #endif /* _ASM_X86_MTRR_H */ diff --git a/kernel/arch/x86/include/asm/mwait.h b/kernel/arch/x86/include/asm/mwait.h index 653dfa766..c70689b5e 100644 --- a/kernel/arch/x86/include/asm/mwait.h +++ b/kernel/arch/x86/include/asm/mwait.h @@ -14,6 +14,9 @@ #define CPUID5_ECX_INTERRUPT_BREAK 0x2 #define MWAIT_ECX_INTERRUPT_BREAK 0x1 +#define MWAITX_ECX_TIMER_ENABLE BIT(1) +#define MWAITX_MAX_LOOPS ((u32)-1) +#define MWAITX_DISABLE_CSTATES 0xf static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) @@ -23,6 +26,14 @@ static inline void __monitor(const void *eax, unsigned long ecx, :: "a" (eax), "c" (ecx), "d"(edx)); } +static inline void __monitorx(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitorx %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xfa;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + static inline void __mwait(unsigned long eax, unsigned long ecx) { /* "mwait %eax, %ecx;" */ @@ -30,6 +41,40 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) :: "a" (eax), "c" (ecx)); } +/* + * MWAITX allows for a timer expiration to get the core out a wait state in + * addition to the default MWAIT exit condition of a store appearing at a + * monitored virtual address. + * + * Registers: + * + * MWAITX ECX[1]: enable timer if set + * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0 + * frequency is the same as the TSC frequency. 
+ * + * Below is a comparison between MWAIT and MWAITX on AMD processors: + * + * MWAIT MWAITX + * opcode 0f 01 c9 | 0f 01 fb + * ECX[0] value of RFLAGS.IF seen by instruction + * ECX[1] unused/#GP if set | enable timer if set + * ECX[31:2] unused/#GP if set + * EAX unused (reserve for hint) + * EBX[31:0] unused | max wait time (P0 clocks) + * + * MONITOR MONITORX + * opcode 0f 01 c8 | 0f 01 fa + * EAX (logical) address to monitor + * ECX #GP if not zero + */ +static inline void __mwaitx(unsigned long eax, unsigned long ebx, + unsigned long ecx) +{ + /* "mwaitx %eax, %ebx, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xfb;" + :: "a" (eax), "b" (ebx), "c" (ecx)); +} + static inline void __sti_mwait(unsigned long eax, unsigned long ecx) { trace_hardirqs_on(); diff --git a/kernel/arch/x86/include/asm/numachip/numachip.h b/kernel/arch/x86/include/asm/numachip/numachip.h index 1c6f7f621..c64373a2d 100644 --- a/kernel/arch/x86/include/asm/numachip/numachip.h +++ b/kernel/arch/x86/include/asm/numachip/numachip.h @@ -14,6 +14,7 @@ #ifndef _ASM_X86_NUMACHIP_NUMACHIP_H #define _ASM_X86_NUMACHIP_NUMACHIP_H +extern u8 numachip_system; extern int __init pci_numachip_init(void); #endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */ diff --git a/kernel/arch/x86/include/asm/numachip/numachip_csr.h b/kernel/arch/x86/include/asm/numachip/numachip_csr.h index 660f843df..29719eecd 100644 --- a/kernel/arch/x86/include/asm/numachip/numachip_csr.h +++ b/kernel/arch/x86/include/asm/numachip/numachip_csr.h @@ -14,12 +14,8 @@ #ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H #define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H -#include <linux/numa.h> -#include <linux/percpu.h> +#include <linux/smp.h> #include <linux/io.h> -#include <linux/swab.h> -#include <asm/types.h> -#include <asm/processor.h> #define CSR_NODE_SHIFT 16 #define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT) @@ -27,11 +23,8 @@ /* 32K CSR space, b15 indicates geo/non-geo */ #define CSR_OFFSET_MASK 0x7fffUL - -/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */ -#define NUMACHIP_GCSR_BASE 0x3fff00000000ULL -#define NUMACHIP_GCSR_LIM 0x3fff0fffffffULL -#define NUMACHIP_GCSR_SIZE (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1) +#define CSR_G0_NODE_IDS (0x008 + (0 << 12)) +#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12)) /* * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however @@ -41,12 +34,7 @@ #define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL #define NUMACHIP_LCSR_LIM 0x3fffffffffffULL #define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1) - -static inline void *gcsr_address(int node, unsigned long offset) -{ - return __va(NUMACHIP_GCSR_BASE | (1UL << 15) | - CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK)); -} +#define NUMACHIP_LAPIC_BITS 8 static inline void *lcsr_address(unsigned long offset) { @@ -54,114 +42,57 @@ static inline void *lcsr_address(unsigned long offset) CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK)); } -static inline unsigned int read_gcsr(int node, unsigned long offset) +static inline unsigned int read_lcsr(unsigned long offset) { - return swab32(readl(gcsr_address(node, offset))); + return swab32(readl(lcsr_address(offset))); } -static inline void write_gcsr(int node, unsigned long offset, unsigned int val) +static inline void write_lcsr(unsigned long offset, unsigned int val) { - writel(swab32(val), gcsr_address(node, offset)); + writel(swab32(val), lcsr_address(offset)); } -static inline unsigned int read_lcsr(unsigned long offset) +/* + * On NumaChip2, local CSR space 
is 16MB and starts at fixed offset below 4G + */ + +#define NUMACHIP2_LCSR_BASE 0xf0000000UL +#define NUMACHIP2_LCSR_SIZE 0x1000000UL +#define NUMACHIP2_APIC_ICR 0x100000 +#define NUMACHIP2_TIMER_DEADLINE 0x200000 +#define NUMACHIP2_TIMER_INT 0x200008 +#define NUMACHIP2_TIMER_NOW 0x200018 +#define NUMACHIP2_TIMER_RESET 0x200020 + +static inline void __iomem *numachip2_lcsr_address(unsigned long offset) { - return swab32(readl(lcsr_address(offset))); + return (void __iomem *)__va(NUMACHIP2_LCSR_BASE | + (offset & (NUMACHIP2_LCSR_SIZE - 1))); } -static inline void write_lcsr(unsigned long offset, unsigned int val) +static inline u32 numachip2_read32_lcsr(unsigned long offset) { - writel(swab32(val), lcsr_address(offset)); + return readl(numachip2_lcsr_address(offset)); } -/* ========================================================================= */ -/* CSR_G0_STATE_CLEAR */ -/* ========================================================================= */ - -#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12)) -union numachip_csr_g0_state_clear { - unsigned int v; - struct numachip_csr_g0_state_clear_s { - unsigned int _state:2; - unsigned int _rsvd_2_6:5; - unsigned int _lost:1; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G0_NODE_IDS */ -/* ========================================================================= */ +static inline u64 numachip2_read64_lcsr(unsigned long offset) +{ + return readq(numachip2_lcsr_address(offset)); +} -#define CSR_G0_NODE_IDS (0x008 + (0 << 12)) -union numachip_csr_g0_node_ids { - unsigned int v; - struct numachip_csr_g0_node_ids_s { - unsigned int _initialid:16; - unsigned int _nodeid:12; - unsigned int _rsvd_28_31:4; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_GEN */ -/* ========================================================================= */ +static inline void numachip2_write32_lcsr(unsigned long offset, u32 val) +{ + writel(val, numachip2_lcsr_address(offset)); +} -#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12)) -union numachip_csr_g3_ext_irq_gen { - unsigned int v; - struct numachip_csr_g3_ext_irq_gen_s { - unsigned int _vector:8; - unsigned int _msgtype:3; - unsigned int _index:5; - unsigned int _destination_apic_id:16; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_STATUS */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12)) -union numachip_csr_g3_ext_irq_status { - unsigned int v; - struct numachip_csr_g3_ext_irq_status_s { - unsigned int _result:32; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_DEST */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12)) -union numachip_csr_g3_ext_irq_dest { - unsigned int v; - struct numachip_csr_g3_ext_irq_dest_s { - unsigned int _irq:8; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_NC_ATT_MAP_SELECT */ -/* ========================================================================= */ - -#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12)) -union numachip_csr_g3_nc_att_map_select { - unsigned int v; - struct numachip_csr_g3_nc_att_map_select_s { - unsigned int _upper_address_bits:4; - 
unsigned int _select_ram:4; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_NC_ATT_MAP_SELECT_0-255 */ -/* ========================================================================= */ - -#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12)) +static inline void numachip2_write64_lcsr(unsigned long offset, u64 val) +{ + writeq(val, numachip2_lcsr_address(offset)); +} -#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ +static inline unsigned int numachip2_timer(void) +{ + return (smp_processor_id() % 48) << 6; +} +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ diff --git a/kernel/arch/x86/include/asm/page_64_types.h b/kernel/arch/x86/include/asm/page_64_types.h index 4edd53b79..4928cf0d5 100644 --- a/kernel/arch/x86/include/asm/page_64_types.h +++ b/kernel/arch/x86/include/asm/page_64_types.h @@ -26,9 +26,6 @@ #define MCE_STACK 4 #define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ -#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) -#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) - /* * Set __PAGE_OFFSET to the most negative possible address + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a diff --git a/kernel/arch/x86/include/asm/page_types.h b/kernel/arch/x86/include/asm/page_types.h index c7c712f26..cc071c6f7 100644 --- a/kernel/arch/x86/include/asm/page_types.h +++ b/kernel/arch/x86/include/asm/page_types.h @@ -9,16 +9,21 @@ #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) + +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) + #define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) -/* Cast PAGE_MASK to a signed type so that it is sign-extended if +/* Cast *PAGE_MASK to a signed type so that it is sign-extended if virtual addresses are 32-bits but physical addresses are larger (ie, 32-bit PAE). 
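 *
 * (Hedged illustration, not part of the patch, assuming 32-bit PAE where
 * unsigned long is 32 bits and phys_addr_t is 64 bits:
 *
 *	PAGE_MASK as a 32-bit value:		0xfffff000
 *	widened without sign extension:		0x00000000fffff000
 *	widened via (signed long) first:	0xfffffffffffff000
 *
 * Only the sign-extended form, ANDed with __PHYSICAL_MASK, preserves the
 * physical-address bits above bit 31 of a >4GB PAE page frame.)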
*/ #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) - -#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) -#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) +#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK) #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) diff --git a/kernel/arch/x86/include/asm/paravirt.h b/kernel/arch/x86/include/asm/paravirt.h index 8957810ad..c759b3cca 100644 --- a/kernel/arch/x86/include/asm/paravirt.h +++ b/kernel/arch/x86/include/asm/paravirt.h @@ -19,6 +19,12 @@ static inline int paravirt_enabled(void) return pv_info.paravirt_enabled; } +static inline int paravirt_has_feature(unsigned int feature) +{ + WARN_ON_ONCE(!pv_info.paravirt_enabled); + return (pv_info.features & feature); +} + static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -153,7 +159,11 @@ do { \ val = paravirt_read_msr(msr, &_err); \ } while (0) -#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32) +static inline void wrmsrl(unsigned msr, u64 val) +{ + wrmsr(msr, (u32)val, (u32)(val>>32)); +} + #define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b) /* rdmsr with exception handling */ @@ -174,19 +184,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -static inline u64 paravirt_read_tsc(void) -{ - return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); -} - -#define rdtscl(low) \ -do { \ - u64 _l = paravirt_read_tsc(); \ - low = (int)_l; \ -} while (0) - -#define rdtscll(val) (val = paravirt_read_tsc()) - static inline unsigned long long paravirt_sched_clock(void) { return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); @@ -215,27 +212,6 @@ do { \ #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) -static inline unsigned long long paravirt_rdtscp(unsigned int *aux) -{ - return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); -} - -#define rdtscp(low, high, aux) \ -do { \ - int __aux; \ - unsigned long __val = paravirt_rdtscp(&__aux); \ - (low) = (u32)__val; \ - (high) = (u32)(__val >> 32); \ - (aux) = __aux; \ -} while (0) - -#define rdtscpll(val, aux) \ -do { \ - unsigned long __aux; \ - val = paravirt_rdtscp(&__aux); \ - (aux) = __aux; \ -} while (0) - static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries); @@ -712,6 +688,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#ifdef CONFIG_QUEUED_SPINLOCKS + +static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, + u32 val) +{ + PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val); +} + +static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) +{ + PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock); +} + +static __always_inline void pv_wait(u8 *ptr, u8 val) +{ + PVOP_VCALL2(pv_lock_ops.wait, ptr, val); +} + +static __always_inline void pv_kick(int cpu) +{ + PVOP_VCALL1(pv_lock_ops.kick, cpu); +} + +#else /* !CONFIG_QUEUED_SPINLOCKS */ + static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) { @@ -724,7 +725,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } -#endif +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +#endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef 
CONFIG_X86_32 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" diff --git a/kernel/arch/x86/include/asm/paravirt_types.h b/kernel/arch/x86/include/asm/paravirt_types.h index f7b0b5c11..3d4419118 100644 --- a/kernel/arch/x86/include/asm/paravirt_types.h +++ b/kernel/arch/x86/include/asm/paravirt_types.h @@ -70,9 +70,14 @@ struct pv_info { #endif int paravirt_enabled; + unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; +#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) +/* Supported features */ +#define PV_SUPPORTED_RTC (1<<0) + struct pv_init_ops { /* * Patch may replace one of the defined code sequences with @@ -97,7 +102,6 @@ struct pv_lazy_ops { struct pv_time_ops { unsigned long long (*sched_clock)(void); unsigned long long (*steal_clock)(int cpu); - unsigned long (*get_tsc_khz)(void); }; struct pv_cpu_ops { @@ -156,17 +160,16 @@ struct pv_cpu_ops { u64 (*read_msr)(unsigned int msr, int *err); int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - u64 (*read_tsc)(void); u64 (*read_pmc)(int counter); - unsigned long long (*read_tscp)(unsigned int *aux); +#ifdef CONFIG_X86_32 /* * Atomically enable interrupts and return to userspace. This - * is only ever used to return to 32-bit processes; in a - * 64-bit kernel, it's used for 32-on-64 compat processes, but - * never native 64-bit processes. (Jump, not call.) + * is only used in 32-bit kernels. 64-bit kernels use + * usergs_sysret32 instead. */ void (*irq_enable_sysexit)(void); +#endif /* * Switch to usermode gs and return to 64-bit usermode using @@ -333,9 +336,19 @@ struct arch_spinlock; typedef u16 __ticket_t; #endif +struct qspinlock; + struct pv_lock_ops { +#ifdef CONFIG_QUEUED_SPINLOCKS + void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); + struct paravirt_callee_save queued_spin_unlock; + + void (*wait)(u8 *ptr, u8 val); + void (*kick)(int cpu); +#else /* !CONFIG_QUEUED_SPINLOCKS */ struct paravirt_callee_save lock_spinning; void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); +#endif /* !CONFIG_QUEUED_SPINLOCKS */ }; /* This contains all the paravirt structures: we get a convenient diff --git a/kernel/arch/x86/include/asm/pat.h b/kernel/arch/x86/include/asm/pat.h index 91bc4ba95..ca6c228d5 100644 --- a/kernel/arch/x86/include/asm/pat.h +++ b/kernel/arch/x86/include/asm/pat.h @@ -4,14 +4,9 @@ #include <linux/types.h> #include <asm/pgtable_types.h> -#ifdef CONFIG_X86_PAT -extern int pat_enabled; -#else -static const int pat_enabled; -#endif - +bool pat_enabled(void); extern void pat_init(void); -void pat_init_cache_modes(void); +void pat_init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/kernel/arch/x86/include/asm/pci.h b/kernel/arch/x86/include/asm/pci.h index 4e370a5d8..462594320 100644 --- a/kernel/arch/x86/include/asm/pci.h +++ b/kernel/arch/x86/include/asm/pci.h @@ -5,7 +5,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/string.h> -#include <asm/scatterlist.h> +#include <linux/scatterlist.h> #include <asm/io.h> #include <asm/x86_init.h> @@ -80,13 +80,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, #ifdef CONFIG_PCI extern void early_quirks(void); -static inline void pci_dma_burst_advice(struct pci_dev *pdev, - enum pci_dma_burst_strategy *strat, - unsigned long *strategy_parameter) -{ - *strat = PCI_DMA_BURST_INFINITY; - *strategy_parameter = ~0UL; -} #else static inline void 
early_quirks(void) { } #endif @@ -96,15 +89,10 @@ extern void pci_iommu_alloc(void); #ifdef CONFIG_PCI_MSI /* implemented in arch/x86/kernel/apic/io_apic. */ struct msi_desc; -void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, u8 hpet_id); int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void native_teardown_msi_irq(unsigned int irq); void native_restore_msi_irqs(struct pci_dev *dev); -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset); #else -#define native_compose_msi_msg NULL #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #endif diff --git a/kernel/arch/x86/include/asm/pci_x86.h b/kernel/arch/x86/include/asm/pci_x86.h index 164e3f8d3..fa1195dae 100644 --- a/kernel/arch/x86/include/asm/pci_x86.h +++ b/kernel/arch/x86/include/asm/pci_x86.h @@ -93,8 +93,6 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); -extern bool mp_should_keep_irq(struct device *dev); - struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/kernel/arch/x86/include/asm/perf_event.h b/kernel/arch/x86/include/asm/perf_event.h index dc0f6ed35..7bcb861a0 100644 --- a/kernel/arch/x86/include/asm/perf_event.h +++ b/kernel/arch/x86/include/asm/perf_event.h @@ -159,6 +159,13 @@ struct x86_pmu_capability { */ #define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16) +#define GLOBAL_STATUS_COND_CHG BIT_ULL(63) +#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62) +#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) +#define GLOBAL_STATUS_ASIF BIT_ULL(60) +#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) +#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) + /* * IBS cpuid feature detection */ diff --git a/kernel/arch/x86/include/asm/pgtable.h b/kernel/arch/x86/include/asm/pgtable.h index fe57e7a98..6ec0c8b2e 100644 --- a/kernel/arch/x86/include/asm/pgtable.h +++ b/kernel/arch/x86/include/asm/pgtable.h @@ -19,6 +19,13 @@ #include <asm/x86_init.h> void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_checkwx(void); + +#ifdef CONFIG_DEBUG_WX +#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#else +#define debug_checkwx() do { } while (0) +#endif /* * ZERO_PAGE is a global shared page that is always zero: used @@ -142,12 +149,12 @@ static inline unsigned long pte_pfn(pte_t pte) static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { - return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) @@ -318,6 +325,16 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pte_t pte_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); +} + #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* @@ -379,7 +396,9 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) return __pgprot(preservebits | addbits); } -#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) +#define pte_pgprot(x) __pgprot(pte_flags(x)) +#define 
pmd_pgprot(x) __pgprot(pmd_flags(x)) +#define pud_pgprot(x) __pgprot(pud_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) @@ -398,11 +417,17 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, * requested memtype: * - request is uncached, return cannot be write-back * - request is write-combine, return cannot be write-back + * - request is write-through, return cannot be write-back + * - request is write-through, return cannot be write-combine */ if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WC && - new_pcm == _PAGE_CACHE_MODE_WB)) { + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WT && + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WT && + new_pcm == _PAGE_CACHE_MODE_WC)) { return 0; } @@ -496,14 +521,15 @@ static inline int pmd_none(pmd_t pmd) static inline unsigned long pmd_page_vaddr(pmd_t pmd) { - return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); + return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) +#define pmd_page(pmd) \ + pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -564,14 +590,15 @@ static inline int pud_present(pud_t pud) static inline unsigned long pud_page_vaddr(pud_t pud) { - return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); + return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) +#define pud_page(pud) \ + pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) /* Find an entry in the second-level page table.. 
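 * (Roughly, the helper below returns
 *	(pmd_t *)pud_page_vaddr(*pud) + pmd_index(address),
 * i.e. the slot within the PMD page, located via the PUD entry, that
 * covers 'address'. Illustrative note, not part of the patch.)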
*/ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) @@ -799,8 +826,8 @@ static inline int pmd_write(pmd_t pmd) return pmd_flags(pmd) & _PAGE_RW; } -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = native_pmdp_get_and_clear(pmdp); diff --git a/kernel/arch/x86/include/asm/pgtable_types.h b/kernel/arch/x86/include/asm/pgtable_types.h index 78f0c8cbe..79c91853e 100644 --- a/kernel/arch/x86/include/asm/pgtable_types.h +++ b/kernel/arch/x86/include/asm/pgtable_types.h @@ -209,10 +209,10 @@ enum page_cache_mode { #include <linux/types.h> -/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) -/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_FLAGS_MASK (~PTE_PFN_MASK) typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; @@ -276,14 +276,40 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) } #endif +static inline pudval_t pud_pfn_mask(pud_t pud) +{ + if (native_pud_val(pud) & _PAGE_PSE) + return PHYSICAL_PUD_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pudval_t pud_flags_mask(pud_t pud) +{ + return ~pud_pfn_mask(pud); +} + static inline pudval_t pud_flags(pud_t pud) { - return native_pud_val(pud) & PTE_FLAGS_MASK; + return native_pud_val(pud) & pud_flags_mask(pud); +} + +static inline pmdval_t pmd_pfn_mask(pmd_t pmd) +{ + if (native_pmd_val(pmd) & _PAGE_PSE) + return PHYSICAL_PMD_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pmdval_t pmd_flags_mask(pmd_t pmd) +{ + return ~pmd_pfn_mask(pmd); } static inline pmdval_t pmd_flags(pmd_t pmd) { - return native_pmd_val(pmd) & PTE_FLAGS_MASK; + return native_pmd_val(pmd) & pmd_flags_mask(pmd); } static inline pte_t native_make_pte(pteval_t val) @@ -337,20 +363,18 @@ static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) } static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) { + pgprotval_t val = pgprot_val(pgprot); pgprot_t new; - unsigned long val; - val = pgprot_val(pgprot); pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); return new; } static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) { + pgprotval_t val = pgprot_val(pgprot); pgprot_t new; - unsigned long val; - val = pgprot_val(pgprot); pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT_LARGE) >> (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); @@ -367,6 +391,9 @@ extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); +#define pgprot_writethrough pgprot_writethrough +extern pgprot_t pgprot_writethrough(pgprot_t prot); + /* Indicate that x86 has its own track and untrack pfn vma functions */ #define __HAVE_PFNMAP_TRACKING diff --git a/kernel/arch/x86/include/asm/pmc_atom.h b/kernel/arch/x86/include/asm/pmc_atom.h index bc0fc0866..aa8744c77 100644 --- a/kernel/arch/x86/include/asm/pmc_atom.h +++ b/kernel/arch/x86/include/asm/pmc_atom.h @@ -18,6 +18,8 @@ /* ValleyView Power Control Unit PCI Device ID */ #define PCI_DEVICE_ID_VLV_PMC 0x0F1C +/* CherryTrail Power Control Unit PCI Device ID */ +#define 
PCI_DEVICE_ID_CHT_PMC 0x229C /* PMC Memory mapped IO registers */ #define PMC_BASE_ADDR_OFFSET 0x44 @@ -29,6 +31,10 @@ #define PMC_FUNC_DIS 0x34 #define PMC_FUNC_DIS_2 0x38 +/* CHT specific bits in FUNC_DIS2 register */ +#define BIT_FD_GMM BIT(3) +#define BIT_FD_ISH BIT(4) + /* S0ix wake event control */ #define PMC_S0IX_WAKE_EN 0x3C @@ -75,6 +81,21 @@ #define PMC_PSS_BIT_USB BIT(16) #define PMC_PSS_BIT_USB_SUS BIT(17) +/* CHT specific bits in PSS register */ +#define PMC_PSS_BIT_CHT_UFS BIT(7) +#define PMC_PSS_BIT_CHT_UXD BIT(11) +#define PMC_PSS_BIT_CHT_UXD_FD BIT(12) +#define PMC_PSS_BIT_CHT_UX_ENG BIT(15) +#define PMC_PSS_BIT_CHT_USB_SUS BIT(16) +#define PMC_PSS_BIT_CHT_GMM BIT(17) +#define PMC_PSS_BIT_CHT_ISH BIT(18) +#define PMC_PSS_BIT_CHT_DFX_MASTER BIT(26) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER1 BIT(27) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER2 BIT(28) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER3 BIT(29) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER4 BIT(30) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER5 BIT(31) + /* These registers reflect D3 status of functions */ #define PMC_D3_STS_0 0xA0 @@ -117,6 +138,10 @@ #define BIT_USH_SS_PHY BIT(2) #define BIT_DFX BIT(3) +/* CHT specific bits in PMC_D3_STS_1 register */ +#define BIT_STS_GMM BIT(1) +#define BIT_STS_ISH BIT(2) + /* PMC I/O Registers */ #define ACPI_BASE_ADDR_OFFSET 0x40 #define ACPI_BASE_ADDR_MASK 0xFFFFFE00 @@ -126,4 +151,8 @@ #define SLEEP_TYPE_MASK 0xFFFFECFF #define SLEEP_TYPE_S5 0x1C00 #define SLEEP_ENABLE 0x2000 + +extern int pmc_atom_read(int offset, u32 *value); +extern int pmc_atom_write(int offset, u32 value); + #endif /* PMC_ATOM_H */ diff --git a/kernel/arch/x86/include/asm/pmem.h b/kernel/arch/x86/include/asm/pmem.h new file mode 100644 index 000000000..d8ce3ec81 --- /dev/null +++ b/kernel/arch/x86/include/asm/pmem.h @@ -0,0 +1,153 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __ASM_X86_PMEM_H__ +#define __ASM_X86_PMEM_H__ + +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/special_insns.h> + +#ifdef CONFIG_ARCH_HAS_PMEM_API +/** + * arch_memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Copy data to persistent memory media via non-temporal stores so that + * a subsequent arch_wmb_pmem() can flush cpu and memory controller + * write buffers to guarantee durability. + */ +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + int unwritten; + + /* + * We are copying between two kernel buffers, if + * __copy_from_user_inatomic_nocache() returns an error (page + * fault) we would have already reported a general protection fault + * before the WARN+BUG. 
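 *
 * (Aside, not part of the patch: this is the step the header comment
 * refers to -- the nocache copy uses non-temporal stores, so the data
 * largely bypasses the CPU cache and arch_wmb_pmem() only has to drain
 * write buffers rather than flush cache lines.)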
+ */ + unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, + (void __user *) src, n); + if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", + __func__, dst, src, unwritten)) + BUG(); +} + +/** + * arch_wmb_pmem - synchronize writes to persistent memory + * + * After a series of arch_memcpy_to_pmem() operations this drains data + * from cpu write buffers and any platform (memory controller) buffers + * to ensure that written data is durable on persistent memory media. + */ +static inline void arch_wmb_pmem(void) +{ + /* + * wmb() to 'sfence' all previous writes such that they are + * architecturally visible to 'pcommit'. Note, that we've + * already arranged for pmem writes to avoid the cache via + * arch_memcpy_to_pmem(). + */ + wmb(); + pcommit_sfence(); +} + +/** + * __arch_wb_cache_pmem - write back a cache range with CLWB + * @vaddr: virtual start address + * @size: number of bytes to write back + * + * Write back a cache range using the CLWB (cache line write back) + * instruction. This function requires explicit ordering with an + * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + */ +static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +{ + u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; + unsigned long clflush_mask = x86_clflush_size - 1; + void *vend = vaddr + size; + void *p; + + for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + p < vend; p += x86_clflush_size) + clwb(p); +} + +/* + * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec + * iterators, so for other types (bvec & kvec) we must do a cache write-back. + */ +static inline bool __iter_needs_pmem_wb(struct iov_iter *i) +{ + return iter_is_iovec(i) == false; +} + +/** + * arch_copy_from_iter_pmem - copy data from an iterator to PMEM + * @addr: PMEM destination address + * @bytes: number of bytes to copy + * @i: iterator with source data + * + * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. + * This function requires explicit ordering with an arch_wmb_pmem() call. + */ +static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, + struct iov_iter *i) +{ + void *vaddr = (void __force *)addr; + size_t len; + + /* TODO: skip the write-back by always using non-temporal stores */ + len = copy_from_iter_nocache(vaddr, bytes, i); + + if (__iter_needs_pmem_wb(i)) + __arch_wb_cache_pmem(vaddr, bytes); + + return len; +} + +/** + * arch_clear_pmem - zero a PMEM memory range + * @addr: virtual start address + * @size: number of bytes to zero + * + * Write zeros into the memory range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with an arch_wmb_pmem() call. 
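 *
 * Example (hedged sketch; pmem_update() and its arguments are
 * illustrative, not part of this header):
 *
 *	static void pmem_update(void __pmem *data, const void *buf,
 *				size_t len, void __pmem *stale, size_t n)
 *	{
 *		arch_memcpy_to_pmem(data, buf, len); // non-temporal copy
 *		arch_clear_pmem(stale, n);           // zero + CLWB write-back
 *		arch_wmb_pmem();  // one SFENCE+PCOMMIT makes both durable
 *	}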
+ */ +static inline void arch_clear_pmem(void __pmem *addr, size_t size) +{ + void *vaddr = (void __force *)addr; + + /* TODO: implement the zeroing via non-temporal writes */ + if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) + clear_page(vaddr); + else + memset(vaddr, 0, size); + + __arch_wb_cache_pmem(vaddr, size); +} + +static inline bool __arch_has_wmb_pmem(void) +{ + /* + * We require that wmb() be an 'sfence', that is only guaranteed on + * 64-bit builds + */ + return static_cpu_has(X86_FEATURE_PCOMMIT); +} +#endif /* CONFIG_ARCH_HAS_PMEM_API */ +#endif /* __ASM_X86_PMEM_H__ */ diff --git a/kernel/arch/x86/include/asm/preempt.h b/kernel/arch/x86/include/asm/preempt.h index 6ef9a161d..5dbd2d0f9 100644 --- a/kernel/arch/x86/include/asm/preempt.h +++ b/kernel/arch/x86/include/asm/preempt.h @@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define init_task_preempt_count(p) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ -} while (0) +#define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \ per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ } while (0) @@ -101,13 +98,13 @@ static __always_inline bool __preempt_count_dec_and_test(void) /* * Returns true when we need to resched and can (barring IRQ state). */ -static __always_inline bool should_resched(void) +static __always_inline bool should_resched(int preempt_offset) { #ifdef CONFIG_PREEMPT_LAZY - return unlikely(!raw_cpu_read_4(__preempt_count) || \ + return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset || test_thread_flag(TIF_NEED_RESCHED_LAZY)); #else - return unlikely(!raw_cpu_read_4(__preempt_count)); + return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); #endif } @@ -115,11 +112,9 @@ static __always_inline bool should_resched(void) extern asmlinkage void ___preempt_schedule(void); # define __preempt_schedule() asm ("call ___preempt_schedule") extern asmlinkage void preempt_schedule(void); -# ifdef CONFIG_CONTEXT_TRACKING - extern asmlinkage void ___preempt_schedule_context(void); -# define __preempt_schedule_context() asm ("call ___preempt_schedule_context") - extern asmlinkage void preempt_schedule_context(void); -# endif + extern asmlinkage void ___preempt_schedule_notrace(void); +# define __preempt_schedule_notrace() asm ("call ___preempt_schedule_notrace") + extern asmlinkage void preempt_schedule_notrace(void); #endif #endif /* __ASM_PREEMPT_H */ diff --git a/kernel/arch/x86/include/asm/processor.h b/kernel/arch/x86/include/asm/processor.h index 23ba6765b..2d5a50cb6 100644 --- a/kernel/arch/x86/include/asm/processor.h +++ b/kernel/arch/x86/include/asm/processor.h @@ -6,12 +6,12 @@ /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; +struct vm86; -#include <asm/vm86.h> #include <asm/math_emu.h> #include <asm/segment.h> #include <asm/types.h> -#include <asm/sigcontext.h> +#include <uapi/asm/sigcontext.h> #include <asm/current.h> #include <asm/cpufeature.h> #include <asm/page.h> @@ -21,6 +21,7 @@ struct mm_struct; #include <asm/desc_defs.h> #include <asm/nops.h> #include <asm/special_insns.h> +#include <asm/fpu/types.h> #include <linux/personality.h> #include <linux/cpumask.h> @@ -52,11 +53,16 @@ static inline void *current_text_addr(void) return pc; } +/* + * These alignment constraints are for performance in the vSMP case, + * but in the 
task_struct case we must also meet hardware imposed + * alignment requirements of the FPU state: + */ #ifdef CONFIG_X86_VSMP # define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) # define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) #else -# define ARCH_MIN_TASKALIGN 16 +# define ARCH_MIN_TASKALIGN __alignof__(union fpregs_state) # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif @@ -166,7 +172,6 @@ extern const struct seq_operations cpuinfo_op; #define cache_line_size() (boot_cpu_data.x86_cache_alignment) extern void cpu_detect(struct cpuinfo_x86 *c); -extern void fpu_detect(struct cpuinfo_x86 *c); extern void early_cpu_init(void); extern void identify_boot_cpu(void); @@ -313,128 +318,6 @@ struct orig_ist { unsigned long ist[7]; }; -#define MXCSR_DEFAULT 0x1f80 - -struct i387_fsave_struct { - u32 cwd; /* FPU Control Word */ - u32 swd; /* FPU Status Word */ - u32 twd; /* FPU Tag Word */ - u32 fip; /* FPU IP Offset */ - u32 fcs; /* FPU IP Selector */ - u32 foo; /* FPU Operand Pointer Offset */ - u32 fos; /* FPU Operand Pointer Selector */ - - /* 8*10 bytes for each FP-reg = 80 bytes: */ - u32 st_space[20]; - - /* Software status information [not touched by FSAVE ]: */ - u32 status; -}; - -struct i387_fxsave_struct { - u16 cwd; /* Control Word */ - u16 swd; /* Status Word */ - u16 twd; /* Tag Word */ - u16 fop; /* Last Instruction Opcode */ - union { - struct { - u64 rip; /* Instruction Pointer */ - u64 rdp; /* Data Pointer */ - }; - struct { - u32 fip; /* FPU IP Offset */ - u32 fcs; /* FPU IP Selector */ - u32 foo; /* FPU Operand Offset */ - u32 fos; /* FPU Operand Selector */ - }; - }; - u32 mxcsr; /* MXCSR Register State */ - u32 mxcsr_mask; /* MXCSR Mask */ - - /* 8*16 bytes for each FP-reg = 128 bytes: */ - u32 st_space[32]; - - /* 16*16 bytes for each XMM-reg = 256 bytes: */ - u32 xmm_space[64]; - - u32 padding[12]; - - union { - u32 padding1[12]; - u32 sw_reserved[12]; - }; - -} __attribute__((aligned(16))); - -struct i387_soft_struct { - u32 cwd; - u32 swd; - u32 twd; - u32 fip; - u32 fcs; - u32 foo; - u32 fos; - /* 8*10 bytes for each FP-reg = 80 bytes: */ - u32 st_space[20]; - u8 ftop; - u8 changed; - u8 lookahead; - u8 no_update; - u8 rm; - u8 alimit; - struct math_emu_info *info; - u32 entry_eip; -}; - -struct ymmh_struct { - /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ - u32 ymmh_space[64]; -}; - -/* We don't support LWP yet: */ -struct lwp_struct { - u8 reserved[128]; -}; - -struct bndreg { - u64 lower_bound; - u64 upper_bound; -} __packed; - -struct bndcsr { - u64 bndcfgu; - u64 bndstatus; -} __packed; - -struct xsave_hdr_struct { - u64 xstate_bv; - u64 xcomp_bv; - u64 reserved[6]; -} __attribute__((packed)); - -struct xsave_struct { - struct i387_fxsave_struct i387; - struct xsave_hdr_struct xsave_hdr; - struct ymmh_struct ymmh; - struct lwp_struct lwp; - struct bndreg bndreg[4]; - struct bndcsr bndcsr; - /* new processor state extensions will go here */ -} __attribute__ ((packed, aligned (64))); - -union thread_xstate { - struct i387_fsave_struct fsave; - struct i387_fxsave_struct fxsave; - struct i387_soft_struct soft; - struct xsave_struct xsave; -}; - -struct fpu { - unsigned int last_cpu; - unsigned int has_fpu; - union thread_xstate *state; -}; - #ifdef CONFIG_X86_64 DECLARE_PER_CPU(struct orig_ist, orig_ist); @@ -483,8 +366,6 @@ DECLARE_PER_CPU(struct irq_stack *, softirq_stack); #endif /* X86_64 */ extern unsigned int xstate_size; -extern void free_thread_xstate(struct task_struct *); -extern struct kmem_cache *task_xstate_cachep; struct perf_event; @@ -508,6 
+389,7 @@ struct thread_struct { unsigned long fs; #endif unsigned long gs; + /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ @@ -518,32 +400,22 @@ struct thread_struct { unsigned long cr2; unsigned long trap_nr; unsigned long error_code; - /* floating point and extended processor state */ - struct fpu fpu; -#ifdef CONFIG_X86_32 +#ifdef CONFIG_VM86 /* Virtual 86 mode info */ - struct vm86_struct __user *vm86_info; - unsigned long screen_bitmap; - unsigned long v86flags; - unsigned long v86mask; - unsigned long saved_sp0; - unsigned int saved_fs; - unsigned int saved_gs; + struct vm86 *vm86; #endif /* IO permissions: */ unsigned long *io_bitmap_ptr; unsigned long iopl; /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; + + /* Floating point and extended processor state */ + struct fpu fpu; /* - * fpu_counter contains the number of consecutive context switches - * that the FPU is used. If this is over a threshold, the lazy fpu - * saving becomes unlazy to save the trap. This is an unsigned char - * so that after 256 times the counter wraps and the behavior turns - * lazy again; this to deal with bursty apps that only use FPU for - * a short time + * WARNING: 'fpu' is dynamically-sized. It *MUST* be at + * the end. */ - unsigned char fpu_counter; }; /* @@ -600,6 +472,7 @@ static inline unsigned long current_top_of_stack(void) #else #define __cpuid native_cpuid #define paravirt_enabled() 0 +#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) @@ -684,12 +557,12 @@ static inline unsigned int cpuid_edx(unsigned int op) } /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) +static __always_inline void rep_nop(void) { asm volatile("rep; nop" ::: "memory"); } -static inline void cpu_relax(void) +static __always_inline void cpu_relax(void) { rep_nop(); } @@ -773,14 +646,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr) extern void set_task_blockstep(struct task_struct *task, bool on); -/* - * from system description table in BIOS. 
Mostly for MCA use, but - * others may find it useful: - */ -extern unsigned int machine_id; -extern unsigned int machine_submodel_id; -extern unsigned int BIOS_revision; - /* Boot loader type from the setup header: */ extern int bootloader_type; extern int bootloader_version; @@ -842,7 +707,6 @@ static inline void spin_lock_prefetch(const void *x) #define INIT_THREAD { \ .sp0 = TOP_OF_INIT_STACK, \ - .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ } @@ -928,24 +792,25 @@ extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); /* Register/unregister a process' MPX related resource */ -#define MPX_ENABLE_MANAGEMENT(tsk) mpx_enable_management((tsk)) -#define MPX_DISABLE_MANAGEMENT(tsk) mpx_disable_management((tsk)) +#define MPX_ENABLE_MANAGEMENT() mpx_enable_management() +#define MPX_DISABLE_MANAGEMENT() mpx_disable_management() #ifdef CONFIG_X86_INTEL_MPX -extern int mpx_enable_management(struct task_struct *tsk); -extern int mpx_disable_management(struct task_struct *tsk); +extern int mpx_enable_management(void); +extern int mpx_disable_management(void); #else -static inline int mpx_enable_management(struct task_struct *tsk) +static inline int mpx_enable_management(void) { return -EINVAL; } -static inline int mpx_disable_management(struct task_struct *tsk) +static inline int mpx_disable_management(void) { return -EINVAL; } #endif /* CONFIG_X86_INTEL_MPX */ extern u16 amd_get_nb_id(int cpu); +extern u32 amd_get_nodes_per_socket(void); static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { diff --git a/kernel/arch/x86/include/asm/proto.h b/kernel/arch/x86/include/asm/proto.h index a90f8972d..a4a77286c 100644 --- a/kernel/arch/x86/include/asm/proto.h +++ b/kernel/arch/x86/include/asm/proto.h @@ -5,12 +5,14 @@ /* misc architecture specific prototypes */ -void system_call(void); void syscall_init(void); -void ia32_syscall(void); -void ia32_cstar_target(void); -void ia32_sysenter_target(void); +void entry_SYSCALL_64(void); +void entry_SYSCALL_compat(void); +void entry_INT80_32(void); +void entry_INT80_compat(void); +void entry_SYSENTER_32(void); +void entry_SYSENTER_compat(void); void x86_configure_nx(void); void x86_report_nx(void); diff --git a/kernel/arch/x86/include/asm/ptrace.h b/kernel/arch/x86/include/asm/ptrace.h index 5fabf1362..6271281f9 100644 --- a/kernel/arch/x86/include/asm/ptrace.h +++ b/kernel/arch/x86/include/asm/ptrace.h @@ -88,7 +88,6 @@ extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, unsigned long phase1_result); extern long syscall_trace_enter(struct pt_regs *); -extern void syscall_trace_leave(struct pt_regs *); static inline unsigned long regs_return_value(struct pt_regs *regs) { diff --git a/kernel/arch/x86/include/asm/pvclock-abi.h b/kernel/arch/x86/include/asm/pvclock-abi.h index 6167fd798..67f082301 100644 --- a/kernel/arch/x86/include/asm/pvclock-abi.h +++ b/kernel/arch/x86/include/asm/pvclock-abi.h @@ -41,5 +41,7 @@ struct pvclock_wall_clock { #define PVCLOCK_TSC_STABLE_BIT (1 << 0) #define PVCLOCK_GUEST_STOPPED (1 << 1) +/* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. 
*/ +#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/kernel/arch/x86/include/asm/pvclock.h b/kernel/arch/x86/include/asm/pvclock.h index d6b078e9f..7a6bed5c0 100644 --- a/kernel/arch/x86/include/asm/pvclock.h +++ b/kernel/arch/x86/include/asm/pvclock.h @@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) static __always_inline u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) { - u64 delta = __native_read_tsc() - src->tsc_timestamp; + u64 delta = rdtsc_ordered() - src->tsc_timestamp; return pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); } @@ -76,17 +76,10 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u8 ret_flags; version = src->version; - /* Note: emulated platforms which do not advertise SSE2 support - * result in kvmclock not using the necessary RDTSC barriers. - * Without barriers, it is possible that RDTSC instruction reads from - * the time stamp counter outside rdtsc_barrier protected section - * below, resulting in violation of monotonicity. - */ - rdtsc_barrier(); + offset = pvclock_get_nsec_offset(src); ret = src->system_time + offset; ret_flags = src->flags; - rdtsc_barrier(); *cycles = ret; *flags = ret_flags; diff --git a/kernel/arch/x86/include/asm/qrwlock.h b/kernel/arch/x86/include/asm/qrwlock.h index ae0e241e2..c537cbb03 100644 --- a/kernel/arch/x86/include/asm/qrwlock.h +++ b/kernel/arch/x86/include/asm/qrwlock.h @@ -2,16 +2,6 @@ #define _ASM_X86_QRWLOCK_H #include <asm-generic/qrwlock_types.h> - -#ifndef CONFIG_X86_PPRO_FENCE -#define queue_write_unlock queue_write_unlock -static inline void queue_write_unlock(struct qrwlock *lock) -{ - barrier(); - ACCESS_ONCE(*(u8 *)&lock->cnts) = 0; -} -#endif - #include <asm-generic/qrwlock.h> #endif /* _ASM_X86_QRWLOCK_H */ diff --git a/kernel/arch/x86/include/asm/qspinlock.h b/kernel/arch/x86/include/asm/qspinlock.h new file mode 100644 index 000000000..eaba08076 --- /dev/null +++ b/kernel/arch/x86/include/asm/qspinlock.h @@ -0,0 +1,66 @@ +#ifndef _ASM_X86_QSPINLOCK_H +#define _ASM_X86_QSPINLOCK_H + +#include <asm/cpufeature.h> +#include <asm-generic/qspinlock_types.h> +#include <asm/paravirt.h> + +#define queued_spin_unlock queued_spin_unlock +/** + * queued_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + * + * A smp_store_release() on the least-significant byte. 
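 * (The locked byte occupies bits 0-7 of the 32-bit lock word, so storing
 * zero to just that byte drops the lock without disturbing the pending
 * and tail fields that contending CPUs may be updating concurrently.)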
+ */ +static inline void native_queued_spin_unlock(struct qspinlock *lock) +{ + smp_store_release((u8 *)lock, 0); +} + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_init_lock_hash(void); +extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); + +static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + pv_queued_spin_lock_slowpath(lock, val); +} + +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + pv_queued_spin_unlock(lock); +} +#else +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} +#endif + +#ifdef CONFIG_PARAVIRT +#define virt_spin_lock virt_spin_lock +static inline bool virt_spin_lock(struct qspinlock *lock) +{ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + /* + * On hypervisors without PARAVIRT_SPINLOCKS support we fall + * back to a Test-and-Set spinlock, because fair locks have + * horrible lock 'holder' preemption issues. + */ + + do { + while (atomic_read(&lock->val) != 0) + cpu_relax(); + } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0); + + return true; +} +#endif /* CONFIG_PARAVIRT */ + +#include <asm-generic/qspinlock.h> + +#endif /* _ASM_X86_QSPINLOCK_H */ diff --git a/kernel/arch/x86/include/asm/qspinlock_paravirt.h b/kernel/arch/x86/include/asm/qspinlock_paravirt.h new file mode 100644 index 000000000..b002e711b --- /dev/null +++ b/kernel/arch/x86/include/asm/qspinlock_paravirt.h @@ -0,0 +1,6 @@ +#ifndef __ASM_QSPINLOCK_PARAVIRT_H +#define __ASM_QSPINLOCK_PARAVIRT_H + +PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); + +#endif diff --git a/kernel/arch/x86/include/asm/serial.h b/kernel/arch/x86/include/asm/serial.h index 8378b8c91..bb658211e 100644 --- a/kernel/arch/x86/include/asm/serial.h +++ b/kernel/arch/x86/include/asm/serial.h @@ -11,7 +11,7 @@ #define BASE_BAUD (1843200/16) /* Standard COM flags (except for COM4, because of the 8514 problem) */ -#ifdef CONFIG_SERIAL_DETECT_IRQ +#ifdef CONFIG_SERIAL_8250_DETECT_IRQ # define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ) # define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | UPF_AUTO_IRQ) #else diff --git a/kernel/arch/x86/include/asm/setup.h b/kernel/arch/x86/include/asm/setup.h index f69e06b28..11af24e09 100644 --- a/kernel/arch/x86/include/asm/setup.h +++ b/kernel/arch/x86/include/asm/setup.h @@ -60,17 +60,24 @@ static inline void x86_ce4100_early_setup(void) { } #ifndef _SETUP #include <asm/espfix.h> +#include <linux/kernel.h> /* * This is set up by the setup-routine at boot-time */ extern struct boot_params boot_params; +extern char _text[]; static inline bool kaslr_enabled(void) { return !!(boot_params.hdr.loadflags & KASLR_FLAG); } +static inline unsigned long kaslr_offset(void) +{ + return (unsigned long)&_text - __START_KERNEL; +} + /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. 
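As a hedged illustration of the kaslr_enabled()/kaslr_offset() helpers added above (the caller, its name and the printk wording are assumptions, not taken from this patch), a consumer might report the randomization like this:

	/* Illustrative caller only; not part of the patch. */
	static void __init report_kaslr_example(void)
	{
		if (kaslr_enabled())
			pr_info("KASLR: kernel text slid by 0x%lx\n",
				kaslr_offset());
		else
			pr_info("KASLR: no offset applied\n");
	}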
diff --git a/kernel/arch/x86/include/asm/sigcontext.h b/kernel/arch/x86/include/asm/sigcontext.h index 9dfce4e04..e6cd2c489 100644 --- a/kernel/arch/x86/include/asm/sigcontext.h +++ b/kernel/arch/x86/include/asm/sigcontext.h @@ -1,79 +1,8 @@ #ifndef _ASM_X86_SIGCONTEXT_H #define _ASM_X86_SIGCONTEXT_H -#include <uapi/asm/sigcontext.h> - -#ifdef __i386__ -struct sigcontext { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned long di; - unsigned long si; - unsigned long bp; - unsigned long sp; - unsigned long bx; - unsigned long dx; - unsigned long cx; - unsigned long ax; - unsigned long trapno; - unsigned long err; - unsigned long ip; - unsigned short cs, __csh; - unsigned long flags; - unsigned long sp_at_signal; - unsigned short ss, __ssh; +/* This is a legacy header - all kernel code includes <uapi/asm/sigcontext.h> directly. */ - /* - * fpstate is really (struct _fpstate *) or (struct _xstate *) - * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved - * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the definition of - * (struct _fpx_sw_bytes) - */ - void __user *fpstate; /* zero when no FPU/extended context */ - unsigned long oldmask; - unsigned long cr2; -}; -#else /* __i386__ */ -struct sigcontext { - unsigned long r8; - unsigned long r9; - unsigned long r10; - unsigned long r11; - unsigned long r12; - unsigned long r13; - unsigned long r14; - unsigned long r15; - unsigned long di; - unsigned long si; - unsigned long bp; - unsigned long bx; - unsigned long dx; - unsigned long ax; - unsigned long cx; - unsigned long sp; - unsigned long ip; - unsigned long flags; - unsigned short cs; - unsigned short gs; - unsigned short fs; - unsigned short __pad0; - unsigned long err; - unsigned long trapno; - unsigned long oldmask; - unsigned long cr2; +#include <uapi/asm/sigcontext.h> - /* - * fpstate is really (struct _fpstate *) or (struct _xstate *) - * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved - * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the definition of - * (struct _fpx_sw_bytes) - */ - void __user *fpstate; /* zero when no FPU/extended context */ - unsigned long reserved1[8]; -}; -#endif /* !__i386__ */ #endif /* _ASM_X86_SIGCONTEXT_H */ diff --git a/kernel/arch/x86/include/asm/sigframe.h b/kernel/arch/x86/include/asm/sigframe.h index 7c7c27c97..34edd1650 100644 --- a/kernel/arch/x86/include/asm/sigframe.h +++ b/kernel/arch/x86/include/asm/sigframe.h @@ -1,15 +1,14 @@ #ifndef _ASM_X86_SIGFRAME_H #define _ASM_X86_SIGFRAME_H -#include <asm/sigcontext.h> +#include <uapi/asm/sigcontext.h> #include <asm/siginfo.h> #include <asm/ucontext.h> +#include <linux/compat.h> #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe #define rt_sigframe_ia32 rt_sigframe -#define sigcontext_ia32 sigcontext -#define _fpstate_ia32 _fpstate #define ucontext_ia32 ucontext #else /* !CONFIG_X86_32 */ @@ -23,7 +22,7 @@ struct sigframe_ia32 { u32 pretcode; int sig; - struct sigcontext_ia32 sc; + struct sigcontext_32 sc; /* * fpstate is unused. fpstate is moved/allocated after * retcode[] below. This movement allows to have the FP state and the @@ -32,7 +31,7 @@ struct sigframe_ia32 { * the offset of extramask[] in the sigframe and thus prevent any * legacy application accessing/modifying it. 
*/ - struct _fpstate_ia32 fpstate_unused; + struct _fpstate_32 fpstate_unused; #ifdef CONFIG_IA32_EMULATION unsigned int extramask[_COMPAT_NSIG_WORDS-1]; #else /* !CONFIG_IA32_EMULATION */ @@ -69,6 +68,15 @@ struct rt_sigframe { #ifdef CONFIG_X86_X32_ABI +struct ucontext_x32 { + unsigned int uc_flags; + unsigned int uc_link; + compat_stack_t uc_stack; + unsigned int uc__pad0; /* needed for alignment */ + struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ + compat_sigset_t uc_sigmask; /* mask last for extensibility */ +}; + struct rt_sigframe_x32 { u64 pretcode; struct ucontext_x32 uc; diff --git a/kernel/arch/x86/include/asm/signal.h b/kernel/arch/x86/include/asm/signal.h index b1b08a28c..3f5b4ee2e 100644 --- a/kernel/arch/x86/include/asm/signal.h +++ b/kernel/arch/x86/include/asm/signal.h @@ -32,7 +32,7 @@ typedef struct { * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the * trap. */ -#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64) +#if defined(CONFIG_PREEMPT_RT_FULL) #define ARCH_RT_DELAYS_SIGNAL_SEND #endif @@ -43,11 +43,11 @@ typedef sigset_t compat_sigset_t; #endif /* __ASSEMBLY__ */ #include <uapi/asm/signal.h> #ifndef __ASSEMBLY__ -extern void do_notify_resume(struct pt_regs *, void *, __u32); +extern void do_signal(struct pt_regs *regs); #define __ARCH_HAS_SA_RESTORER -#include <asm/sigcontext.h> +#include <uapi/asm/sigcontext.h> #ifdef __i386__ diff --git a/kernel/arch/x86/include/asm/simd.h b/kernel/arch/x86/include/asm/simd.h index ee80b92f0..6c8a7ed13 100644 --- a/kernel/arch/x86/include/asm/simd.h +++ b/kernel/arch/x86/include/asm/simd.h @@ -1,5 +1,5 @@ -#include <asm/i387.h> +#include <asm/fpu/api.h> /* * may_use_simd - whether it is allowable at this time to issue SIMD diff --git a/kernel/arch/x86/include/asm/smp.h b/kernel/arch/x86/include/asm/smp.h index 17a8dced1..222a6a3ca 100644 --- a/kernel/arch/x86/include/asm/smp.h +++ b/kernel/arch/x86/include/asm/smp.h @@ -37,16 +37,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); -static inline struct cpumask *cpu_sibling_mask(int cpu) -{ - return per_cpu(cpu_sibling_map, cpu); -} - -static inline struct cpumask *cpu_core_mask(int cpu) -{ - return per_cpu(cpu_core_map, cpu); -} - static inline struct cpumask *cpu_llc_shared_mask(int cpu) { return per_cpu(cpu_llc_shared_map, cpu); diff --git a/kernel/arch/x86/include/asm/special_insns.h b/kernel/arch/x86/include/asm/special_insns.h index aeb4666e0..2270e41b3 100644 --- a/kernel/arch/x86/include/asm/special_insns.h +++ b/kernel/arch/x86/include/asm/special_insns.h @@ -215,6 +215,44 @@ static inline void clwb(volatile void *__p) : [pax] "a" (p)); } +/** + * pcommit_sfence() - persistent commit and fence + * + * The PCOMMIT instruction ensures that data that has been flushed from the + * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to + * memory and is durable on the DIMM. The primary use case for this is + * persistent memory. + * + * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT + * with appropriate fencing. 
+ * + * Example: + * void flush_and_commit_buffer(void *vaddr, unsigned int size) + * { + * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + * void *vend = vaddr + size; + * void *p; + * + * for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + * p < vend; p += boot_cpu_data.x86_clflush_size) + * clwb(p); + * + * // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes + * // MFENCE via mb() also works + * wmb(); + * + * // PCOMMIT and the required SFENCE for ordering + * pcommit_sfence(); + * } + * + * After this function completes the data pointed to by 'vaddr' has been + * accepted to memory and will be durable if the 'vaddr' points to persistent + * memory. + * + * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify + * things we include both the PCOMMIT and the required SFENCE in the + * alternatives generated by pcommit_sfence(). + */ static inline void pcommit_sfence(void) { alternative(ASM_NOP7, diff --git a/kernel/arch/x86/include/asm/spinlock.h b/kernel/arch/x86/include/asm/spinlock.h index 64b611782..be0a05913 100644 --- a/kernel/arch/x86/include/asm/spinlock.h +++ b/kernel/arch/x86/include/asm/spinlock.h @@ -42,6 +42,10 @@ extern struct static_key paravirt_ticketlocks_enabled; static __always_inline bool static_key_false(struct static_key *key); +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm/qspinlock.h> +#else + #ifdef CONFIG_PARAVIRT_SPINLOCKS static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) @@ -196,6 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) cpu_relax(); } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ /* * Read-write spinlocks, allowing multiple readers diff --git a/kernel/arch/x86/include/asm/spinlock_types.h b/kernel/arch/x86/include/asm/spinlock_types.h index 5f9d7572d..65c3e37f8 100644 --- a/kernel/arch/x86/include/asm/spinlock_types.h +++ b/kernel/arch/x86/include/asm/spinlock_types.h @@ -23,6 +23,9 @@ typedef u32 __ticketpair_t; #define TICKET_SHIFT (sizeof(__ticket_t) * 8) +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm-generic/qspinlock_types.h> +#else typedef struct arch_spinlock { union { __ticketpair_t head_tail; @@ -33,6 +36,7 @@ typedef struct arch_spinlock { } arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ #include <asm-generic/qrwlock_types.h> diff --git a/kernel/arch/x86/include/asm/stackprotector.h b/kernel/arch/x86/include/asm/stackprotector.h index 64fb5cbe5..02fa39652 100644 --- a/kernel/arch/x86/include/asm/stackprotector.h +++ b/kernel/arch/x86/include/asm/stackprotector.h @@ -39,7 +39,9 @@ #include <asm/processor.h> #include <asm/percpu.h> #include <asm/desc.h> + #include <linux/random.h> +#include <linux/sched.h> /* * 24 byte read-only segment initializer for stack canary. Linker @@ -72,13 +74,12 @@ static __always_inline void boot_init_stack_canary(void) * For preempt-rt we need to weaken the randomness a bit, as * we can't call into the random generator from atomic context * due to locking constraints. We just leave canary - * uninitialized and use the TSC based randomness on top of - * it. + * uninitialized and use the TSC based randomness on top of it. 
*/ #ifndef CONFIG_PREEMPT_RT_FULL get_random_bytes(&canary, sizeof(canary)); #endif - tsc = __native_read_tsc(); + tsc = rdtsc(); canary += tsc + (tsc << 32UL); current->stack_canary = canary; diff --git a/kernel/arch/x86/include/asm/string_64.h b/kernel/arch/x86/include/asm/string_64.h index e46611969..ff8b9a17d 100644 --- a/kernel/arch/x86/include/asm/string_64.h +++ b/kernel/arch/x86/include/asm/string_64.h @@ -27,12 +27,11 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t function. */ #define __HAVE_ARCH_MEMCPY 1 +extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #ifndef CONFIG_KMEMCHECK -#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 -extern void *memcpy(void *to, const void *from, size_t len); -#else +#if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ ({ \ size_t __len = (len); \ diff --git a/kernel/arch/x86/include/asm/suspend_32.h b/kernel/arch/x86/include/asm/suspend_32.h index 552d6c90a..d1793f068 100644 --- a/kernel/arch/x86/include/asm/suspend_32.h +++ b/kernel/arch/x86/include/asm/suspend_32.h @@ -7,7 +7,7 @@ #define _ASM_X86_SUSPEND_32_H #include <asm/desc.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> /* image of the saved processor state */ struct saved_context { diff --git a/kernel/arch/x86/include/asm/suspend_64.h b/kernel/arch/x86/include/asm/suspend_64.h index bc6232834..7ebf0ebe4 100644 --- a/kernel/arch/x86/include/asm/suspend_64.h +++ b/kernel/arch/x86/include/asm/suspend_64.h @@ -7,7 +7,7 @@ #define _ASM_X86_SUSPEND_64_H #include <asm/desc.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> /* * Image of the saved processor state, used by the low level ACPI suspend to diff --git a/kernel/arch/x86/include/asm/syscall.h b/kernel/arch/x86/include/asm/syscall.h index d6a756ae0..999b7cd2e 100644 --- a/kernel/arch/x86/include/asm/syscall.h +++ b/kernel/arch/x86/include/asm/syscall.h @@ -20,9 +20,21 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> -typedef void (*sys_call_ptr_t)(void); +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); extern const sys_call_ptr_t sys_call_table[]; +#if defined(CONFIG_X86_32) +#define ia32_sys_call_table sys_call_table +#define __NR_syscall_compat_max __NR_syscall_max +#define IA32_NR_syscalls NR_syscalls +#endif + +#if defined(CONFIG_IA32_EMULATION) +extern const sys_call_ptr_t ia32_sys_call_table[]; +#endif + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons diff --git a/kernel/arch/x86/include/asm/syscalls.h b/kernel/arch/x86/include/asm/syscalls.h index 592a6a672..91dfcafe2 100644 --- a/kernel/arch/x86/include/asm/syscalls.h +++ b/kernel/arch/x86/include/asm/syscalls.h @@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *); asmlinkage unsigned long sys_sigreturn(void); /* kernel/vm86_32.c */ +struct vm86_struct; asmlinkage long sys_vm86old(struct vm86_struct __user *); asmlinkage long sys_vm86(unsigned long, unsigned long); diff --git a/kernel/arch/x86/include/asm/thread_info.h b/kernel/arch/x86/include/asm/thread_info.h index 606144afb..ddb63bd90 100644 --- a/kernel/arch/x86/include/asm/thread_info.h +++ b/kernel/arch/x86/include/asm/thread_info.h @@ -27,14 +27,17 @@ * Without this offset, that can result in a page fault. 
(We are * careful that, in this case, the value we read doesn't matter.) * - * In vm86 mode, the hardware frame is much longer still, but we neither - * access the extra members from NMI context, nor do we write such a - * frame at sp0 at all. + * In vm86 mode, the hardware frame is much longer still, so add 16 + * bytes to make room for the real-mode segments. * * x86_64 has a fixed-length stack frame. */ #ifdef CONFIG_X86_32 -# define TOP_OF_KERNEL_STACK_PADDING 8 +# ifdef CONFIG_VM86 +# define TOP_OF_KERNEL_STACK_PADDING 16 +# else +# define TOP_OF_KERNEL_STACK_PADDING 8 +# endif #else # define TOP_OF_KERNEL_STACK_PADDING 0 #endif @@ -54,11 +57,9 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int saved_preempt_count; - int preempt_lazy_count; /* 0 => lazy preemptable - <0 => BUG */ mm_segment_t addr_limit; - void __user *sysenter_return; + int preempt_lazy_count; /* 0 => lazy preemptable + <0 => BUG */ unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ }; @@ -68,7 +69,6 @@ struct thread_info { .task = &tsk, \ .flags = 0, \ .cpu = 0, \ - .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ } @@ -144,27 +144,11 @@ struct thread_info { _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) -/* work to do in syscall_trace_leave() */ -#define _TIF_WORK_SYSCALL_EXIT \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ) - -/* work to do on interrupt/exception return */ -#define _TIF_WORK_MASK \ - (0x0000FFFF & \ - ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \ - _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) - /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) -/* Only used for 64 bit */ -#define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ - _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) - /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) @@ -183,8 +167,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -DECLARE_PER_CPU(unsigned long, kernel_stack); - static inline struct thread_info *current_thread_info(void) { return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); @@ -203,9 +185,13 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ +#ifdef CONFIG_X86_64 +# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) +#endif + /* Load thread_info address into "reg" */ #define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ _ASM_SUB $(THREAD_SIZE),reg ; /* diff --git a/kernel/arch/x86/include/asm/tlbflush.h b/kernel/arch/x86/include/asm/tlbflush.h index cd791948b..6df202940 100644 --- a/kernel/arch/x86/include/asm/tlbflush.h +++ b/kernel/arch/x86/include/asm/tlbflush.h @@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void) #endif /* SMP */ +/* Not inlined due to inc_irq_stat not being defined yet */ +#define flush_tlb_local() { \ + inc_irq_stat(irq_tlb_count); \ + local_flush_tlb(); \ +} + #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, start, end) \ native_flush_tlb_others(mask, mm, start, end) diff --git a/kernel/arch/x86/include/asm/topology.h b/kernel/arch/x86/include/asm/topology.h index 0e8f04f2c..0fb46482d 100644 --- a/kernel/arch/x86/include/asm/topology.h +++ 
b/kernel/arch/x86/include/asm/topology.h @@ -26,7 +26,7 @@ #define _ASM_X86_TOPOLOGY_H #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP # define ENABLE_TOPO_DEFINES # endif #else @@ -124,7 +124,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #ifdef ENABLE_TOPO_DEFINES #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) -#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) +#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/kernel/arch/x86/include/asm/trace/irq_vectors.h b/kernel/arch/x86/include/asm/trace/irq_vectors.h index 4cab89000..38a09a13a 100644 --- a/kernel/arch/x86/include/asm/trace/irq_vectors.h +++ b/kernel/arch/x86/include/asm/trace/irq_vectors.h @@ -101,6 +101,12 @@ DEFINE_IRQ_VECTOR_EVENT(call_function_single); DEFINE_IRQ_VECTOR_EVENT(threshold_apic); /* + * deferred_error_apic - called when entering/exiting a deferred apic interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); + +/* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler */ diff --git a/kernel/arch/x86/include/asm/trace/mpx.h b/kernel/arch/x86/include/asm/trace/mpx.h new file mode 100644 index 000000000..0f492fc50 --- /dev/null +++ b/kernel/arch/x86/include/asm/trace/mpx.h @@ -0,0 +1,133 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mpx + +#if !defined(_TRACE_MPX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MPX_H + +#include <linux/tracepoint.h> + +#ifdef CONFIG_X86_INTEL_MPX + +TRACE_EVENT(mpx_bounds_register_exception, + + TP_PROTO(void *addr_referenced, + const struct mpx_bndreg *bndreg), + TP_ARGS(addr_referenced, bndreg), + + TP_STRUCT__entry( + __field(void *, addr_referenced) + __field(u64, lower_bound) + __field(u64, upper_bound) + ), + + TP_fast_assign( + __entry->addr_referenced = addr_referenced; + __entry->lower_bound = bndreg->lower_bound; + __entry->upper_bound = bndreg->upper_bound; + ), + /* + * Note that we are printing out the '~' of the upper + * bounds register here. It is actually stored in its + * one's complement form so that its 'init' state + * corresponds to all 0's. But, that looks like + * gibberish when printed out, so print out the 1's + * complement instead of the actual value here. Note + * though that you still need to specify filters for the + * actual value, not the displayed one. 
+ */ + TP_printk("address referenced: 0x%p bounds: lower: 0x%llx ~upper: 0x%llx", + __entry->addr_referenced, + __entry->lower_bound, + ~__entry->upper_bound + ) +); + +TRACE_EVENT(bounds_exception_mpx, + + TP_PROTO(const struct mpx_bndcsr *bndcsr), + TP_ARGS(bndcsr), + + TP_STRUCT__entry( + __field(u64, bndcfgu) + __field(u64, bndstatus) + ), + + TP_fast_assign( + /* need to get rid of the 'const' on bndcsr */ + __entry->bndcfgu = (u64)bndcsr->bndcfgu; + __entry->bndstatus = (u64)bndcsr->bndstatus; + ), + + TP_printk("bndcfgu:0x%llx bndstatus:0x%llx", + __entry->bndcfgu, + __entry->bndstatus) +); + +DECLARE_EVENT_CLASS(mpx_range_trace, + + TP_PROTO(unsigned long start, + unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("[0x%p:0x%p]", + (void *)__entry->start, + (void *)__entry->end + ) +); + +DEFINE_EVENT(mpx_range_trace, mpx_unmap_zap, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end) +); + +DEFINE_EVENT(mpx_range_trace, mpx_unmap_search, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end) +); + +TRACE_EVENT(mpx_new_bounds_table, + + TP_PROTO(unsigned long table_vaddr), + TP_ARGS(table_vaddr), + + TP_STRUCT__entry( + __field(unsigned long, table_vaddr) + ), + + TP_fast_assign( + __entry->table_vaddr = table_vaddr; + ), + + TP_printk("table vaddr:%p", (void *)__entry->table_vaddr) +); + +#else + +/* + * This gets used outside of MPX-specific code, so we need a stub. + */ +static inline +void trace_bounds_exception_mpx(const struct mpx_bndcsr *bndcsr) +{ +} + +#endif /* CONFIG_X86_INTEL_MPX */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/trace/ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE mpx +#endif /* _TRACE_MPX_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/arch/x86/include/asm/traps.h b/kernel/arch/x86/include/asm/traps.h index 4e49d7dff..c34966197 100644 --- a/kernel/arch/x86/include/asm/traps.h +++ b/kernel/arch/x86/include/asm/traps.h @@ -108,11 +108,12 @@ extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); -asmlinkage void mce_threshold_interrupt(void); +asmlinkage void smp_threshold_interrupt(void); +asmlinkage void smp_deferred_error_interrupt(void); #endif -extern enum ctx_state ist_enter(struct pt_regs *regs); -extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); +extern void ist_enter(struct pt_regs *regs); +extern void ist_exit(struct pt_regs *regs); extern void ist_begin_non_atomic(struct pt_regs *regs); extern void ist_end_non_atomic(void); diff --git a/kernel/arch/x86/include/asm/tsc.h b/kernel/arch/x86/include/asm/tsc.h index 94605c0e9..6d7c5479b 100644 --- a/kernel/arch/x86/include/asm/tsc.h +++ b/kernel/arch/x86/include/asm/tsc.h @@ -21,28 +21,12 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { - unsigned long long ret = 0; - #ifndef CONFIG_X86_TSC if (!cpu_has_tsc) return 0; #endif - rdtscll(ret); - - return ret; -} -static __always_inline cycles_t vget_cycles(void) -{ - /* - * We only do VDSOs on TSC capable CPUs, so this shouldn't - * access boot_cpu_data (which is not VDSO-safe): - */ -#ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) - return 0; -#endif - return (cycles_t)__native_read_tsc(); + return rdtsc(); } extern void tsc_init(void); 
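The hunk above collapses get_cycles() into a thin wrapper around rdtsc(). As a rough illustration of how such a cycle counter is typically consumed (not part of this patch; the function name and the printed message below are made up, while cycles_t, get_cycles() and pr_info() are existing kernel interfaces), a caller could time a code section like this:

#include <linux/printk.h>	/* pr_info() */
#include <linux/timex.h>	/* cycles_t, get_cycles() */

static void example_time_section(void)
{
	cycles_t start, end;

	start = get_cycles();
	/* ... code being measured ... */
	end = get_cycles();

	/* Raw TSC delta; deliberately not converted to nanoseconds. */
	pr_info("section took %llu cycles\n",
		(unsigned long long)(end - start));
}

Note that on CONFIG_X86_TSC=n kernels get_cycles() may return 0, so the delta is only meaningful where a TSC is known to be present.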
@@ -51,6 +35,7 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern int check_tsc_disabled(void); extern unsigned long native_calibrate_tsc(void); +extern unsigned long long native_sched_clock_from_tsc(u64 tsc); extern int tsc_clocksource_reliable; diff --git a/kernel/arch/x86/include/asm/uaccess.h b/kernel/arch/x86/include/asm/uaccess.h index a8df874f3..09b1b0ab9 100644 --- a/kernel/arch/x86/include/asm/uaccess.h +++ b/kernel/arch/x86/include/asm/uaccess.h @@ -51,13 +51,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * limit, not add it to the address). */ if (__builtin_constant_p(size)) - return addr > limit - size; + return unlikely(addr > limit - size); /* Arbitrary sizes? Be careful about overflow */ addr += size; - if (addr < size) + if (unlikely(addr < size)) return true; - return addr > limit; + return unlikely(addr > limit); } #define __range_not_ok(addr, size, limit) \ @@ -182,7 +182,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "=a" (__ret_gu), "=r" (__val_gu) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ - __ret_gu; \ + __builtin_expect(__ret_gu, 0); \ }) #define __put_user_x(size, x, ptr, __ret_pu) \ @@ -278,7 +278,7 @@ extern void __put_user_8(void); __put_user_x(X, __pu_val, ptr, __ret_pu); \ break; \ } \ - __ret_pu; \ + __builtin_expect(__ret_pu, 0); \ }) #define __put_user_size(x, ptr, size, retval, errret) \ @@ -401,7 +401,7 @@ do { \ ({ \ int __pu_err; \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ - __pu_err; \ + __builtin_expect(__pu_err, 0); \ }) #define __get_user_nocheck(x, ptr, size) \ @@ -410,7 +410,7 @@ do { \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __gu_err; \ + __builtin_expect(__gu_err, 0); \ }) /* FIXME: this hack is definitely wrong -AK */ diff --git a/kernel/arch/x86/include/asm/uaccess_32.h b/kernel/arch/x86/include/asm/uaccess_32.h index 7c8ad3451..f5dcb5204 100644 --- a/kernel/arch/x86/include/asm/uaccess_32.h +++ b/kernel/arch/x86/include/asm/uaccess_32.h @@ -59,6 +59,10 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret, 4); return ret; + case 8: + __put_user_size(*(u64 *)from, (u64 __user *)to, + 8, ret, 8); + return ret; } } return __copy_to_user_ll(to, from, n); diff --git a/kernel/arch/x86/include/asm/user.h b/kernel/arch/x86/include/asm/user.h index ccab4af16..59a54e869 100644 --- a/kernel/arch/x86/include/asm/user.h +++ b/kernel/arch/x86/include/asm/user.h @@ -14,8 +14,8 @@ struct user_ymmh_regs { __u32 ymmh_space[64]; }; -struct user_xsave_hdr { - __u64 xstate_bv; +struct user_xstate_header { + __u64 xfeatures; __u64 reserved1[2]; __u64 reserved2[5]; }; @@ -41,11 +41,11 @@ struct user_xsave_hdr { * particular process/thread. * * Also when the user modifies certain state FP/SSE/etc through the - * ptrace interface, they must ensure that the xsave_hdr.xstate_bv + * ptrace interface, they must ensure that the header.xfeatures * bytes[512..519] of the memory layout are updated correspondingly. * i.e., for example when FP state is modified to a non-init state, - * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to - * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc. 
+ * header.xfeatures's bit 0 must be set to '1', when SSE is modified to + * non-init state, header.xfeatures's bit 1 must to be set to '1', etc. */ #define USER_XSTATE_FX_SW_WORDS 6 #define USER_XSTATE_XCR0_WORD 0 @@ -55,7 +55,7 @@ struct user_xstateregs { __u64 fpx_space[58]; __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS]; } i387; - struct user_xsave_hdr xsave_hdr; + struct user_xstate_header header; struct user_ymmh_regs ymmh; /* further processor state extensions go here */ }; diff --git a/kernel/arch/x86/include/asm/uv/uv_hub.h b/kernel/arch/x86/include/asm/uv/uv_hub.h index c2729abe0..01ec643ce 100644 --- a/kernel/arch/x86/include/asm/uv/uv_hub.h +++ b/kernel/arch/x86/include/asm/uv/uv_hub.h @@ -609,7 +609,7 @@ struct uv_cpu_nmi_s { DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); -#define uv_hub_nmi (uv_cpu_nmi.hub) +#define uv_hub_nmi this_cpu_read(uv_cpu_nmi.hub) #define uv_cpu_nmi_per(cpu) (per_cpu(uv_cpu_nmi, cpu)) #define uv_hub_nmi_per(cpu) (uv_cpu_nmi_per(cpu).hub) diff --git a/kernel/arch/x86/include/asm/vdso.h b/kernel/arch/x86/include/asm/vdso.h index 8021bd28c..756de9190 100644 --- a/kernel/arch/x86/include/asm/vdso.h +++ b/kernel/arch/x86/include/asm/vdso.h @@ -26,7 +26,7 @@ struct vdso_image { long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; - long sym_VDSO32_SYSENTER_RETURN; + long sym_int80_landing_pad; }; #ifdef CONFIG_X86_64 @@ -38,13 +38,7 @@ extern const struct vdso_image vdso_image_x32; #endif #if defined CONFIG_X86_32 || defined CONFIG_COMPAT -extern const struct vdso_image vdso_image_32_int80; -#ifdef CONFIG_COMPAT -extern const struct vdso_image vdso_image_32_syscall; -#endif -extern const struct vdso_image vdso_image_32_sysenter; - -extern const struct vdso_image *selected_vdso32; +extern const struct vdso_image vdso_image_32; #endif extern void __init init_vdso_image(const struct vdso_image *image); diff --git a/kernel/arch/x86/include/asm/vm86.h b/kernel/arch/x86/include/asm/vm86.h index 1d8de3f3f..1e491f3af 100644 --- a/kernel/arch/x86/include/asm/vm86.h +++ b/kernel/arch/x86/include/asm/vm86.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_VM86_H #define _ASM_X86_VM86_H - #include <asm/ptrace.h> #include <uapi/asm/vm86.h> @@ -28,43 +27,49 @@ struct kernel_vm86_regs { unsigned short gs, __gsh; }; -struct kernel_vm86_struct { - struct kernel_vm86_regs regs; -/* - * the below part remains on the kernel stack while we are in VM86 mode. - * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we - * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above - * 'struct kernel_vm86_regs' with the then actual values. - * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct' - * in kernelspace, hence we need not reget the data from userspace. - */ -#define VM86_TSS_ESP0 flags +struct vm86 { + struct vm86plus_struct __user *user_vm86; + struct pt_regs regs32; + unsigned long veflags; + unsigned long veflags_mask; + unsigned long saved_sp0; + unsigned long flags; unsigned long screen_bitmap; unsigned long cpu_type; struct revectored_struct int_revectored; struct revectored_struct int21_revectored; struct vm86plus_info_struct vm86plus; - struct pt_regs *regs32; /* here we save the pointer to the old regs */ -/* - * The below is not part of the structure, but the stack layout continues - * this way. In front of 'return-eip' may be some data, depending on - * compilation, so we don't rely on this and save the pointer to 'oldregs' - * in 'regs32' above. 
- * However, with GCC-2.7.2 and the current CFLAGS you see exactly this: - - long return-eip; from call to vm86() - struct pt_regs oldregs; user space registers as saved by syscall - */ }; #ifdef CONFIG_VM86 void handle_vm86_fault(struct kernel_vm86_regs *, long); int handle_vm86_trap(struct kernel_vm86_regs *, long, int); -struct pt_regs *save_v86_state(struct kernel_vm86_regs *); +void save_v86_state(struct kernel_vm86_regs *, int); struct task_struct; + +#define free_vm86(t) do { \ + struct thread_struct *__t = (t); \ + if (__t->vm86 != NULL) { \ + kfree(__t->vm86); \ + __t->vm86 = NULL; \ + } \ +} while (0) + +/* + * Support for VM86 programs to request interrupts for + * real mode hardware drivers: + */ +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 + +static inline int invalid_vm86_irq(int irq) +{ + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; +} + void release_vm86_irqs(struct task_struct *); #else @@ -77,6 +82,10 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c) return 0; } +static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { } + +#define free_vm86(t) do { } while(0) + #endif /* CONFIG_VM86 */ #endif /* _ASM_X86_VM86_H */ diff --git a/kernel/arch/x86/include/asm/vmx.h b/kernel/arch/x86/include/asm/vmx.h index da772edd1..14c63c7e8 100644 --- a/kernel/arch/x86/include/asm/vmx.h +++ b/kernel/arch/x86/include/asm/vmx.h @@ -47,6 +47,7 @@ #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_MONITOR_TRAP_FLAG 0x08000000 #define CPU_BASED_USE_MSR_BITMAPS 0x10000000 #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 @@ -71,7 +72,8 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 - +#define SECONDARY_EXEC_PCOMMIT 0x00200000 +#define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 @@ -166,6 +168,8 @@ enum vmcs_field { VMWRITE_BITMAP = 0x00002028, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, + TSC_MULTIPLIER = 0x00002032, + TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, @@ -367,29 +371,29 @@ enum vmcs_field { #define TYPE_PHYSICAL_APIC_EVENT (10 << 12) #define TYPE_PHYSICAL_APIC_INST (15 << 12) -/* segment AR */ -#define SEGMENT_AR_L_MASK (1 << 13) - -#define AR_TYPE_ACCESSES_MASK 1 -#define AR_TYPE_READABLE_MASK (1 << 1) -#define AR_TYPE_WRITEABLE_MASK (1 << 2) -#define AR_TYPE_CODE_MASK (1 << 3) -#define AR_TYPE_MASK 0x0f -#define AR_TYPE_BUSY_64_TSS 11 -#define AR_TYPE_BUSY_32_TSS 11 -#define AR_TYPE_BUSY_16_TSS 3 -#define AR_TYPE_LDT 2 - -#define AR_UNUSABLE_MASK (1 << 16) -#define AR_S_MASK (1 << 4) -#define AR_P_MASK (1 << 7) -#define AR_L_MASK (1 << 13) -#define AR_DB_MASK (1 << 14) -#define AR_G_MASK (1 << 15) -#define AR_DPL_SHIFT 5 -#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) - -#define AR_RESERVD_MASK 0xfffe0f00 +/* segment AR in VMCS -- these are different from what LAR reports */ +#define VMX_SEGMENT_AR_L_MASK (1 << 13) + +#define VMX_AR_TYPE_ACCESSES_MASK 1 +#define VMX_AR_TYPE_READABLE_MASK (1 << 1) +#define VMX_AR_TYPE_WRITEABLE_MASK (1 << 2) +#define VMX_AR_TYPE_CODE_MASK (1 << 3) +#define VMX_AR_TYPE_MASK 0x0f +#define VMX_AR_TYPE_BUSY_64_TSS 11 +#define VMX_AR_TYPE_BUSY_32_TSS 11 +#define 
VMX_AR_TYPE_BUSY_16_TSS 3 +#define VMX_AR_TYPE_LDT 2 + +#define VMX_AR_UNUSABLE_MASK (1 << 16) +#define VMX_AR_S_MASK (1 << 4) +#define VMX_AR_P_MASK (1 << 7) +#define VMX_AR_L_MASK (1 << 13) +#define VMX_AR_DB_MASK (1 << 14) +#define VMX_AR_G_MASK (1 << 15) +#define VMX_AR_DPL_SHIFT 5 +#define VMX_AR_DPL(ar) (((ar) >> VMX_AR_DPL_SHIFT) & 3) + +#define VMX_AR_RESERVD_MASK 0xfffe0f00 #define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0) #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1) @@ -415,6 +419,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ diff --git a/kernel/arch/x86/include/asm/x86_init.h b/kernel/arch/x86/include/asm/x86_init.h index f58a9c7a3..cd0fc0cc7 100644 --- a/kernel/arch/x86/include/asm/x86_init.h +++ b/kernel/arch/x86/include/asm/x86_init.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H -#include <asm/pgtable_types.h> #include <asm/bootparam.h> struct mpc_bus; @@ -171,38 +170,17 @@ struct x86_platform_ops { }; struct pci_dev; -struct msi_msg; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); - void (*compose_msi_msg)(struct pci_dev *dev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, - u8 hpet_id); void (*teardown_msi_irq)(unsigned int irq); void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); - int (*setup_hpet_msi)(unsigned int irq, unsigned int id); }; -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_data; -struct cpumask; - struct x86_io_apic_ops { - void (*init) (void); unsigned int (*read) (unsigned int apic, unsigned int reg); - void (*write) (unsigned int apic, unsigned int reg, unsigned int value); - void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); - void (*print_entries)(unsigned int apic, unsigned int nr_entries); - int (*set_affinity)(struct irq_data *data, - const struct cpumask *mask, - bool force); - int (*setup_entry)(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr); - void (*eoi_ioapic_pin)(int apic, int pin, int vector); }; extern struct x86_init_ops x86_init; diff --git a/kernel/arch/x86/include/asm/xcr.h b/kernel/arch/x86/include/asm/xcr.h deleted file mode 100644 index f2cba4e79..000000000 --- a/kernel/arch/x86/include/asm/xcr.h +++ /dev/null @@ -1,49 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2008 rPath, Inc. - All Rights Reserved - * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2 or (at your - * option) any later version; incorporated herein by reference. 
- * - * ----------------------------------------------------------------------- */ - -/* - * asm-x86/xcr.h - * - * Definitions for the eXtended Control Register instructions - */ - -#ifndef _ASM_X86_XCR_H -#define _ASM_X86_XCR_H - -#define XCR_XFEATURE_ENABLED_MASK 0x00000000 - -#ifdef __KERNEL__ -# ifndef __ASSEMBLY__ - -#include <linux/types.h> - -static inline u64 xgetbv(u32 index) -{ - u32 eax, edx; - - asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ - : "=a" (eax), "=d" (edx) - : "c" (index)); - return eax + ((u64)edx << 32); -} - -static inline void xsetbv(u32 index, u64 value) -{ - u32 eax = value; - u32 edx = value >> 32; - - asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ - : : "a" (eax), "d" (edx), "c" (index)); -} - -# endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_XCR_H */ diff --git a/kernel/arch/x86/include/asm/xen/events.h b/kernel/arch/x86/include/asm/xen/events.h index 608a79d5a..e6911caf5 100644 --- a/kernel/arch/x86/include/asm/xen/events.h +++ b/kernel/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ +static inline bool xen_support_evtchn_rebind(void) +{ + return (!xen_hvm_domain() || xen_have_vector_callback); +} + #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/kernel/arch/x86/include/asm/xen/hypercall.h b/kernel/arch/x86/include/asm/xen/hypercall.h index ca08a27b9..4c20dd333 100644 --- a/kernel/arch/x86/include/asm/xen/hypercall.h +++ b/kernel/arch/x86/include/asm/xen/hypercall.h @@ -336,10 +336,10 @@ HYPERVISOR_update_descriptor(u64 ma, u64 desc) return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); } -static inline int +static inline long HYPERVISOR_memory_op(unsigned int cmd, void *arg) { - return _hypercall2(int, memory_op, cmd, arg); + return _hypercall2(long, memory_op, cmd, arg); } static inline int @@ -465,6 +465,12 @@ HYPERVISOR_tmem_op( return _hypercall1(int, tmem_op, op); } +static inline int +HYPERVISOR_xenpmu_op(unsigned int op, void *arg) +{ + return _hypercall2(int, xenpmu_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/kernel/arch/x86/include/asm/xen/hypervisor.h b/kernel/arch/x86/include/asm/xen/hypervisor.h index d866959e5..8b2d4bea9 100644 --- a/kernel/arch/x86/include/asm/xen/hypervisor.h +++ b/kernel/arch/x86/include/asm/xen/hypervisor.h @@ -57,4 +57,9 @@ static inline bool xen_x2apic_para_available(void) } #endif +#ifdef CONFIG_HOTPLUG_CPU +void xen_arch_register_cpu(int num); +void xen_arch_unregister_cpu(int num); +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/kernel/arch/x86/include/asm/xen/interface.h b/kernel/arch/x86/include/asm/xen/interface.h index 3400dbaec..62ca03ef5 100644 --- a/kernel/arch/x86/include/asm/xen/interface.h +++ b/kernel/arch/x86/include/asm/xen/interface.h @@ -3,12 +3,38 @@ * * Guest OS interface to x86 Xen. 
* - * Copyright (c) 2004, K A Fraser + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser */ #ifndef _ASM_X86_XEN_INTERFACE_H #define _ASM_X86_XEN_INTERFACE_H +/* + * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field + * in a struct in memory. + * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an + * hypercall argument. + * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but + * they might not be on other architectures. + */ #ifdef __XEN__ #define __DEFINE_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name @@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t); * start of the GDT because some stupid OSes export hard-coded selector values * in their ABI. These hard-coded values are always near the start of the GDT, * so Xen places itself out of the way, at the far end of the GDT. + * + * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op */ #define FIRST_RESERVED_GDT_PAGE 14 #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) /* - * Send an array of these to HYPERVISOR_set_trap_table() + * Send an array of these to HYPERVISOR_set_trap_table(). + * Terminate the array with a sentinel entry, with traps[].address==0. * The privilege level specifies which modes may enter a trap via a software * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate * privilege levels as follows: @@ -118,10 +147,41 @@ struct trap_info { DEFINE_GUEST_HANDLE_STRUCT(trap_info); struct arch_shared_info { - unsigned long max_pfn; /* max pfn that appears in table */ - /* Frame containing list of mfns containing list of mfns containing p2m. */ - unsigned long pfn_to_mfn_frame_list_list; - unsigned long nmi_reason; + /* + * Number of valid entries in the p2m table(s) anchored at + * pfn_to_mfn_frame_list_list and/or p2m_vaddr. + */ + unsigned long max_pfn; + /* + * Frame containing list of mfns containing list of mfns containing p2m. + * A value of 0 indicates it has not yet been set up, ~0 indicates it + * has been set to invalid e.g. due to the p2m being too large for the + * 3-level p2m tree. In this case the linear mapper p2m list anchored + * at p2m_vaddr is to be used. + */ + xen_pfn_t pfn_to_mfn_frame_list_list; + unsigned long nmi_reason; + /* + * Following three fields are valid if p2m_cr3 contains a value + * different from 0. 
+ * p2m_cr3 is the root of the address space where p2m_vaddr is valid. + * p2m_cr3 is in the same format as a cr3 value in the vcpu register + * state and holds the folded machine frame number (via xen_pfn_to_cr3) + * of a L3 or L4 page table. + * p2m_vaddr holds the virtual address of the linear p2m list. All + * entries in the range [0...max_pfn[ are accessible via this pointer. + * p2m_generation will be incremented by the guest before and after each + * change of the mappings of the p2m list. p2m_generation starts at 0 + * and a value with the least significant bit set indicates that a + * mapping update is in progress. This allows guest external software + * (e.g. in Dom0) to verify that read mappings are consistent and + * whether they have changed since the last check. + * Modifying a p2m element in the linear p2m list is allowed via an + * atomic write only. + */ + unsigned long p2m_cr3; /* cr3 value of the p2m address space */ + unsigned long p2m_vaddr; /* virtual address of the p2m list */ + unsigned long p2m_generation; /* generation count of p2m mapping */ }; #endif /* !__ASSEMBLY__ */ @@ -137,13 +197,31 @@ struct arch_shared_info { /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + * + * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise + * for HVM and PVH guests, not all information in this structure is updated: + * + * - For HVM guests, the structures read include: fpu_ctxt (if + * VGCT_I387_VALID is set), flags, user_regs, debugreg[*] + * + * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to + * set cr3. All other fields not used should be set to 0. */ struct vcpu_guest_context { /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ -#define VGCF_I387_VALID (1<<0) -#define VGCF_HVM_GUEST (1<<1) -#define VGCF_IN_KERNEL (1<<2) +#define VGCF_I387_VALID (1<<0) +#define VGCF_IN_KERNEL (1<<2) +#define _VGCF_i387_valid 0 +#define VGCF_i387_valid (1<<_VGCF_i387_valid) +#define _VGCF_in_kernel 2 +#define VGCF_in_kernel (1<<_VGCF_in_kernel) +#define _VGCF_failsafe_disables_events 3 +#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events) +#define _VGCF_syscall_disables_events 4 +#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events) +#define _VGCF_online 5 +#define VGCF_online (1<<_VGCF_online) unsigned long flags; /* VGCF_* flags */ struct cpu_user_regs user_regs; /* User-level CPU registers */ struct trap_info trap_ctxt[256]; /* Virtual IDT */ @@ -172,6 +250,129 @@ struct vcpu_guest_context { #endif }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); + +/* AMD PMU registers and structures */ +struct xen_pmu_amd_ctxt { + /* + * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd). + * For PV(H) guests these fields are RO. + */ + uint32_t counters; + uint32_t ctrls; + + /* Counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Intel PMU registers and structures */ +struct xen_pmu_cntr_pair { + uint64_t counter; + uint64_t control; +}; + +struct xen_pmu_intel_ctxt { + /* + * Offsets to fixed and architectural counter MSRs (relative to + * xen_pmu_arch.c.intel). + * For PV(H) guests these fields are RO. 
+ */ + uint32_t fixed_counters; + uint32_t arch_counters; + + /* PMU registers */ + uint64_t global_ctrl; + uint64_t global_ovf_ctrl; + uint64_t global_status; + uint64_t fixed_ctrl; + uint64_t ds_area; + uint64_t pebs_enable; + uint64_t debugctl; + + /* Fixed and architectural counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Sampled domain's registers */ +struct xen_pmu_regs { + uint64_t ip; + uint64_t sp; + uint64_t flags; + uint16_t cs; + uint16_t ss; + uint8_t cpl; + uint8_t pad[3]; +}; + +/* PMU flags */ +#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */ +#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */ +#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */ +#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */ + +/* + * Architecture-specific information describing state of the processor at + * the time of PMU interrupt. + * Fields of this structure marked as RW for guest should only be written by + * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the + * hypervisor during PMU interrupt). Hypervisor will read updated data in + * XENPMU_flush hypercall and clear PMU_CACHED bit. + */ +struct xen_pmu_arch { + union { + /* + * Processor's registers at the time of interrupt. + * WO for hypervisor, RO for guests. + */ + struct xen_pmu_regs regs; + /* + * Padding for adding new registers to xen_pmu_regs in + * the future + */ +#define XENPMU_REGS_PAD_SZ 64 + uint8_t pad[XENPMU_REGS_PAD_SZ]; + } r; + + /* WO for hypervisor, RO for guest */ + uint64_t pmu_flags; + + /* + * APIC LVTPC register. + * RW for both hypervisor and guest. + * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware + * during XENPMU_flush or XENPMU_lvtpc_set. + */ + union { + uint32_t lapic_lvtpc; + uint64_t pad; + } l; + + /* + * Vendor-specific PMU registers. + * RW for both hypervisor and guest (see exceptions above). 
+ * Guest's updates to this field are verified and then loaded by the + * hypervisor into hardware during XENPMU_flush + */ + union { + struct xen_pmu_amd_ctxt amd; + struct xen_pmu_intel_ctxt intel; + + /* + * Padding for contexts (fixed parts only, does not include + * MSR banks that are specified by offsets) + */ +#define XENPMU_CTXT_PAD_SZ 128 + uint8_t pad[XENPMU_CTXT_PAD_SZ]; + } c; +}; + #endif /* !__ASSEMBLY__ */ /* diff --git a/kernel/arch/x86/include/asm/xen/page.h b/kernel/arch/x86/include/asm/xen/page.h index c44a5d53e..f5fb840b4 100644 --- a/kernel/arch/x86/include/asm/xen/page.h +++ b/kernel/arch/x86/include/asm/xen/page.h @@ -12,7 +12,7 @@ #include <asm/pgtable.h> #include <xen/interface/xen.h> -#include <xen/grant_table.h> +#include <xen/interface/grant_table.h> #include <xen/features.h> /* Xen machine address */ @@ -35,9 +35,7 @@ typedef struct xpaddr { #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) -/* Maximum amount of memory we can handle in a domain in pages */ -#define MAX_DOMAIN_PAGES \ - ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) extern unsigned long *machine_to_phys_mapping; extern unsigned long machine_to_phys_nr; @@ -45,11 +43,13 @@ extern unsigned long *xen_p2m_addr; extern unsigned long xen_p2m_size; extern unsigned long xen_max_p2m_pfn; +extern int xen_alloc_p2m_entry(unsigned long pfn); + extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -extern unsigned long set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); +extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, @@ -103,6 +103,11 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; + /* + * Some x86 code is still using pfn_to_mfn instead of + * pfn_to_gfn. This will have to be removed when we figure + * out which call. + */ if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; @@ -149,6 +154,11 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) { unsigned long pfn; + /* + * Some x86 code is still using mfn_to_pfn instead of + * gfn_to_pfn. This will have to be removed when we figure + * out which call. + */ if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; @@ -178,6 +188,27 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); } +/* Pseudo-physical <-> Guest conversion */ +static inline unsigned long pfn_to_gfn(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + else + return pfn_to_mfn(pfn); +} + +static inline unsigned long gfn_to_pfn(unsigned long gfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return gfn; + else + return mfn_to_pfn(gfn); +} + +/* Pseudo-physical <-> Bus conversion */ +#define pfn_to_bfn(pfn) pfn_to_gfn(pfn) +#define bfn_to_pfn(bfn) gfn_to_pfn(bfn) + /* * We detect special mappings in one of two ways: * 1. If the MFN is an I/O page then Xen will set the m2p entry @@ -198,7 +229,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) * require.
In all the cases we care about, the FOREIGN_FRAME bit is * masked (e.g., pfn_to_mfn()) so behaviour there is correct. */ -static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +static inline unsigned long bfn_to_local_pfn(unsigned long mfn) { unsigned long pfn; @@ -217,6 +248,10 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) +/* VIRT <-> GUEST conversion */ +#define virt_to_gfn(v) (pfn_to_gfn(virt_to_pfn(v))) +#define gfn_to_virt(g) (__va(gfn_to_pfn(g) << PAGE_SHIFT)) + static inline unsigned long pte_mfn(pte_t pte) { return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; @@ -263,8 +298,8 @@ void make_lowmem_page_readwrite(void *vaddr); #define xen_unmap(cookie) iounmap((cookie)) static inline bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long mfn) + phys_addr_t phys, + dma_addr_t dev_addr) { return false; } diff --git a/kernel/arch/x86/include/asm/xor.h b/kernel/arch/x86/include/asm/xor.h index d8829751b..1f5c5161e 100644 --- a/kernel/arch/x86/include/asm/xor.h +++ b/kernel/arch/x86/include/asm/xor.h @@ -36,7 +36,7 @@ * no advantages to be gotten from x86-64 here anyways. */ -#include <asm/i387.h> +#include <asm/fpu/api.h> #ifdef CONFIG_X86_32 /* reduce register pressure */ diff --git a/kernel/arch/x86/include/asm/xor_32.h b/kernel/arch/x86/include/asm/xor_32.h index ce05722e3..5a08bc8bf 100644 --- a/kernel/arch/x86/include/asm/xor_32.h +++ b/kernel/arch/x86/include/asm/xor_32.h @@ -26,7 +26,7 @@ #define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" #define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" -#include <asm/i387.h> +#include <asm/fpu/api.h> static void xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) diff --git a/kernel/arch/x86/include/asm/xor_avx.h b/kernel/arch/x86/include/asm/xor_avx.h index 492b29802..7c0a517ec 100644 --- a/kernel/arch/x86/include/asm/xor_avx.h +++ b/kernel/arch/x86/include/asm/xor_avx.h @@ -18,7 +18,7 @@ #ifdef CONFIG_AS_AVX #include <linux/compiler.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #define BLOCK4(i) \ BLOCK(32 * i, 0) \ diff --git a/kernel/arch/x86/include/asm/xsave.h b/kernel/arch/x86/include/asm/xsave.h deleted file mode 100644 index c9a6d68b8..000000000 --- a/kernel/arch/x86/include/asm/xsave.h +++ /dev/null @@ -1,257 +0,0 @@ -#ifndef __ASM_X86_XSAVE_H -#define __ASM_X86_XSAVE_H - -#include <linux/types.h> -#include <asm/processor.h> - -#define XSTATE_CPUID 0x0000000d - -#define XSTATE_FP 0x1 -#define XSTATE_SSE 0x2 -#define XSTATE_YMM 0x4 -#define XSTATE_BNDREGS 0x8 -#define XSTATE_BNDCSR 0x10 -#define XSTATE_OPMASK 0x20 -#define XSTATE_ZMM_Hi256 0x40 -#define XSTATE_Hi16_ZMM 0x80 - -#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) -#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) -/* Bit 63 of XCR0 is reserved for future expansion */ -#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) - -#define FXSAVE_SIZE 512 - -#define XSAVE_HDR_SIZE 64 -#define XSAVE_HDR_OFFSET FXSAVE_SIZE - -#define XSAVE_YMM_SIZE 256 -#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) - -/* Supported features which support lazy state saving */ -#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) - -/* Supported features which require eager state saving */ -#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) - -/* All currently supported features */ -#define XCNTXT_MASK 
(XSTATE_LAZY | XSTATE_EAGER) - -#ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " -#else -#define REX_PREFIX -#endif - -extern unsigned int xstate_size; -extern u64 pcntxt_mask; -extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern struct xsave_struct *init_xstate_buf; - -extern void xsave_init(void); -extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); -extern int init_fpu(struct task_struct *child); - -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" - -#define xstate_fault ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err) - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XSAVES"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XSAVE"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XRSTORS"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XRSTOR"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * Save processor xstate to xsave area. - */ -static inline int xsave_state(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - /* - * If xsaves is enabled, xsaves replaces xsaveopt because - * it supports compact format and supervisor states in addition to - * modified optimization in xsaveopt. - * - * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave - * because xsaveopt supports modified optimization which is not - * supported by xsave. - * - * If none of xsaves and xsaveopt is enabled, use xsave. - */ - alternative_input_2( - "1:"XSAVE, - XSAVEOPT, - X86_FEATURE_XSAVEOPT, - XSAVES, - X86_FEATURE_XSAVES, - [fx] "D" (fx), "a" (lmask), "d" (hmask) : - "memory"); - asm volatile("2:\n\t" - xstate_fault - : "0" (0) - : "memory"); - - return err; -} - -/* - * Restore processor xstate from xsave area. - */ -static inline int xrstor_state(struct xsave_struct *fx, u64 mask) -{ - int err = 0; - u32 lmask = mask; - u32 hmask = mask >> 32; - - /* - * Use xrstors to restore context if it is enabled. xrstors supports - * compacted format of xsave area which is not supported by xrstor. 
- */ - alternative_input( - "1: " XRSTOR, - XRSTORS, - X86_FEATURE_XSAVES, - "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - - asm volatile("2:\n" - xstate_fault - : "0" (0) - : "memory"); - - return err; -} - -/* - * Save xstate context for old process during context switch. - */ -static inline void fpu_xsave(struct fpu *fpu) -{ - xsave_state(&fpu->state->xsave, -1); -} - -/* - * Restore xstate context for new process during context switch. - */ -static inline int fpu_xrstor_checking(struct xsave_struct *fx) -{ - return xrstor_state(fx, -1); -} - -/* - * Save xstate to user space xsave area. - * - * We don't use modified optimization because xrstor/xrstors might track - * a different application. - * - * We don't use compacted format xsave area for - * backward compatibility for old applications which don't understand - * compacted format of xsave area. - */ -static inline int xsave_user(struct xsave_struct __user *buf) -{ - int err; - - /* - * Clear the xsave header first, so that reserved fields are - * initialized to zero. - */ - err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr)); - if (unlikely(err)) - return -EFAULT; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XSAVE"\n" - "2: " ASM_CLAC "\n" - xstate_fault - : "D" (buf), "a" (-1), "d" (-1), "0" (0) - : "memory"); - return err; -} - -/* - * Restore xstate from user space xsave area. - */ -static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) -{ - int err = 0; - struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); - u32 lmask = mask; - u32 hmask = mask >> 32; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XRSTOR"\n" - "2: " ASM_CLAC "\n" - xstate_fault - : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) - : "memory"); /* memory required? */ - return err; -} - -void *get_xsave_addr(struct xsave_struct *xsave, int xstate); -void setup_xstate_comp(void); - -#endif diff --git a/kernel/arch/x86/include/uapi/asm/bitsperlong.h b/kernel/arch/x86/include/uapi/asm/bitsperlong.h index b0ae1c4dc..217909b4d 100644 --- a/kernel/arch/x86/include/uapi/asm/bitsperlong.h +++ b/kernel/arch/x86/include/uapi/asm/bitsperlong.h @@ -1,7 +1,7 @@ #ifndef __ASM_X86_BITSPERLONG_H #define __ASM_X86_BITSPERLONG_H -#ifdef __x86_64__ +#if defined(__x86_64__) && !defined(__ILP32__) # define __BITS_PER_LONG 64 #else # define __BITS_PER_LONG 32 diff --git a/kernel/arch/x86/include/uapi/asm/bootparam.h b/kernel/arch/x86/include/uapi/asm/bootparam.h index ab456dc23..329254373 100644 --- a/kernel/arch/x86/include/uapi/asm/bootparam.h +++ b/kernel/arch/x86/include/uapi/asm/bootparam.h @@ -120,7 +120,7 @@ struct boot_params { __u8 _pad3[16]; /* 0x070 */ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ - struct sys_desc_table sys_desc_table; /* 0x0a0 */ + struct sys_desc_table sys_desc_table; /* obsolete! */ /* 0x0a0 */ struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */ __u32 ext_ramdisk_image; /* 0x0c0 */ __u32 ext_ramdisk_size; /* 0x0c4 */ diff --git a/kernel/arch/x86/include/uapi/asm/e820.h b/kernel/arch/x86/include/uapi/asm/e820.h index 960a8a9dc..9dafe59cf 100644 --- a/kernel/arch/x86/include/uapi/asm/e820.h +++ b/kernel/arch/x86/include/uapi/asm/e820.h @@ -32,11 +32,12 @@ #define E820_ACPI 3 #define E820_NVS 4 #define E820_UNUSABLE 5 +#define E820_PMEM 7 /* * This is a non-standardized way to represent ADR or NVDIMM regions that * persist over a reboot. The kernel will ignore their special capabilities - * unless the CONFIG_X86_PMEM_LEGACY=y option is set. 
+ * unless the CONFIG_X86_PMEM_LEGACY option is set. * * ( Note that older platforms also used 6 for the same type of memory, * but newer versions switched to 12 as 6 was assigned differently. Some diff --git a/kernel/arch/x86/include/uapi/asm/hyperv.h b/kernel/arch/x86/include/uapi/asm/hyperv.h index ce6068dbc..040d4083c 100644 --- a/kernel/arch/x86/include/uapi/asm/hyperv.h +++ b/kernel/arch/x86/include/uapi/asm/hyperv.h @@ -27,6 +27,8 @@ #define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ #define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +/* Partition reference TSC MSR is available */ +#define HV_X64_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) /* A partition's reference time stamp counter (TSC) page */ #define HV_X64_MSR_REFERENCE_TSC 0x40000021 @@ -108,6 +110,8 @@ #define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) /* Support for a virtual guest idle state is available */ #define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) +/* Guest crash data handler available */ +#define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10) /* * Implementation recommendations. Indicates which behaviors the hypervisor @@ -149,6 +153,12 @@ /* MSR used to provide vcpu index */ #define HV_X64_MSR_VP_INDEX 0x40000002 +/* MSR used to reset the guest OS. */ +#define HV_X64_MSR_RESET 0x40000003 + +/* MSR used to provide vcpu runtime in 100ns units */ +#define HV_X64_MSR_VP_RUNTIME 0x40000010 + /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 @@ -199,6 +209,17 @@ #define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 #define HV_X64_MSR_STIMER3_COUNT 0x400000B7 +/* Hyper-V guest crash notification MSR's */ +#define HV_X64_MSR_CRASH_P0 0x40000100 +#define HV_X64_MSR_CRASH_P1 0x40000101 +#define HV_X64_MSR_CRASH_P2 0x40000102 +#define HV_X64_MSR_CRASH_P3 0x40000103 +#define HV_X64_MSR_CRASH_P4 0x40000104 +#define HV_X64_MSR_CRASH_CTL 0x40000105 +#define HV_X64_MSR_CRASH_CTL_NOTIFY (1ULL << 63) +#define HV_X64_MSR_CRASH_PARAMS \ + (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) + #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ @@ -236,4 +257,16 @@ typedef struct _HV_REFERENCE_TSC_PAGE { __s64 tsc_offset; } HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; +/* Define the number of synthetic interrupt sources. */ +#define HV_SYNIC_SINT_COUNT (16) +/* Define the expected SynIC version. 
*/ +#define HV_SYNIC_VERSION_1 (0x1) + +#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) +#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) +#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) +#define HV_SYNIC_SINT_MASKED (1ULL << 16) +#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) +#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) + #endif diff --git a/kernel/arch/x86/include/uapi/asm/kvm.h b/kernel/arch/x86/include/uapi/asm/kvm.h index d7dcef58a..cd54147cb 100644 --- a/kernel/arch/x86/include/uapi/asm/kvm.h +++ b/kernel/arch/x86/include/uapi/asm/kvm.h @@ -106,6 +106,8 @@ struct kvm_ioapic_state { #define KVM_IRQCHIP_IOAPIC 2 #define KVM_NR_IRQCHIPS 3 +#define KVM_RUN_X86_SMM (1 << 0) + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ @@ -281,6 +283,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 +#define KVM_VCPUEVENT_VALID_SMM 0x00000008 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -309,7 +312,13 @@ struct kvm_vcpu_events { } nmi; __u32 sipi_vector; __u32 flags; - __u32 reserved[10]; + struct { + __u8 smm; + __u8 pending; + __u8 smm_inside_nmi; + __u8 latched_init; + } smi; + __u32 reserved[9]; }; /* for KVM_GET/SET_DEBUGREGS */ @@ -345,4 +354,7 @@ struct kvm_xcrs { struct kvm_sync_regs { }; +#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) +#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) + #endif /* _ASM_X86_KVM_H */ diff --git a/kernel/arch/x86/include/uapi/asm/mce.h b/kernel/arch/x86/include/uapi/asm/mce.h index a0eab85ce..03429da2f 100644 --- a/kernel/arch/x86/include/uapi/asm/mce.h +++ b/kernel/arch/x86/include/uapi/asm/mce.h @@ -2,7 +2,7 @@ #define _UAPI_ASM_X86_MCE_H #include <linux/types.h> -#include <asm/ioctls.h> +#include <linux/ioctl.h> /* Fields are zero when not available */ struct mce { @@ -15,7 +15,8 @@ struct mce { __u64 time; /* wall time_t when error was detected */ __u8 cpuvendor; /* cpu vendor as encoded in system.h */ __u8 inject_flags; /* software inject flags */ - __u16 pad; + __u8 severity; + __u8 usable_addr; __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ diff --git a/kernel/arch/x86/include/uapi/asm/msr.h b/kernel/arch/x86/include/uapi/asm/msr.h index 155e51048..c41f4fe25 100644 --- a/kernel/arch/x86/include/uapi/asm/msr.h +++ b/kernel/arch/x86/include/uapi/asm/msr.h @@ -1,8 +1,6 @@ #ifndef _UAPI_ASM_X86_MSR_H #define _UAPI_ASM_X86_MSR_H -#include <asm/msr-index.h> - #ifndef __ASSEMBLY__ #include <linux/types.h> diff --git a/kernel/arch/x86/include/uapi/asm/mtrr.h b/kernel/arch/x86/include/uapi/asm/mtrr.h index d0acb658c..7528dcf59 100644 --- a/kernel/arch/x86/include/uapi/asm/mtrr.h +++ b/kernel/arch/x86/include/uapi/asm/mtrr.h @@ -103,7 +103,7 @@ struct mtrr_state_type { #define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry) #define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry) -/* These are the region types */ +/* MTRR memory types, which are defined in SDM */ #define MTRR_TYPE_UNCACHABLE 0 #define MTRR_TYPE_WRCOMB 1 /*#define MTRR_TYPE_ 2*/ @@ -113,5 +113,11 @@ struct mtrr_state_type { #define MTRR_TYPE_WRBACK 6 #define MTRR_NUM_TYPES 7 +/* + * Invalid MTRR memory type. mtrr_type_lookup() returns this value when + * MTRRs are disabled. Note, this value is allocated from the reserved + * values (0x7-0xff) of the MTRR memory types. 
+ */ +#define MTRR_TYPE_INVALID 0xff #endif /* _UAPI_ASM_X86_MTRR_H */ diff --git a/kernel/arch/x86/include/uapi/asm/processor-flags.h b/kernel/arch/x86/include/uapi/asm/processor-flags.h index 180a0c3c2..79887abcb 100644 --- a/kernel/arch/x86/include/uapi/asm/processor-flags.h +++ b/kernel/arch/x86/include/uapi/asm/processor-flags.h @@ -37,8 +37,6 @@ #define X86_EFLAGS_VM _BITUL(X86_EFLAGS_VM_BIT) #define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ #define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) -#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ -#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) #define X86_EFLAGS_VIF_BIT 19 /* Virtual Interrupt Flag */ #define X86_EFLAGS_VIF _BITUL(X86_EFLAGS_VIF_BIT) #define X86_EFLAGS_VIP_BIT 20 /* Virtual Interrupt Pending */ diff --git a/kernel/arch/x86/include/uapi/asm/sigcontext.h b/kernel/arch/x86/include/uapi/asm/sigcontext.h index d8b9f9081..d485232f1 100644 --- a/kernel/arch/x86/include/uapi/asm/sigcontext.h +++ b/kernel/arch/x86/include/uapi/asm/sigcontext.h @@ -1,221 +1,360 @@ #ifndef _UAPI_ASM_X86_SIGCONTEXT_H #define _UAPI_ASM_X86_SIGCONTEXT_H +/* + * Linux signal context definitions. The sigcontext includes a complex + * hierarchy of CPU and FPU state, available to user-space (on the stack) when + * a signal handler is executed. + * + * As over the years this ABI grew from its very simple roots towards + * supporting more and more CPU state organically, some of the details (which + * were rather clever hacks back in the days) became a bit quirky by today. + * + * The current ABI includes flexible provisions for future extensions, so we + * won't have to grow new quirks for quite some time. Promise! + */ + #include <linux/compiler.h> #include <linux/types.h> -#define FP_XSTATE_MAGIC1 0x46505853U -#define FP_XSTATE_MAGIC2 0x46505845U -#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) +#define FP_XSTATE_MAGIC1 0x46505853U +#define FP_XSTATE_MAGIC2 0x46505845U +#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) /* - * bytes 464..511 in the current 512byte layout of fxsave/fxrstor frame - * are reserved for SW usage. On cpu's supporting xsave/xrstor, these bytes - * are used to extended the fpstate pointer in the sigcontext, which now - * includes the extended state information along with fpstate information. + * Bytes 464..511 in the current 512-byte layout of the FXSAVE/FXRSTOR frame + * are reserved for SW usage. On CPUs supporting XSAVE/XRSTOR, these bytes are + * used to extend the fpstate pointer in the sigcontext, which now includes the + * extended state information along with fpstate information. + * + * If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then there's a + * sw_reserved.extended_size bytes large extended context area present. (The + * last 32-bit word of this extended area (at the + * fpstate+extended_size-FP_XSTATE_MAGIC2_SIZE address) is set to + * FP_XSTATE_MAGIC2 so that you can sanity check your size calculations.) * - * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved - * area and FP_XSTATE_MAGIC2 at the end of memory layout - * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the - * extended state information in the memory layout pointed by the fpstate - * pointer in sigcontext. + * This extended area typically grows with newer CPUs that have larger and + * larger XSAVE areas. */ struct _fpx_sw_bytes { - __u32 magic1; /* FP_XSTATE_MAGIC1 */ - __u32 extended_size; /* total size of the layout referred by - * fpstate pointer in the sigcontext. 
- */ - __u64 xstate_bv; - /* feature bit mask (including fp/sse/extended - * state) that is present in the memory - * layout. - */ - __u32 xstate_size; /* actual xsave state size, based on the - * features saved in the layout. - * 'extended_size' will be greater than - * 'xstate_size'. - */ - __u32 padding[7]; /* for future use. */ + /* + * If set to FP_XSTATE_MAGIC1 then this is an xstate context. + * 0 if a legacy frame. + */ + __u32 magic1; + + /* + * Total size of the fpstate area: + * + * - if magic1 == 0 then it's sizeof(struct _fpstate) + * - if magic1 == FP_XSTATE_MAGIC1 then it's sizeof(struct _xstate) + * plus extensions (if any) + */ + __u32 extended_size; + + /* + * Feature bit mask (including FP/SSE/extended state) that is present + * in the memory layout: + */ + __u64 xfeatures; + + /* + * Actual XSAVE state size, based on the xfeatures saved in the layout. + * 'extended_size' is greater than 'xstate_size': + */ + __u32 xstate_size; + + /* For future use: */ + __u32 padding[7]; }; -#ifdef __i386__ /* - * As documented in the iBCS2 standard.. - * - * The first part of "struct _fpstate" is just the normal i387 - * hardware setup, the extra "status" word is used to save the - * coprocessor status word before entering the handler. + * As documented in the iBCS2 standard: * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 + * The first part of "struct _fpstate" is just the normal i387 hardware setup, + * the extra "status" word is used to save the coprocessor status word before + * entering the handler. * - * The FPU state data structure has had to grow to accommodate the - * extended FPU state required by the Streaming SIMD Extensions. - * There is no documented standard to accomplish this at the moment. + * The FPU state data structure has had to grow to accommodate the extended FPU + * state required by the Streaming SIMD Extensions. There is no documented + * standard to accomplish this at the moment. 
*/ + +/* 10-byte legacy floating point register: */ struct _fpreg { - unsigned short significand[4]; - unsigned short exponent; + __u16 significand[4]; + __u16 exponent; }; +/* 16-byte floating point register: */ struct _fpxreg { - unsigned short significand[4]; - unsigned short exponent; - unsigned short padding[3]; + __u16 significand[4]; + __u16 exponent; + __u16 padding[3]; }; +/* 16-byte XMM register: */ struct _xmmreg { - unsigned long element[4]; + __u32 element[4]; }; -struct _fpstate { - /* Regular FPU environment */ - unsigned long cw; - unsigned long sw; - unsigned long tag; - unsigned long ipoff; - unsigned long cssel; - unsigned long dataoff; - unsigned long datasel; - struct _fpreg _st[8]; - unsigned short status; - unsigned short magic; /* 0xffff = regular FPU data only */ +#define X86_FXSR_MAGIC 0x0000 + +/* + * The 32-bit FPU frame: + */ +struct _fpstate_32 { + /* Legacy FPU environment: */ + __u32 cw; + __u32 sw; + __u32 tag; + __u32 ipoff; + __u32 cssel; + __u32 dataoff; + __u32 datasel; + struct _fpreg _st[8]; + __u16 status; + __u16 magic; /* 0xffff: regular FPU data only */ + /* 0x0000: FXSR FPU data */ /* FXSR FPU environment */ - unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */ - unsigned long mxcsr; - unsigned long reserved; - struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ - struct _xmmreg _xmm[8]; - unsigned long padding1[44]; + __u32 _fxsr_env[6]; /* FXSR FPU env is ignored */ + __u32 mxcsr; + __u32 reserved; + struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */ + struct _xmmreg _xmm[8]; /* First 8 XMM registers */ + union { + __u32 padding1[44]; /* Second 8 XMM registers plus padding */ + __u32 padding[44]; /* Alias name for old user-space */ + }; union { - unsigned long padding2[12]; - struct _fpx_sw_bytes sw_reserved; /* represents the extended - * state info */ + __u32 padding2[12]; + struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */ }; }; -#define X86_FXSR_MAGIC 0x0000 - -#ifndef __KERNEL__ /* - * User-space might still rely on the old definition: + * The 64-bit FPU frame. (FXSAVE format and later) + * + * Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is + * larger: 'struct _xstate'. Note that 'struct _xstate' embedds + * 'struct _fpstate' so that you can always assume the _fpstate portion + * exists so that you can check the magic value. + * + * Note2: Reserved fields may someday contain valuable data. Always + * save/restore them when you change signal frames. */ -struct sigcontext { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned long edi; - unsigned long esi; - unsigned long ebp; - unsigned long esp; - unsigned long ebx; - unsigned long edx; - unsigned long ecx; - unsigned long eax; - unsigned long trapno; - unsigned long err; - unsigned long eip; - unsigned short cs, __csh; - unsigned long eflags; - unsigned long esp_at_signal; - unsigned short ss, __ssh; - struct _fpstate __user *fpstate; - unsigned long oldmask; - unsigned long cr2; -}; -#endif /* !__KERNEL__ */ - -#else /* __i386__ */ - -/* FXSAVE frame */ -/* Note: reserved1/2 may someday contain valuable data. Always save/restore - them when you change signal frames. 
*/ -struct _fpstate { - __u16 cwd; - __u16 swd; - __u16 twd; /* Note this is not the same as the - 32bit/x87/FSAVE twd */ - __u16 fop; - __u64 rip; - __u64 rdp; - __u32 mxcsr; - __u32 mxcsr_mask; - __u32 st_space[32]; /* 8*16 bytes for each FP-reg */ - __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg */ - __u32 reserved2[12]; +struct _fpstate_64 { + __u16 cwd; + __u16 swd; + /* Note this is not the same as the 32-bit/x87/FSAVE twd: */ + __u16 twd; + __u16 fop; + __u64 rip; + __u64 rdp; + __u32 mxcsr; + __u32 mxcsr_mask; + __u32 st_space[32]; /* 8x FP registers, 16 bytes each */ + __u32 xmm_space[64]; /* 16x XMM registers, 16 bytes each */ + __u32 reserved2[12]; union { - __u32 reserved3[12]; - struct _fpx_sw_bytes sw_reserved; /* represents the extended - * state information */ + __u32 reserved3[12]; + struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */ }; }; -#ifndef __KERNEL__ +#ifdef __i386__ +# define _fpstate _fpstate_32 +#else +# define _fpstate _fpstate_64 +#endif + +struct _header { + __u64 xfeatures; + __u64 reserved1[2]; + __u64 reserved2[5]; +}; + +struct _ymmh_state { + /* 16x YMM registers, 16 bytes each: */ + __u32 ymmh_space[64]; +}; + /* - * User-space might still rely on the old definition: + * Extended state pointed to by sigcontext::fpstate. + * + * In addition to the fpstate, information encoded in _xstate::xstate_hdr + * indicates the presence of other extended state information supported + * by the CPU and kernel: */ -struct sigcontext { - __u64 r8; - __u64 r9; - __u64 r10; - __u64 r11; - __u64 r12; - __u64 r13; - __u64 r14; - __u64 r15; - __u64 rdi; - __u64 rsi; - __u64 rbp; - __u64 rbx; - __u64 rdx; - __u64 rax; - __u64 rcx; - __u64 rsp; - __u64 rip; - __u64 eflags; /* RFLAGS */ - __u16 cs; - __u16 gs; - __u16 fs; - __u16 __pad0; - __u64 err; - __u64 trapno; - __u64 oldmask; - __u64 cr2; - struct _fpstate __user *fpstate; /* zero when no FPU context */ -#ifdef __ILP32__ - __u32 __fpstate_pad; -#endif - __u64 reserved1[8]; +struct _xstate { + struct _fpstate fpstate; + struct _header xstate_hdr; + struct _ymmh_state ymmh; + /* New processor state extensions go here: */ }; -#endif /* !__KERNEL__ */ -#endif /* !__i386__ */ +/* + * The 32-bit signal frame: + */ +struct sigcontext_32 { + __u16 gs, __gsh; + __u16 fs, __fsh; + __u16 es, __esh; + __u16 ds, __dsh; + __u32 di; + __u32 si; + __u32 bp; + __u32 sp; + __u32 bx; + __u32 dx; + __u32 cx; + __u32 ax; + __u32 trapno; + __u32 err; + __u32 ip; + __u16 cs, __csh; + __u32 flags; + __u32 sp_at_signal; + __u16 ss, __ssh; -struct _xsave_hdr { - __u64 xstate_bv; - __u64 reserved1[2]; - __u64 reserved2[5]; + /* + * fpstate is really (struct _fpstate *) or (struct _xstate *) + * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved + * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end + * of extended memory layout. 
See comments at the definition of + * (struct _fpx_sw_bytes) + */ + __u32 fpstate; /* Zero when no FPU/extended context */ + __u32 oldmask; + __u32 cr2; }; -struct _ymmh_state { - /* 16 * 16 bytes for each YMMH-reg */ - __u32 ymmh_space[64]; +/* + * The 64-bit signal frame: + */ +struct sigcontext_64 { + __u64 r8; + __u64 r9; + __u64 r10; + __u64 r11; + __u64 r12; + __u64 r13; + __u64 r14; + __u64 r15; + __u64 di; + __u64 si; + __u64 bp; + __u64 bx; + __u64 dx; + __u64 ax; + __u64 cx; + __u64 sp; + __u64 ip; + __u64 flags; + __u16 cs; + __u16 gs; + __u16 fs; + __u16 __pad0; + __u64 err; + __u64 trapno; + __u64 oldmask; + __u64 cr2; + + /* + * fpstate is really (struct _fpstate *) or (struct _xstate *) + * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved + * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end + * of extended memory layout. See comments at the definition of + * (struct _fpx_sw_bytes) + */ + __u64 fpstate; /* Zero when no FPU/extended context */ + __u64 reserved1[8]; }; /* - * Extended state pointed by the fpstate pointer in the sigcontext. - * In addition to the fpstate, information encoded in the xstate_hdr - * indicates the presence of other extended state information - * supported by the processor and OS. + * Create the real 'struct sigcontext' type: */ -struct _xstate { - struct _fpstate fpstate; - struct _xsave_hdr xstate_hdr; - struct _ymmh_state ymmh; - /* new processor state extensions go here */ +#ifdef __KERNEL__ +# ifdef __i386__ +# define sigcontext sigcontext_32 +# else +# define sigcontext sigcontext_64 +# endif +#endif + +/* + * The old user-space sigcontext definition, just in case user-space still + * relies on it. The kernel definition (in asm/sigcontext.h) has unified + * field names but otherwise the same layout. + */ +#ifndef __KERNEL__ + +#define _fpstate_ia32 _fpstate_32 +#define sigcontext_ia32 sigcontext_32 + + +# ifdef __i386__ +struct sigcontext { + __u16 gs, __gsh; + __u16 fs, __fsh; + __u16 es, __esh; + __u16 ds, __dsh; + __u32 edi; + __u32 esi; + __u32 ebp; + __u32 esp; + __u32 ebx; + __u32 edx; + __u32 ecx; + __u32 eax; + __u32 trapno; + __u32 err; + __u32 eip; + __u16 cs, __csh; + __u32 eflags; + __u32 esp_at_signal; + __u16 ss, __ssh; + struct _fpstate __user *fpstate; + __u32 oldmask; + __u32 cr2; }; +# else /* __x86_64__: */ +struct sigcontext { + __u64 r8; + __u64 r9; + __u64 r10; + __u64 r11; + __u64 r12; + __u64 r13; + __u64 r14; + __u64 r15; + __u64 rdi; + __u64 rsi; + __u64 rbp; + __u64 rbx; + __u64 rdx; + __u64 rax; + __u64 rcx; + __u64 rsp; + __u64 rip; + __u64 eflags; /* RFLAGS */ + __u16 cs; + __u16 gs; + __u16 fs; + __u16 __pad0; + __u64 err; + __u64 trapno; + __u64 oldmask; + __u64 cr2; + struct _fpstate __user *fpstate; /* Zero when no FPU context */ +# ifdef __ILP32__ + __u32 __fpstate_pad; +# endif + __u64 reserved1[8]; +}; +# endif /* __x86_64__ */ +#endif /* !__KERNEL__ */ #endif /* _UAPI_ASM_X86_SIGCONTEXT_H */ diff --git a/kernel/arch/x86/include/uapi/asm/sigcontext32.h b/kernel/arch/x86/include/uapi/asm/sigcontext32.h index ad1478c4a..a92b0f0dc 100644 --- a/kernel/arch/x86/include/uapi/asm/sigcontext32.h +++ b/kernel/arch/x86/include/uapi/asm/sigcontext32.h @@ -1,77 +1,8 @@ #ifndef _ASM_X86_SIGCONTEXT32_H #define _ASM_X86_SIGCONTEXT32_H -#include <linux/types.h> +/* This is a legacy file - all the type definitions are in sigcontext.h: */ -/* signal context for 32bit programs. 
*/ - -#define X86_FXSR_MAGIC 0x0000 - -struct _fpreg { - unsigned short significand[4]; - unsigned short exponent; -}; - -struct _fpxreg { - unsigned short significand[4]; - unsigned short exponent; - unsigned short padding[3]; -}; - -struct _xmmreg { - __u32 element[4]; -}; - -/* FSAVE frame with extensions */ -struct _fpstate_ia32 { - /* Regular FPU environment */ - __u32 cw; - __u32 sw; - __u32 tag; /* not compatible to 64bit twd */ - __u32 ipoff; - __u32 cssel; - __u32 dataoff; - __u32 datasel; - struct _fpreg _st[8]; - unsigned short status; - unsigned short magic; /* 0xffff = regular FPU data only */ - - /* FXSR FPU environment */ - __u32 _fxsr_env[6]; - __u32 mxcsr; - __u32 reserved; - struct _fpxreg _fxsr_st[8]; - struct _xmmreg _xmm[8]; /* It's actually 16 */ - __u32 padding[44]; - union { - __u32 padding2[12]; - struct _fpx_sw_bytes sw_reserved; - }; -}; - -struct sigcontext_ia32 { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned int di; - unsigned int si; - unsigned int bp; - unsigned int sp; - unsigned int bx; - unsigned int dx; - unsigned int cx; - unsigned int ax; - unsigned int trapno; - unsigned int err; - unsigned int ip; - unsigned short cs, __csh; - unsigned int flags; - unsigned int sp_at_signal; - unsigned short ss, __ssh; - unsigned int fpstate; /* really (struct _fpstate_ia32 *) */ - unsigned int oldmask; - unsigned int cr2; -}; +#include <asm/sigcontext.h> #endif /* _ASM_X86_SIGCONTEXT32_H */ diff --git a/kernel/arch/x86/include/uapi/asm/svm.h b/kernel/arch/x86/include/uapi/asm/svm.h index b5d7640ab..8a4add8e4 100644 --- a/kernel/arch/x86/include/uapi/asm/svm.h +++ b/kernel/arch/x86/include/uapi/asm/svm.h @@ -100,6 +100,7 @@ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ { SVM_EXIT_INTR, "interrupt" }, \ { SVM_EXIT_NMI, "nmi" }, \ diff --git a/kernel/arch/x86/include/uapi/asm/vmx.h b/kernel/arch/x86/include/uapi/asm/vmx.h index 1fe92181e..5b15d94a3 100644 --- a/kernel/arch/x86/include/uapi/asm/vmx.h +++ b/kernel/arch/x86/include/uapi/asm/vmx.h @@ -58,6 +58,7 @@ #define EXIT_REASON_INVALID_STATE 33 #define EXIT_REASON_MSR_LOAD_FAIL 34 #define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_TRAP_FLAG 37 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 @@ -77,6 +78,7 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 +#define EXIT_REASON_PCOMMIT 65 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -106,6 +108,7 @@ { EXIT_REASON_MSR_READ, "MSR_READ" }, \ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ @@ -124,7 +127,8 @@ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ - { EXIT_REASON_XRSTORS, "XRSTORS" } + { EXIT_REASON_XRSTORS, "XRSTORS" }, \ + { EXIT_REASON_PCOMMIT, "PCOMMIT" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 diff --git 
a/kernel/arch/x86/kernel/Makefile b/kernel/arch/x86/kernel/Makefile index 9bcd0b56c..b1b78ffe0 100644 --- a/kernel/arch/x86/kernel/Makefile +++ b/kernel/arch/x86/kernel/Makefile @@ -22,18 +22,17 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n CFLAGS_irq.o := -I$(src)/../include/asm/trace -obj-y := process_$(BITS).o signal.o entry_$(BITS).o +obj-y := process_$(BITS).o signal.o +obj-$(CONFIG_COMPAT) += signal_compat.o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o +obj-y += time.o ioport.o dumpstack.o nmi.o +obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o -obj-y += syscall_$(BITS).o vsyscall_gtod.o -obj-$(CONFIG_IA32_EMULATION) += syscall_32.o -obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o @@ -44,7 +43,7 @@ obj-y += pci-iommu_table.o obj-y += resource.o obj-y += process.o -obj-y += i387.o xsave.o +obj-y += fpu/ obj-y += ptrace.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o @@ -72,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o -obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o -obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o +obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ @@ -95,7 +94,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o -obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o +obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o @@ -110,8 +109,6 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o -obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o -obj-$(CONFIG_PMC_ATOM) += pmc_atom.o ### # 64 bit specific files diff --git a/kernel/arch/x86/kernel/acpi/boot.c b/kernel/arch/x86/kernel/acpi/boot.c index 07bea8022..e75907601 100644 --- a/kernel/arch/x86/kernel/acpi/boot.c +++ b/kernel/arch/x86/kernel/acpi/boot.c @@ -31,12 +31,12 @@ #include <linux/module.h> #include <linux/dmi.h> #include <linux/irq.h> -#include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/pci.h> +#include <asm/irqdomain.h> #include <asm/pci_x86.h> #include <asm/pgtable.h> #include <asm/io_apic.h> @@ -400,57 +400,13 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } -static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, - int polarity) -{ - int irq, node; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; - - trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; - polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; - node = dev ? 
dev_to_node(dev) : NUMA_NO_NODE; - if (mp_set_gsi_attr(gsi, trigger, polarity, node)) { - pr_warn("Failed to set pin attr for GSI%d\n", gsi); - return -1; - } - - irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC); - if (irq < 0) - return irq; - - /* Don't set up the ACPI SCI because it's already set up */ - if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi) - mp_config_acpi_gsi(dev, gsi, trigger, polarity); - - return irq; -} - -static void mp_unregister_gsi(u32 gsi) -{ - int irq; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return; - - irq = mp_map_gsi_to_irq(gsi, 0); - if (irq > 0) - mp_unmap_irq(irq); -} - -static struct irq_domain_ops acpi_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, -}; - static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = NULL; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; ioapic = (struct acpi_madt_io_apic *)header; @@ -653,7 +609,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi, * Make sure all (legacy) PCI IRQs are set as level-triggered. */ if (trigger == ACPI_LEVEL_SENSITIVE) - eisa_set_level_irq(gsi); + elcr_set_level_irq(gsi); #endif return gsi; @@ -664,10 +620,21 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int trigger, int polarity) { int irq = gsi; - #ifdef CONFIG_X86_IO_APIC + int node; + struct irq_alloc_info info; + + node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; + polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; + ioapic_set_alloc_attr(&info, node, trigger, polarity); + mutex_lock(&acpi_ioapic_lock); - irq = mp_register_gsi(dev, gsi, trigger, polarity); + irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + /* Don't set up the ACPI SCI because it's already set up */ + if (irq >= 0 && enable_update_mptable && + acpi_gbl_FADT.sci_interrupt != gsi) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); mutex_unlock(&acpi_ioapic_lock); #endif @@ -677,8 +644,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, static void acpi_unregister_gsi_ioapic(u32 gsi) { #ifdef CONFIG_X86_IO_APIC + int irq; + mutex_lock(&acpi_ioapic_lock); - mp_unregister_gsi(gsi); + irq = mp_map_gsi_to_irq(gsi, 0, NULL); + if (irq > 0) + mp_unmap_irq(irq); mutex_unlock(&acpi_ioapic_lock); #endif } @@ -740,7 +711,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) #endif } -static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) { int cpu; @@ -756,12 +727,6 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) *pcpu = cpu; return 0; } - -/* wrapper to silence section mismatch warning */ -int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) -{ - return _acpi_map_lsapic(handle, physid, pcpu); -} EXPORT_SYMBOL(acpi_map_cpu); int acpi_unmap_cpu(int cpu) @@ -787,7 +752,7 @@ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) u64 addr; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_DYNAMIC, - .ops = &acpi_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr); @@ -1011,6 +976,8 @@ static int __init acpi_parse_madt_lapic_entries(void) { int count; int x2count = 0; + int ret; + struct acpi_subtable_proc madt_proc[2]; if (!cpu_has_apic) return 
-ENODEV; @@ -1034,10 +1001,22 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_LOCAL_APIC); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_LOCAL_APIC); + memset(madt_proc, 0, sizeof(madt_proc)); + madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; + madt_proc[0].handler = acpi_parse_lapic; + madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; + madt_proc[1].handler = acpi_parse_x2apic; + ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); + if (ret < 0) { + printk(KERN_ERR PREFIX + "Error parsing LAPIC/X2APIC entries\n"); + return ret; + } + + x2count = madt_proc[0].count; + count = madt_proc[1].count; } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); diff --git a/kernel/arch/x86/kernel/acpi/sleep.c b/kernel/arch/x86/kernel/acpi/sleep.c index d1daead5f..adb3eaf8f 100644 --- a/kernel/arch/x86/kernel/acpi/sleep.c +++ b/kernel/arch/x86/kernel/acpi/sleep.c @@ -16,6 +16,7 @@ #include <asm/cacheflush.h> #include <asm/realmode.h> +#include <linux/ftrace.h> #include "../../realmode/rm/wakeup.h" #include "sleep.h" @@ -107,7 +108,13 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ + /* + * Pause/unpause graph tracing around do_suspend_lowlevel as it has + * inconsistent call/return info after it jumps to the wakeup vector. + */ + pause_graph_tracing(); do_suspend_lowlevel(); + unpause_graph_tracing(); return 0; } diff --git a/kernel/arch/x86/kernel/acpi/wakeup_32.S b/kernel/arch/x86/kernel/acpi/wakeup_32.S index 665c6b7d2..0c26b1b44 100644 --- a/kernel/arch/x86/kernel/acpi/wakeup_32.S +++ b/kernel/arch/x86/kernel/acpi/wakeup_32.S @@ -12,11 +12,13 @@ ENTRY(wakeup_pmode_return) wakeup_pmode_return: movw $__KERNEL_DS, %ax movw %ax, %ss - movw %ax, %ds - movw %ax, %es movw %ax, %fs movw %ax, %gs + movw $__USER_DS, %ax + movw %ax, %ds + movw %ax, %es + # reload the gdt, as we need the full 32 bit address lidt saved_idt lldt saved_ldt diff --git a/kernel/arch/x86/kernel/acpi/wakeup_64.S b/kernel/arch/x86/kernel/acpi/wakeup_64.S index ae693b51e..8c35df468 100644 --- a/kernel/arch/x86/kernel/acpi/wakeup_64.S +++ b/kernel/arch/x86/kernel/acpi/wakeup_64.S @@ -62,7 +62,7 @@ ENTRY(do_suspend_lowlevel) pushfq popq pt_regs_flags(%rax) - movq $resume_point, saved_rip(%rip) + movq $.Lresume_point, saved_rip(%rip) movq %rsp, saved_rsp movq %rbp, saved_rbp @@ -75,10 +75,10 @@ ENTRY(do_suspend_lowlevel) xorl %eax, %eax call x86_acpi_enter_sleep_state /* in case something went wrong, restore the machine status and go on */ - jmp resume_point + jmp .Lresume_point .align 4 -resume_point: +.Lresume_point: /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax movq saved_context_cr4(%rax), %rbx diff --git a/kernel/arch/x86/kernel/alternative.c b/kernel/arch/x86/kernel/alternative.c index aef653193..25f909362 100644 --- a/kernel/arch/x86/kernel/alternative.c +++ b/kernel/arch/x86/kernel/alternative.c @@ -21,6 +21,10 @@ #include <asm/io.h> #include <asm/fixmap.h> +int __read_mostly alternatives_patched; + +EXPORT_SYMBOL_GPL(alternatives_patched); + #define MAX_PATCH_LEN (255-1) static int __initdata_or_module debug_alternative; @@ -227,6 +231,15 @@ void __init arch_init_ideal_nops(void) #endif } break; + + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 > 0xf) { + ideal_nops 
= p6_nops; + return; + } + + /* fall through */ + default: #ifdef CONFIG_X86_64 ideal_nops = k8_nops; @@ -325,10 +338,15 @@ done: static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) { + unsigned long flags; + if (instr[0] != 0x90) return; + local_irq_save(flags); add_nops(instr + (a->instrlen - a->padlen), a->padlen); + sync_core(); + local_irq_restore(flags); DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", instr, a->instrlen - a->padlen, a->padlen); @@ -627,6 +645,7 @@ void __init alternative_instructions(void) apply_paravirt(__parainstructions, __parainstructions_end); restart_nmi(); + alternatives_patched = 1; } /** diff --git a/kernel/arch/x86/kernel/amd_nb.c b/kernel/arch/x86/kernel/amd_nb.c index 5caed1dd7..29fa475ec 100644 --- a/kernel/arch/x86/kernel/amd_nb.c +++ b/kernel/arch/x86/kernel/amd_nb.c @@ -89,9 +89,7 @@ int amd_cache_northbridges(void) next_northbridge(link, amd_nb_link_ids); } - /* GART present only on Fam15h upto model 0fh */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) + if (amd_gart_present()) amd_northbridges.flags |= AMD_NB_GART; /* diff --git a/kernel/arch/x86/kernel/apb_timer.c b/kernel/arch/x86/kernel/apb_timer.c index 6a7c23ff2..222a57076 100644 --- a/kernel/arch/x86/kernel/apb_timer.c +++ b/kernel/arch/x86/kernel/apb_timer.c @@ -171,10 +171,6 @@ static int __init apbt_clockevent_register(void) static void apbt_setup_irq(struct apbt_dev *adev) { - /* timer0 irq has been setup early */ - if (adev->irq == 0) - return; - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); } @@ -267,7 +263,7 @@ static int apbt_clocksource_register(void) /* Verify whether apbt counter works */ t1 = dw_apb_clocksource_read(clocksource_apbt); - rdtscll(start); + start = rdtsc(); /* * We don't know the TSC frequency yet, but waiting for @@ -277,7 +273,7 @@ static int apbt_clocksource_register(void) */ do { rep_nop(); - rdtscll(now); + now = rdtsc(); } while ((now - start) < 200000UL); /* APBT is the only always on clocksource, it has to work! 
*/ @@ -394,13 +390,13 @@ unsigned long apbt_quick_calibrate(void) old = dw_apb_clocksource_read(clocksource_apbt); old += loop; - t1 = __native_read_tsc(); + t1 = rdtsc(); do { new = dw_apb_clocksource_read(clocksource_apbt); } while (new < old); - t2 = __native_read_tsc(); + t2 = rdtsc(); shift = 5; if (unlikely(loop >> shift == 0)) { diff --git a/kernel/arch/x86/kernel/aperture_64.c b/kernel/arch/x86/kernel/aperture_64.c index 76164e173..6e85f7136 100644 --- a/kernel/arch/x86/kernel/aperture_64.c +++ b/kernel/arch/x86/kernel/aperture_64.c @@ -262,6 +262,9 @@ void __init early_gart_iommu_check(void) u64 aper_base = 0, last_aper_base = 0; int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0; + if (!amd_gart_present()) + return; + if (!early_pci_allowed()) return; @@ -355,6 +358,9 @@ int __init gart_iommu_hole_init(void) int fix, slot, valid_agp = 0; int i, node; + if (!amd_gart_present()) + return -ENODEV; + if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) return -ENODEV; @@ -452,7 +458,7 @@ out: force_iommu || valid_agp || fallback_aper_force) { - pr_info("Your BIOS doesn't leave a aperture memory hole\n"); + pr_info("Your BIOS doesn't leave an aperture memory hole\n"); pr_info("Please enable the IOMMU option in the BIOS setup\n"); pr_info("This costs you %dMB of RAM\n", 32 << fallback_aper_order); diff --git a/kernel/arch/x86/kernel/apic/apic.c b/kernel/arch/x86/kernel/apic/apic.c index cde732c1b..2f69e3b18 100644 --- a/kernel/arch/x86/kernel/apic/apic.c +++ b/kernel/arch/x86/kernel/apic/apic.c @@ -336,6 +336,13 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) apic_write(APIC_LVTT, lvtt_value); if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) { + /* + * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode, + * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized. + * According to Intel, MFENCE can do the serialization here. + */ + asm volatile("mfence" : : : "memory"); + printk_once(KERN_DEBUG "TSC deadline timer enabled\n"); return; } @@ -457,45 +464,45 @@ static int lapic_next_deadline(unsigned long delta, { u64 tsc; - rdtscll(tsc); + tsc = rdtsc(); wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; } -/* - * Setup the lapic timer in periodic or oneshot mode - */ -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) +static int lapic_timer_shutdown(struct clock_event_device *evt) { - unsigned long flags; unsigned int v; /* Lapic used as dummy for broadcast ? */ if (evt->features & CLOCK_EVT_FEAT_DUMMY) - return; + return 0; - local_irq_save(flags); + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); + apic_write(APIC_TMICT, 0); + return 0; +} - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(lapic_timer_frequency, - mode != CLOCK_EVT_MODE_PERIODIC, 1); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - v = apic_read(APIC_LVTT); - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0); - break; - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here */ - break; - } +static inline int +lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot) +{ + /* Lapic used as dummy for broadcast ? 
*/ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) + return 0; - local_irq_restore(flags); + __setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1); + return 0; +} + +static int lapic_timer_set_periodic(struct clock_event_device *evt) +{ + return lapic_timer_set_periodic_oneshot(evt, false); +} + +static int lapic_timer_set_oneshot(struct clock_event_device *evt) +{ + return lapic_timer_set_periodic_oneshot(evt, true); } /* @@ -513,15 +520,18 @@ static void lapic_timer_broadcast(const struct cpumask *mask) * The local apic timer can be used for any function which is CPU local. */ static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP + | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_state_shutdown = lapic_timer_shutdown, + .set_state_periodic = lapic_timer_set_periodic, + .set_state_oneshot = lapic_timer_set_oneshot, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); @@ -592,7 +602,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) unsigned long pm = acpi_pm_read_early(); if (cpu_has_tsc) - rdtscll(tsc); + tsc = rdtsc(); switch (lapic_cal_loops++) { case 0: @@ -778,7 +788,7 @@ static int __init calibrate_APIC_clock(void) * Setup the apic timer manually */ levt->event_handler = lapic_cal_handler; - lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_timer_set_periodic(levt); lapic_cal_loops = -1; /* Let the interrupts run */ @@ -788,7 +798,8 @@ static int __init calibrate_APIC_clock(void) cpu_relax(); /* Stop the lapic timer */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + local_irq_disable(); + lapic_timer_shutdown(levt); /* Jiffies delta */ deltaj = lapic_cal_j2 - lapic_cal_j1; @@ -799,8 +810,8 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); else levt->features |= CLOCK_EVT_FEAT_DUMMY; - } else - local_irq_enable(); + } + local_irq_enable(); if (levt->features & CLOCK_EVT_FEAT_DUMMY) { pr_warning("APIC timer disabled due to verification failure\n"); @@ -878,7 +889,7 @@ static void local_apic_timer_interrupt(void) if (!evt->event_handler) { pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + lapic_timer_shutdown(evt); return; } @@ -1209,7 +1220,7 @@ void setup_local_APIC(void) long long max_loops = cpu_khz ? 
cpu_khz : 1000000; if (cpu_has_tsc) - rdtscll(tsc); + tsc = rdtsc(); if (disable_apic) { disable_ioapic_support(); @@ -1293,7 +1304,7 @@ void setup_local_APIC(void) } if (queued) { if (cpu_has_tsc && cpu_khz) { - rdtscll(ntsc); + ntsc = rdtsc(); max_loops = (cpu_khz << 10) - (ntsc - tsc); } else max_loops--; @@ -1420,7 +1431,7 @@ enum { }; static int x2apic_state; -static inline void __x2apic_disable(void) +static void __x2apic_disable(void) { u64 msr; @@ -1436,7 +1447,7 @@ static inline void __x2apic_disable(void) printk_once(KERN_INFO "x2apic disabled\n"); } -static inline void __x2apic_enable(void) +static void __x2apic_enable(void) { u64 msr; @@ -1796,7 +1807,7 @@ int apic_version[MAX_LOCAL_APIC]; /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -static inline void __smp_spurious_interrupt(u8 vector) +static void __smp_spurious_interrupt(u8 vector) { u32 v; @@ -1837,7 +1848,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs) /* * This interrupt should never happen with our APIC/SMP architecture */ -static inline void __smp_error_interrupt(struct pt_regs *regs) +static void __smp_error_interrupt(struct pt_regs *regs) { u32 v; u32 i = 0; diff --git a/kernel/arch/x86/kernel/apic/apic_flat_64.c b/kernel/arch/x86/kernel/apic/apic_flat_64.c index de918c410..f92ab3697 100644 --- a/kernel/arch/x86/kernel/apic/apic_flat_64.c +++ b/kernel/arch/x86/kernel/apic/apic_flat_64.c @@ -191,7 +191,6 @@ static struct apic apic_flat = { .send_IPI_all = flat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, - .wait_for_init_deassert = false, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, @@ -299,7 +298,6 @@ static struct apic apic_physflat = { .send_IPI_all = physflat_send_IPI_all, .send_IPI_self = apic_send_IPI_self, - .wait_for_init_deassert = false, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, diff --git a/kernel/arch/x86/kernel/apic/apic_noop.c b/kernel/arch/x86/kernel/apic/apic_noop.c index b205cdbdb..0d96749cf 100644 --- a/kernel/arch/x86/kernel/apic/apic_noop.c +++ b/kernel/arch/x86/kernel/apic/apic_noop.c @@ -152,7 +152,6 @@ struct apic apic_noop = { .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, - .wait_for_init_deassert = false, .inquire_remote_apic = NULL, .read = noop_apic_read, diff --git a/kernel/arch/x86/kernel/apic/apic_numachip.c b/kernel/arch/x86/kernel/apic/apic_numachip.c index 017149cde..2bd2292a3 100644 --- a/kernel/arch/x86/kernel/apic/apic_numachip.c +++ b/kernel/arch/x86/kernel/apic/apic_numachip.c @@ -11,30 +11,21 @@ * */ -#include <linux/errno.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/ctype.h> #include <linux/init.h> -#include <linux/hardirq.h> -#include <linux/delay.h> #include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> -#include <asm/smp.h> -#include <asm/apic.h> #include <asm/ipi.h> #include <asm/apic_flat_64.h> #include <asm/pgtable.h> +#include <asm/pci_x86.h> -static int numachip_system __read_mostly; +u8 numachip_system __read_mostly; +static const struct apic apic_numachip1; +static const struct apic apic_numachip2; +static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly; -static const struct apic apic_numachip; - -static unsigned int get_apic_id(unsigned long x) +static unsigned int numachip1_get_apic_id(unsigned long x) { unsigned long value; unsigned int id = (x >> 24) & 0xff; @@ -47,7 
+38,7 @@ static unsigned int get_apic_id(unsigned long x) return id; } -static unsigned long set_apic_id(unsigned int id) +static unsigned long numachip1_set_apic_id(unsigned int id) { unsigned long x; @@ -55,9 +46,17 @@ static unsigned long set_apic_id(unsigned int id) return x; } -static unsigned int read_xapic_id(void) +static unsigned int numachip2_get_apic_id(unsigned long x) +{ + u64 mcfg; + + rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg); + return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); +} + +static unsigned long numachip2_set_apic_id(unsigned int id) { - return get_apic_id(apic_read(APIC_ID)); + return id << 24; } static int numachip_apic_id_valid(int apicid) @@ -68,7 +67,7 @@ static int numachip_apic_id_valid(int apicid) static int numachip_apic_id_registered(void) { - return physid_isset(read_xapic_id(), phys_cpu_present_map); + return 1; } static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) @@ -76,37 +75,48 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +static void numachip1_apic_icr_write(int apicid, unsigned int val) { - union numachip_csr_g3_ext_irq_gen int_gen; - - int_gen.s._destination_apic_id = phys_apicid; - int_gen.s._vector = 0; - int_gen.s._msgtype = APIC_DM_INIT >> 8; - int_gen.s._index = 0; - - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val); +} - int_gen.s._msgtype = APIC_DM_STARTUP >> 8; - int_gen.s._vector = start_rip >> 12; +static void numachip2_apic_icr_write(int apicid, unsigned int val) +{ + numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val); +} - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); +static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ + numachip_apic_icr_write(phys_apicid, APIC_DM_INIT); + numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP | + (start_rip >> 12)); - atomic_set(&init_deasserted, 1); return 0; } static void numachip_send_IPI_one(int cpu, int vector) { - union numachip_csr_g3_ext_irq_gen int_gen; - int apicid = per_cpu(x86_cpu_to_apicid, cpu); - - int_gen.s._destination_apic_id = apicid; - int_gen.s._vector = vector; - int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8; - int_gen.s._index = 0; + int local_apicid, apicid = per_cpu(x86_cpu_to_apicid, cpu); + unsigned int dmode; + + preempt_disable(); + local_apicid = __this_cpu_read(x86_cpu_to_apicid); + + /* Send via local APIC where non-local part matches */ + if (!((apicid ^ local_apicid) >> NUMACHIP_LAPIC_BITS)) { + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(apicid, vector, + APIC_DEST_PHYSICAL); + local_irq_restore(flags); + preempt_enable(); + return; + } + preempt_enable(); - write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + dmode = (vector == NMI_VECTOR) ? 
APIC_DM_NMI : APIC_DM_FIXED; + numachip_apic_icr_write(apicid, dmode | vector); } static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) @@ -150,9 +160,14 @@ static void numachip_send_IPI_self(int vector) apic_write(APIC_SELF_IPI, vector); } -static int __init numachip_probe(void) +static int __init numachip1_probe(void) { - return apic == &apic_numachip; + return apic == &apic_numachip1; +} + +static int __init numachip2_probe(void) +{ + return apic == &apic_numachip2; } static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) @@ -173,11 +188,19 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) static int __init numachip_system_init(void) { - if (!numachip_system) + /* Map the LCSR area and set up the apic_icr_write function */ + switch (numachip_system) { + case 1: + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + numachip_apic_icr_write = numachip1_apic_icr_write; + break; + case 2: + init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE); + numachip_apic_icr_write = numachip2_apic_icr_write; + break; + default: return 0; - - init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); - init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); + } x86_cpuinit.fixup_cpu_id = fixup_cpu_id; x86_init.pci.arch_init = pci_numachip_init; @@ -186,21 +209,94 @@ static int __init numachip_system_init(void) } early_initcall(numachip_system_init); -static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int numachip1_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (!strncmp(oem_id, "NUMASC", 6)) { - numachip_system = 1; - return 1; - } + if ((strncmp(oem_id, "NUMASC", 6) != 0) || + (strncmp(oem_table_id, "NCONNECT", 8) != 0)) + return 0; - return 0; + numachip_system = 1; + + return 1; +} + +static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if ((strncmp(oem_id, "NUMASC", 6) != 0) || + (strncmp(oem_table_id, "NCONECT2", 8) != 0)) + return 0; + + numachip_system = 2; + + return 1; +} + +/* APIC IPIs are queued */ +static void numachip_apic_wait_icr_idle(void) +{ } -static const struct apic apic_numachip __refconst = { +/* APIC NMI IPIs are queued */ +static u32 numachip_safe_apic_wait_icr_idle(void) +{ + return 0; +} +static const struct apic apic_numachip1 __refconst = { .name = "NumaConnect system", - .probe = numachip_probe, - .acpi_madt_oem_check = numachip_acpi_madt_oem_check, + .probe = numachip1_probe, + .acpi_madt_oem_check = numachip1_acpi_madt_oem_check, + .apic_id_valid = numachip_apic_id_valid, + .apic_id_registered = numachip_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .target_cpus = online_target_cpus, + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + + .vector_allocation_domain = default_vector_allocation_domain, + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .phys_pkg_id = numachip_phys_pkg_id, + + .get_apic_id = numachip1_get_apic_id, + .set_apic_id = numachip1_set_apic_id, + .apic_id_mask = 0xffU << 24, + + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI_mask = numachip_send_IPI_mask, + .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, + .send_IPI_allbutself = numachip_send_IPI_allbutself, + .send_IPI_all = numachip_send_IPI_all, 
+ .send_IPI_self = numachip_send_IPI_self, + + .wakeup_secondary_cpu = numachip_wakeup_secondary, + .inquire_remote_apic = NULL, /* REMRD not supported */ + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = numachip_apic_wait_icr_idle, + .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, +}; + +apic_driver(apic_numachip1); + +static const struct apic apic_numachip2 __refconst = { + .name = "NumaConnect2 system", + .probe = numachip2_probe, + .acpi_madt_oem_check = numachip2_acpi_madt_oem_check, .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, @@ -222,8 +318,8 @@ static const struct apic apic_numachip __refconst = { .check_phys_apicid_present = default_check_phys_apicid_present, .phys_pkg_id = numachip_phys_pkg_id, - .get_apic_id = get_apic_id, - .set_apic_id = set_apic_id, + .get_apic_id = numachip2_get_apic_id, + .set_apic_id = numachip2_set_apic_id, .apic_id_mask = 0xffU << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, @@ -235,7 +331,6 @@ static const struct apic apic_numachip __refconst = { .send_IPI_self = numachip_send_IPI_self, .wakeup_secondary_cpu = numachip_wakeup_secondary, - .wait_for_init_deassert = false, .inquire_remote_apic = NULL, /* REMRD not supported */ .read = native_apic_mem_read, @@ -243,8 +338,8 @@ static const struct apic apic_numachip __refconst = { .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, - .wait_icr_idle = native_apic_wait_icr_idle, - .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + .wait_icr_idle = numachip_apic_wait_icr_idle, + .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle, }; -apic_driver(apic_numachip); +apic_driver(apic_numachip2); diff --git a/kernel/arch/x86/kernel/apic/bigsmp_32.c b/kernel/arch/x86/kernel/apic/bigsmp_32.c index c4a8d63f8..971cf8875 100644 --- a/kernel/arch/x86/kernel/apic/bigsmp_32.c +++ b/kernel/arch/x86/kernel/apic/bigsmp_32.c @@ -186,7 +186,6 @@ static struct apic apic_bigsmp = { .send_IPI_all = bigsmp_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .wait_for_init_deassert = true, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, diff --git a/kernel/arch/x86/kernel/apic/htirq.c b/kernel/arch/x86/kernel/apic/htirq.c index 816f36e97..ae50d3454 100644 --- a/kernel/arch/x86/kernel/apic/htirq.c +++ b/kernel/arch/x86/kernel/apic/htirq.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. 
+ * Jiang Liu <jiang.liu@linux.intel.com> + * Add support of hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -14,78 +16,112 @@ #include <linux/device.h> #include <linux/pci.h> #include <linux/htirq.h> +#include <asm/irqdomain.h> #include <asm/hw_irq.h> #include <asm/apic.h> #include <asm/hypertransport.h> +static struct irq_domain *htirq_domain; + /* * Hypertransport interrupt support */ -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - static int ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest; + struct irq_data *parent = data->parent_data; int ret; - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - target_ht_irq(data->irq, dest, cfg->vector); - return IRQ_SET_MASK_OK_NOCOPY; + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(data); + + fetch_ht_irq_msg(data->irq, &msg); + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | + HT_IRQ_LOW_DEST_ID_MASK); + msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); + write_ht_irq_msg(data->irq, &msg); + } + + return ret; } static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .irq_mask = mask_ht_irq, .irq_unmask = unmask_ht_irq, - .irq_ack = apic_ack_edge, + .irq_ack = irq_chip_ack_parent, .irq_set_affinity = ht_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_retrigger = irq_chip_retrigger_hierarchy, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - struct irq_cfg *cfg; - struct ht_irq_msg msg; - unsigned dest; - int err; + struct ht_irq_cfg *ht_cfg; + struct irq_alloc_info *info = arg; + struct pci_dev *dev; + irq_hw_number_t hwirq; + int ret; - if (disable_apic) - return -ENXIO; + if (nr_irqs > 1 || !info) + return -EINVAL; - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; + dev = info->ht_dev; + hwirq = (info->ht_idx & 0xFF) | + PCI_DEVID(dev->bus->number, dev->devfn) << 8 | + (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24; + if (irq_find_mapping(domain, hwirq) > 0) + return -EEXIST; - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; + ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL); + if (!ht_cfg) + return -ENOMEM; - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(ht_cfg); + return ret; + } + + /* Initialize msg to a value that will never match the first write. 
*/ + ht_cfg->msg.address_lo = 0xffffffff; + ht_cfg->msg.address_hi = 0xffffffff; + ht_cfg->dev = info->ht_dev; + ht_cfg->update = info->ht_update; + ht_cfg->pos = info->ht_pos; + ht_cfg->idx = 0x10 + (info->ht_idx * 2); + irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg, + handle_edge_irq, ht_cfg, "edge"); + + return 0; +} + +static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + BUG_ON(nr_irqs != 1); + kfree(irq_data->chip_data); + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} +static void htirq_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct ht_irq_msg msg; + struct irq_cfg *cfg = irqd_cfg(irq_data); + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); msg.address_lo = HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | HT_IRQ_LOW_VECTOR(cfg->vector) | ((apic->irq_dest_mode == 0) ? HT_IRQ_LOW_DM_PHYSICAL : @@ -95,13 +131,56 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) HT_IRQ_LOW_MT_FIXED : HT_IRQ_LOW_MT_ARBITRATED) | HT_IRQ_LOW_IRQ_MASKED; + write_ht_irq_msg(irq_data->irq, &msg); +} - write_ht_irq_msg(irq, &msg); +static void htirq_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct ht_irq_msg msg; - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); + memset(&msg, 0, sizeof(msg)); + write_ht_irq_msg(irq_data->irq, &msg); +} - dev_dbg(&dev->dev, "irq %d for HT\n", irq); +static const struct irq_domain_ops htirq_domain_ops = { + .alloc = htirq_domain_alloc, + .free = htirq_domain_free, + .activate = htirq_domain_activate, + .deactivate = htirq_domain_deactivate, +}; - return 0; +void arch_init_htirq_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; + + htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL); + if (!htirq_domain) + pr_warn("failed to initialize irqdomain for HTIRQ.\n"); + else + htirq_domain->parent = parent; +} + +int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, + ht_irq_update_t *update) +{ + struct irq_alloc_info info; + + if (!htirq_domain) + return -ENOSYS; + + init_irq_alloc_info(&info, NULL); + info.ht_idx = idx; + info.ht_pos = pos; + info.ht_dev = dev; + info.ht_update = update; + + return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev), + &info); +} + +void arch_teardown_ht_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); } diff --git a/kernel/arch/x86/kernel/apic/hw_nmi.c b/kernel/arch/x86/kernel/apic/hw_nmi.c index 6873ab925..045e424fb 100644 --- a/kernel/arch/x86/kernel/apic/hw_nmi.c +++ b/kernel/arch/x86/kernel/apic/hw_nmi.c @@ -28,146 +28,21 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #endif #ifdef arch_trigger_all_cpu_backtrace -/* For reliability, we're prepared to waste bits here. 
*/ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); - -/* "in progress" flag of arch_trigger_all_cpu_backtrace */ -static unsigned long backtrace_flag; - -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +static void nmi_raise_cpu_backtrace(cpumask_t *mask) { - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); + apic->send_IPI_mask(mask, NMI_VECTOR); } void arch_trigger_all_cpu_backtrace(bool include_self) { - struct nmi_seq_buf *s; - int len; - int cpu; - int i; - int this_cpu = get_cpu(); - - if (test_and_set_bit(0, &backtrace_flag)) { - /* - * If there is already a trigger_all_cpu_backtrace() in progress - * (backtrace_flag == 1), don't output double cpu dump infos. - */ - put_cpu(); - return; - } - - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - if (!include_self) - cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - - if (!cpumask_empty(to_cpumask(backtrace_mask))) { - pr_info("sending NMI to %s CPUs:\n", - (include_self ? "all" : "other")); - apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); - } - - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(to_cpumask(backtrace_mask))) - break; - mdelay(1); - touch_softlockup_watchdog(); - } - - /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. - */ - for_each_cpu(cpu, &printtrace_mask) { - int last_i = 0; - - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); - put_cpu(); -} - -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. 
- */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; + nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace); } static int arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { - int cpu; - - cpu = smp_processor_id(); - - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - this_cpu_write(printk_func, printk_func_save); - - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + if (nmi_cpu_backtrace(regs)) return NMI_HANDLED; - } return NMI_DONE; } diff --git a/kernel/arch/x86/kernel/apic/io_apic.c b/kernel/arch/x86/kernel/apic/io_apic.c index 07c6aba75..678c711e2 100644 --- a/kernel/arch/x86/kernel/apic/io_apic.c +++ b/kernel/arch/x86/kernel/apic/io_apic.c @@ -18,6 +18,16 @@ * and Rolf G. Tews * for testing these extensively * Paul Diefenbaugh : Added full ACPI support + * + * Historical information which is worth to be preserved: + * + * - SiS APIC rmw bug: + * + * We used to have a workaround for a bug in SiS chips which + * required to rewrite the index register for a read-modify-write + * operation as the chip lost the index information which was + * setup for the read already. We cache the data now, so that + * workaround has been removed. */ #include <linux/mm.h> @@ -31,13 +41,13 @@ #include <linux/acpi.h> #include <linux/module.h> #include <linux/syscore_ops.h> -#include <linux/irqdomain.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/jiffies.h> /* time_after() */ #include <linux/slab.h> #include <linux/bootmem.h> +#include <asm/irqdomain.h> #include <asm/idle.h> #include <asm/io.h> #include <asm/smp.h> @@ -63,27 +73,31 @@ #define for_each_ioapic_pin(idx, pin) \ for_each_ioapic((idx)) \ for_each_pin((idx), (pin)) - #define for_each_irq_pin(entry, head) \ list_for_each_entry(entry, &head, list) -/* - * Is the SiS APIC rmw bug present ? 
- * -1 = don't know, 0 = no, 1 = yes - */ -int sis_apic_bug = -1; - static DEFINE_RAW_SPINLOCK(ioapic_lock); static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; static int ioapic_initialized; -struct mp_pin_info { +struct irq_pin_list { + struct list_head list; + int apic, pin; +}; + +struct mp_chip_data { + struct list_head irq_2_pin; + struct IO_APIC_route_entry entry; int trigger; int polarity; - int node; - int set; u32 count; + bool isa_irq; +}; + +struct mp_ioapic_gsi { + u32 gsi_base; + u32 gsi_end; }; static struct ioapic { @@ -101,7 +115,6 @@ static struct ioapic { struct mp_ioapic_gsi gsi_config; struct ioapic_domain_cfg irqdomain_cfg; struct irq_domain *irqdomain; - struct mp_pin_info *pin_info; struct resource *iomem_res; } ioapics[MAX_IO_APICS]; @@ -117,7 +130,7 @@ unsigned int mpc_ioapic_addr(int ioapic_idx) return ioapics[ioapic_idx].mp_config.apicaddr; } -struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) +static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) { return &ioapics[ioapic_idx].gsi_config; } @@ -129,11 +142,16 @@ static inline int mp_ioapic_pin_count(int ioapic) return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; } -u32 mp_pin_to_gsi(int ioapic, int pin) +static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin; } +static inline bool mp_is_legacy_irq(int irq) +{ + return irq >= 0 && irq < nr_legacy_irqs(); +} + /* * Initialize all legacy IRQs and all pins on the first IOAPIC * if we have legacy interrupt controller. Kernel boot option "pirq=" @@ -144,12 +162,7 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq) if (!nr_legacy_irqs()) return 0; - return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs()); -} - -static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin) -{ - return ioapics[ioapic_idx].pin_info + pin; + return ioapic == 0 || mp_is_legacy_irq(irq); } static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) @@ -216,16 +229,6 @@ void mp_save_irq(struct mpc_intsrc *m) panic("Max # of irq sources exceeded!!\n"); } -struct irq_pin_list { - struct list_head list; - int apic, pin; -}; - -static struct irq_pin_list *alloc_irq_pin_list(int node) -{ - return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); -} - static void alloc_ioapic_saved_registers(int idx) { size_t size; @@ -247,8 +250,7 @@ static void free_ioapic_saved_registers(int idx) int __init arch_early_ioapic_init(void) { - struct irq_cfg *cfg; - int i, node = cpu_to_node(0); + int i; if (!nr_legacy_irqs()) io_apic_irqs = ~0UL; @@ -256,16 +258,6 @@ int __init arch_early_ioapic_init(void) for_each_ioapic(i) alloc_ioapic_saved_registers(i); - /* - * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. 
- */ - for (i = 0; i < nr_legacy_irqs(); i++) { - cfg = alloc_irq_and_cfg_at(i, node); - cfg->vector = IRQ0_VECTOR + i; - cpumask_setall(cfg->domain); - } - return 0; } @@ -283,7 +275,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) + (mpc_ioapic_addr(idx) & ~PAGE_MASK); } -void io_apic_eoi(unsigned int apic, unsigned int vector) +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(vector, &io_apic->eoi); @@ -296,7 +288,8 @@ unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) return readl(&io_apic->data); } -void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +static void io_apic_write(unsigned int apic, unsigned int reg, + unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -304,21 +297,6 @@ void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int valu writel(value, &io_apic->data); } -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - * - * Older SiS APIC requires we rewrite the index register - */ -void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -378,7 +356,7 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_mask_entry(int apic, int pin) { unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; + union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); @@ -391,16 +369,17 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. 
*/ -static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int __add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { struct irq_pin_list *entry; /* don't allow duplicates */ - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) if (entry->apic == apic && entry->pin == pin) return 0; - entry = alloc_irq_pin_list(node); + entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); if (!entry) { pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", node, apic, pin); @@ -408,16 +387,16 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi } entry->apic = apic; entry->pin = pin; + list_add_tail(&entry->list, &data->irq_2_pin); - list_add_tail(&entry->list, &cfg->irq_2_pin); return 0; } -static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) +static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) { struct irq_pin_list *tmp, *entry; - list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list) + list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) if (entry->apic == apic && entry->pin == pin) { list_del(&entry->list); kfree(entry); @@ -425,22 +404,23 @@ static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) } } -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static void add_pin_to_irq_node(struct mp_chip_data *data, + int node, int apic, int pin) { - if (__add_pin_to_irq_node(cfg, node, apic, pin)) + if (__add_pin_to_irq_node(data, node, apic, pin)) panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, +static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, int oldapic, int oldpin, int newapic, int newpin) { struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; @@ -450,32 +430,26 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, } /* old apic/pin didn't exist, so just add new ones */ - add_pin_to_irq_node(cfg, node, newapic, newpin); + add_pin_to_irq_node(data, node, newapic, newpin); } -static void __io_apic_modify_irq(struct irq_pin_list *entry, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) -{ - unsigned int reg, pin; - - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); -} - -static void io_apic_modify_irq(struct irq_cfg *cfg, +static void io_apic_modify_irq(struct mp_chip_data *data, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { + union entry_union eu; struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) - __io_apic_modify_irq(entry, mask_and, mask_or, final); + eu.entry = data->entry; + eu.w1 &= mask_and; + eu.w1 |= mask_or; + data->entry = eu.entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + if (final) + final(entry); + } } static void io_apic_sync(struct irq_pin_list *entry) @@ -490,39 +464,31 @@ static void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void mask_ioapic(struct irq_cfg *cfg) +static void mask_ioapic_irq(struct 
irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void mask_ioapic_irq(struct irq_data *data) +static void __unmask_ioapic(struct mp_chip_data *data) { - mask_ioapic(irqd_cfg(data)); + io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); } -static void __unmask_ioapic(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); -} - -static void unmask_ioapic(struct irq_cfg *cfg) +static void unmask_ioapic_irq(struct irq_data *irq_data) { + struct mp_chip_data *data = irq_data->chip_data; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - __unmask_ioapic(cfg); + __unmask_ioapic(data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_ioapic_irq(struct irq_data *data) -{ - unmask_ioapic(irqd_cfg(data)); -} - /* * IO-APIC versions below 0x20 don't support EOI register. * For the record, here is the information about various versions: @@ -539,7 +505,7 @@ static void unmask_ioapic_irq(struct irq_data *data) * Otherwise, we simulate the EOI message manually by changing the trigger * mode to edge and then back to level, with RTE being masked during this. */ -void native_eoi_ioapic_pin(int apic, int pin, int vector) +static void __eoi_ioapic_pin(int apic, int pin, int vector) { if (mpc_ioapic_ver(apic) >= 0x20) { io_apic_eoi(apic, vector); @@ -551,7 +517,7 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. */ - entry1.mask = 1; + entry1.mask = IOAPIC_MASKED; entry1.trigger = IOAPIC_EDGE; __ioapic_write_entry(apic, pin, entry1); @@ -563,15 +529,14 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector) } } -void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +static void eoi_ioapic_pin(int vector, struct mp_chip_data *data) { - struct irq_pin_list *entry; unsigned long flags; + struct irq_pin_list *entry; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) - x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin, - cfg->vector); + for_each_irq_pin(entry, data->irq_2_pin) + __eoi_ioapic_pin(entry->apic, entry->pin, vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -588,8 +553,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -602,13 +567,12 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. 
*/ - if (!entry.trigger) { + if (entry.trigger == IOAPIC_EDGE) { entry.trigger = IOAPIC_LEVEL; ioapic_write_entry(apic, pin, entry); } - raw_spin_lock_irqsave(&ioapic_lock, flags); - x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector); + __eoi_ioapic_pin(apic, pin, entry.vector); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -706,8 +670,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (!entry.mask) { - entry.mask = 1; + if (entry.mask == IOAPIC_UNMASKED) { + entry.mask = IOAPIC_MASKED; ioapic_write_entry(apic, pin, entry); } } @@ -809,11 +773,11 @@ static int EISA_ELCR(unsigned int irq) #endif -/* ISA interrupts are always polarity zero edge triggered, +/* ISA interrupts are always active high edge triggered, * when listed as conforming in the MP table. */ -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) +#define default_ISA_trigger(idx) (IOAPIC_EDGE) +#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) /* EISA interrupts are always polarity zero and can be edge or level * trigger depending on the ELCR value. If an interrupt is listed as @@ -823,54 +787,56 @@ static int EISA_ELCR(unsigned int irq) #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) #define default_EISA_polarity(idx) default_ISA_polarity(idx) -/* PCI interrupts are always polarity one level triggered, +/* PCI interrupts are always active low level triggered, * when listed as conforming in the MP table. */ -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) +#define default_PCI_trigger(idx) (IOAPIC_LEVEL) +#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; - int polarity; /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - polarity = 1; - break; - } + switch (mp_irqs[idx].irqflag & 0x03) { + case 0: + /* conforms to spec, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + return default_ISA_polarity(idx); + else + return default_PCI_polarity(idx); + case 1: + return IOAPIC_POL_HIGH; + case 2: + pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_POL_LOW; } - return polarity; } +#ifdef CONFIG_EISA +static int eisa_irq_trigger(int idx, int bus, int trigger) +{ + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_PCI: + case MP_BUS_ISA: + return trigger; + case MP_BUS_EISA: + return default_EISA_trigger(idx); + } + pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); + return IOAPIC_LEVEL; +} +#else +static inline int eisa_irq_trigger(int idx, int bus, int trigger) +{ + return trigger; +} +#endif + static int irq_trigger(int idx) { int bus = mp_irqs[idx].srcbus; @@ -879,153 +845,227 @@ static int irq_trigger(int idx) /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].irqflag>>2) & 3) - { - case 0: /* conforms, ie. 
bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); -#ifdef CONFIG_EISA - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - default: - { - pr_warn("broken BIOS!!\n"); - trigger = 1; - break; - } - } + switch ((mp_irqs[idx].irqflag >> 2) & 0x03) { + case 0: + /* conforms to spec, ie. bus-type dependent trigger mode */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); + /* Take EISA into account */ + return eisa_irq_trigger(idx, bus, trigger); + case 1: + return IOAPIC_EDGE; + case 2: + pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); + case 3: + default: /* Pointless default required due to do gcc stupidity */ + return IOAPIC_LEVEL; + } +} + +void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, + int trigger, int polarity) +{ + init_irq_alloc_info(info, NULL); + info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info->ioapic_node = node; + info->ioapic_trigger = trigger; + info->ioapic_polarity = polarity; + info->ioapic_valid = 1; +} + +#ifndef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); #endif - break; - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - pr_warn("broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - pr_warn("broken BIOS!!\n"); - trigger = 0; - break; + +static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, + struct irq_alloc_info *src, + u32 gsi, int ioapic_idx, int pin) +{ + int trigger, polarity; + + copy_irq_alloc_info(dst, src); + dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; + dst->ioapic_id = mpc_ioapic_id(ioapic_idx); + dst->ioapic_pin = pin; + dst->ioapic_valid = 1; + if (src && src->ioapic_valid) { + dst->ioapic_node = src->ioapic_node; + dst->ioapic_trigger = src->ioapic_trigger; + dst->ioapic_polarity = src->ioapic_polarity; + } else { + dst->ioapic_node = NUMA_NO_NODE; + if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { + dst->ioapic_trigger = trigger; + dst->ioapic_polarity = polarity; + } else { + /* + * PCI interrupts are always active low level + * triggered. + */ + dst->ioapic_trigger = IOAPIC_LEVEL; + dst->ioapic_polarity = IOAPIC_POL_LOW; } } - return trigger; } -static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin) +static int ioapic_alloc_attr_node(struct irq_alloc_info *info) +{ + return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE; +} + +static void mp_register_handler(unsigned int irq, unsigned long trigger) { + irq_flow_handler_t hdl; + bool fasteoi; + + if (trigger) { + irq_set_status_flags(irq, IRQ_LEVEL); + fasteoi = true; + } else { + irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } + + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge"); +} + +static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) +{ + struct mp_chip_data *data = irq_get_chip_data(irq); + + /* + * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger + * and polarity attirbutes. So allow the first user to reprogram the + * pin with real trigger and polarity attributes. 
+ */ + if (irq < nr_legacy_irqs() && data->count == 1) { + if (info->ioapic_trigger != data->trigger) + mp_register_handler(irq, info->ioapic_trigger); + data->entry.trigger = data->trigger = info->ioapic_trigger; + data->entry.polarity = data->polarity = info->ioapic_polarity; + } + + return data->trigger == info->ioapic_trigger && + data->polarity == info->ioapic_polarity; +} + +static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, + struct irq_alloc_info *info) +{ + bool legacy = false; int irq = -1; - int ioapic = (int)(long)domain->host_data; int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { case IOAPIC_DOMAIN_LEGACY: /* - * Dynamically allocate IRQ number for non-ISA IRQs in the first 16 - * GSIs on some weird platforms. + * Dynamically allocate IRQ number for non-ISA IRQs in the first + * 16 GSIs on some weird platforms. */ - if (gsi < nr_legacy_irqs()) - irq = irq_create_mapping(domain, pin); - else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) + if (!ioapic_initialized || gsi >= nr_legacy_irqs()) irq = gsi; + legacy = mp_is_legacy_irq(irq); break; case IOAPIC_DOMAIN_STRICT: - if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0) - irq = gsi; + irq = gsi; break; case IOAPIC_DOMAIN_DYNAMIC: - irq = irq_create_mapping(domain, pin); break; default: WARN(1, "ioapic: unknown irqdomain type %d\n", type); - break; + return -1; + } + + return __irq_domain_alloc_irqs(domain, irq, 1, + ioapic_alloc_attr_node(info), + info, legacy); +} + +/* + * Need special handling for ISA IRQs because there may be multiple IOAPIC pins + * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping + * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are + * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). + * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and + * some BIOSes may use MP Interrupt Source records to override IRQ numbers for + * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be + * multiple pins sharing the same legacy IRQ number when ACPI is disabled. + */ +static int alloc_isa_irq_from_domain(struct irq_domain *domain, + int irq, int ioapic, int pin, + struct irq_alloc_info *info) +{ + struct mp_chip_data *data; + struct irq_data *irq_data = irq_get_irq_data(irq); + int node = ioapic_alloc_attr_node(info); + + /* + * Legacy ISA IRQ has already been allocated, just add pin to + * the pin list assoicated with this IRQ and program the IOAPIC + * entry. The IOAPIC entry + */ + if (irq_data && irq_data->parent_data) { + if (!mp_check_pin_attr(irq, info)) + return -EBUSY; + if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, + info->ioapic_pin)) + return -ENOMEM; + } else { + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + if (irq >= 0) { + irq_data = irq_domain_get_irq_data(domain, irq); + data = irq_data->chip_data; + data->isa_irq = true; + } } - return irq > 0 ? 
irq : -1; + return irq; } static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, - unsigned int flags) + unsigned int flags, struct irq_alloc_info *info) { int irq; + bool legacy = false; + struct irq_alloc_info tmp; + struct mp_chip_data *data; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); - struct mp_pin_info *info = mp_pin_info(ioapic, pin); if (!domain) - return -1; + return -ENOSYS; - mutex_lock(&ioapic_mutex); - - /* - * Don't use irqdomain to manage ISA IRQs because there may be - * multiple IOAPIC pins sharing the same ISA IRQ number and - * irqdomain only supports 1:1 mapping between IOAPIC pin and - * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used - * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H). - * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are - * available, and some BIOSes may use MP Interrupt Source records - * to override IRQ numbers for PIRQs instead of reprogramming - * the interrupt routing logic. Thus there may be multiple pins - * sharing the same legacy IRQ number when ACPI is disabled. - */ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; - if (flags & IOAPIC_MAP_ALLOC) { - if (info->count == 0 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; + legacy = mp_is_legacy_irq(irq); + } - /* special handling for timer IRQ0 */ + mutex_lock(&ioapic_mutex); + if (!(flags & IOAPIC_MAP_ALLOC)) { + if (!legacy) { + irq = irq_find_mapping(domain, pin); if (irq == 0) - info->count++; + irq = -ENOENT; } } else { - irq = irq_find_mapping(domain, pin); - if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC)) - irq = alloc_irq_from_domain(domain, gsi, pin); - } - - if (flags & IOAPIC_MAP_ALLOC) { - /* special handling for legacy IRQs */ - if (irq < nr_legacy_irqs() && info->count == 1 && - mp_irqdomain_map(domain, irq, pin) != 0) - irq = -1; - - if (irq > 0) - info->count++; - else if (info->count == 0) - info->set = 0; + ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin); + if (legacy) + irq = alloc_isa_irq_from_domain(domain, irq, + ioapic, pin, &tmp); + else if ((irq = irq_find_mapping(domain, pin)) == 0) + irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp); + else if (!mp_check_pin_attr(irq, &tmp)) + irq = -EBUSY; + if (irq >= 0) { + data = irq_get_chip_data(irq); + data->count++; + } } - mutex_unlock(&ioapic_mutex); - return irq > 0 ? 
irq : -1; + return irq; } static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) @@ -1058,10 +1098,10 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) } #endif - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL); } -int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) +int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info) { int ioapic, pin, idx; @@ -1074,31 +1114,24 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) if ((flags & IOAPIC_MAP_CHECK) && idx < 0) return -1; - return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags); + return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info); } void mp_unmap_irq(int irq) { - struct irq_data *data = irq_get_irq_data(irq); - struct mp_pin_info *info; - int ioapic, pin; + struct irq_data *irq_data = irq_get_irq_data(irq); + struct mp_chip_data *data; - if (!data || !data->domain) + if (!irq_data || !irq_data->domain) return; - ioapic = (int)(long)data->domain->host_data; - pin = (int)data->hwirq; - info = mp_pin_info(ioapic, pin); + data = irq_data->chip_data; + if (!data || data->isa_irq) + return; mutex_lock(&ioapic_mutex); - if (--info->count == 0) { - info->set = 0; - if (irq < nr_legacy_irqs() && - ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY) - mp_irqdomain_unmap(data->domain, irq); - else - irq_dispose_mapping(irq); - } + if (--data->count == 0) + irq_domain_free_irqs(irq, 1); mutex_unlock(&ioapic_mutex); } @@ -1165,7 +1198,7 @@ out: } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -static struct irq_chip ioapic_chip; +static struct irq_chip ioapic_chip, ioapic_ir_chip; #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) @@ -1189,96 +1222,6 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, - unsigned long trigger) -{ - struct irq_chip *chip = &ioapic_chip; - irq_flow_handler_t hdl; - bool fasteoi; - - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) { - irq_set_status_flags(irq, IRQ_LEVEL); - fasteoi = true; - } else { - irq_clear_status_flags(irq, IRQ_LEVEL); - fasteoi = false; - } - - if (setup_remapped_irq(irq, cfg, chip)) - fasteoi = trigger != 0; - - hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; - irq_set_chip_and_handler_name(irq, chip, hdl, - fasteoi ? "fasteoi" : "edge"); -} - -int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - memset(entry, 0, sizeof(*entry)); - - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->dest = destination; - entry->vector = vector; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* - * Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
- */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - -static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, - struct io_apic_irq_attr *attr) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - if (!IO_APIC_IRQ(irq)) - return; - - if (assign_irq_vector(irq, cfg, apic->target_cpus())) - return; - - if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), - &dest)) { - pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i Dest:%d)\n", - attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, - cfg->vector, irq, attr->trigger, attr->polarity, dest); - - if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) { - pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - clear_irq_vector(irq, cfg); - - return; - } - - ioapic_register_intr(irq, cfg, attr->trigger); - if (irq < nr_legacy_irqs()) - legacy_pic->mask(irq); - - ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); -} - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; @@ -1298,106 +1241,41 @@ static void __init setup_IO_APIC_irqs(void) } } -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, - unsigned int pin, int vector) -{ - struct IO_APIC_route_entry entry; - unsigned int dest; - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), - apic->target_cpus(), &dest))) - dest = BAD_APICID; - - entry.dest_mode = apic->irq_dest_mode; - entry.mask = 0; /* don't mask IRQ for edge */ - entry.dest = dest; - entry.delivery_mode = apic->irq_delivery_mode; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... 
- */ - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_idx, pin, entry); -} - -void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries) +void ioapic_zap_locks(void) { - int i; - - pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n"); - - for (i = 0; i <= nr_entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - pr_debug(" %02x %02X ", i, entry.dest); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector); - } + raw_spin_lock_init(&ioapic_lock); } -void intel_ir_io_apic_print_entries(unsigned int apic, - unsigned int nr_entries) +static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { int i; + char buf[256]; + struct IO_APIC_route_entry entry; + struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; - pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n"); - + printk(KERN_DEBUG "IOAPIC %d:\n", apic); for (i = 0; i <= nr_entries; i++) { - struct IR_IO_APIC_route_entry *ir_entry; - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, i); - - ir_entry = (struct IR_IO_APIC_route_entry *)&entry; - - pr_debug(" %02x %04X ", i, ir_entry->index); - pr_cont("%1d %1d %1d %1d %1d " - "%1d %1d %X %02X\n", - ir_entry->format, - ir_entry->mask, - ir_entry->trigger, - ir_entry->irr, - ir_entry->polarity, - ir_entry->delivery_status, - ir_entry->index2, - ir_entry->zero, - ir_entry->vector); + snprintf(buf, sizeof(buf), + " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", + i, + entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", + entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", + entry.polarity == IOAPIC_POL_LOW ? "low " : "high", + entry.vector, entry.irr, entry.delivery_status); + if (ir_entry->format) + printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", + buf, (ir_entry->index << 15) | ir_entry->index, + ir_entry->zero); + else + printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", + buf, + entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? + "logical " : "physical", + entry.dest, entry.delivery_mode); } } -void ioapic_zap_locks(void) -{ - raw_spin_lock_init(&ioapic_lock); -} - static void __init print_IO_APIC(int ioapic_idx) { union IO_APIC_reg_00 reg_00; @@ -1451,16 +1329,13 @@ static void __init print_IO_APIC(int ioapic_idx) } printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); - - x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries); + io_apic_print_entries(ioapic_idx, reg_01.bits.entries); } void __init print_IO_APICs(void) { int ioapic_idx; - struct irq_cfg *cfg; unsigned int irq; - struct irq_chip *chip; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for_each_ioapic(ioapic_idx) @@ -1480,18 +1355,20 @@ void __init print_IO_APICs(void) printk(KERN_DEBUG "IRQ to pin mappings:\n"); for_each_active_irq(irq) { struct irq_pin_list *entry; + struct irq_chip *chip; + struct mp_chip_data *data; chip = irq_get_chip(irq); - if (chip != &ioapic_chip) + if (chip != &ioapic_chip && chip != &ioapic_ir_chip) continue; - - cfg = irq_cfg(irq); - if (!cfg) + data = irq_get_chip_data(irq); + if (!data) continue; - if (list_empty(&cfg->irq_2_pin)) + if (list_empty(&data->irq_2_pin)) continue; + printk(KERN_DEBUG "IRQ%d ", irq); - for_each_irq_pin(entry, cfg->irq_2_pin) + for_each_irq_pin(entry, data->irq_2_pin) pr_cont("-> %d:%d", entry->apic, entry->pin); pr_cont("\n"); } @@ -1564,15 +1441,12 @@ void native_disable_io_apic(void) struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = read_apic_id(); + entry.mask = IOAPIC_UNMASKED; + entry.trigger = IOAPIC_EDGE; + entry.polarity = IOAPIC_POL_HIGH; + entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry.delivery_mode = dest_ExtINT; + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1582,7 +1456,6 @@ void native_disable_io_apic(void) if (cpu_has_apic || apic_from_smp_config()) disconnect_bsp_APIC(ioapic_i8259.pin != -1); - } /* @@ -1792,7 +1665,6 @@ static int __init timer_irq_works(void) * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... */ - static unsigned int startup_ioapic_irq(struct irq_data *data) { int was_pending = 0, irq = data->irq; @@ -1804,74 +1676,22 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) if (legacy_pic->irq_pending(irq)) was_pending = 1; } - __unmask_ioapic(irqd_cfg(data)); + __unmask_ioapic(data->chip_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. 
- */ - -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - - apic = entry->apic; - pin = entry->pin; - - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - } -} - -int native_ioapic_set_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force) -{ - unsigned int dest, irq = data->irq; - unsigned long flags; - int ret; - - if (!config_enabled(CONFIG_SMP)) - return -EPERM; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = apic_set_affinity(data, mask, &dest); - if (!ret) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, irqd_cfg(data)); - ret = IRQ_SET_MASK_OK_NOCOPY; - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; -} - atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ -static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +static bool io_apic_level_ack_pending(struct mp_chip_data *data) { struct irq_pin_list *entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, data->irq_2_pin) { unsigned int reg; int pin; @@ -1888,19 +1708,18 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) return false; } -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { /* If we are moving the irq we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data) && !irqd_irq_inprogress(data))) { - mask_ioapic(cfg); + mask_ioapic_irq(data); return true; } return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { if (unlikely(masked)) { /* Only migrate the irq if the ack has been received. @@ -1929,31 +1748,30 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. 
*/ - if (!io_apic_level_ack_pending(cfg)) + if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic(cfg); + unmask_ioapic_irq(data); } } #else -static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +static inline bool ioapic_irqd_mask(struct irq_data *data) { return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, - struct irq_cfg *cfg, bool masked) +static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) { } #endif -static void ack_ioapic_level(struct irq_data *data) +static void ioapic_ack_level(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); - int i, irq = data->irq; + struct irq_cfg *cfg = irqd_cfg(irq_data); unsigned long v; bool masked; + int i; irq_complete_move(cfg); - masked = ioapic_irqd_mask(data, cfg); + masked = ioapic_irqd_mask(irq_data); /* * It appears there is an erratum which affects at least version 0x11 @@ -2005,11 +1823,49 @@ static void ack_ioapic_level(struct irq_data *data) */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); + eoi_ioapic_pin(cfg->vector, irq_data->chip_data); + } + + ioapic_irqd_unmask(irq_data, masked); +} + +static void ioapic_ir_ack_level(struct irq_data *irq_data) +{ + struct mp_chip_data *data = irq_data->chip_data; + + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + ack_APIC_irq(); + eoi_ioapic_pin(data->entry.vector, data); +} + +static int ioapic_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irq_data->parent_data; + struct mp_chip_data *data = irq_data->chip_data; + struct irq_pin_list *entry; + struct irq_cfg *cfg; + unsigned long flags; + int ret; - eoi_ioapic_irq(irq, cfg); + ret = parent->chip->irq_set_affinity(parent, mask, force); + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + cfg = irqd_cfg(irq_data); + data->entry.dest = cfg->dest_apicid; + data->entry.vector = cfg->vector; + for_each_irq_pin(entry, data->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, + data->entry); } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); - ioapic_irqd_unmask(data, cfg, masked); + return ret; } static struct irq_chip ioapic_chip __read_mostly = { @@ -2017,10 +1873,20 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_startup = startup_ioapic_irq, .irq_mask = mask_ioapic_irq, .irq_unmask = unmask_ioapic_irq, - .irq_ack = apic_ack_edge, - .irq_eoi = ack_ioapic_level, - .irq_set_affinity = native_ioapic_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ack_level, + .irq_set_affinity = ioapic_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static struct irq_chip ioapic_ir_chip __read_mostly = { + .name = "IR-IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = irq_chip_ack_parent, + .irq_eoi = ioapic_ir_ack_level, + .irq_set_affinity = ioapic_set_affinity, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2114,12 +1980,12 @@ static inline void __init unlock_ExtINT_logic(void) memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ + entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry1.mask = IOAPIC_UNMASKED; entry1.dest = hard_smp_processor_id(); 
entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; - entry1.trigger = 0; + entry1.trigger = IOAPIC_EDGE; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2153,6 +2019,25 @@ static int __init disable_timer_pin_setup(char *arg) } early_param("disable_timer_pin_1", disable_timer_pin_setup); +static int mp_alloc_timer_irq(int ioapic, int pin) +{ + int irq = -1; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + + if (domain) { + struct irq_alloc_info info; + + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); + info.ioapic_id = mpc_ioapic_id(ioapic); + info.ioapic_pin = pin; + mutex_lock(&ioapic_mutex); + irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); + mutex_unlock(&ioapic_mutex); + } + + return irq; +} + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2163,7 +2048,9 @@ early_param("disable_timer_pin_1", disable_timer_pin_setup); */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg(0); + struct irq_data *irq_data = irq_get_irq_data(0); + struct mp_chip_data *data = irq_data->chip_data; + struct irq_cfg *cfg = irqd_cfg(irq_data); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2175,7 +2062,6 @@ static inline void __init check_timer(void) * get/set the timer IRQ vector: */ legacy_pic->mask(0); - assign_irq_vector(0, cfg, apic->target_cpus()); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2216,23 +2102,21 @@ static inline void __init check_timer(void) } if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ + /* Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq_node(cfg, node, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + mp_alloc_timer_irq(apic1, pin1); } else { - /* for edge trigger, setup_ioapic_irq already - * leave it unmasked. + /* + * for edge trigger, it's already unmasked, * so only need to unmask if it is level-trigger * do we really have level trigger timer? */ int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic(cfg); + unmask_ioapic_irq(irq_get_chip_data(0)); } + irq_domain_activate_irq(irq_data); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2252,8 +2136,8 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); + irq_domain_activate_irq(irq_data); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... 
works.\n"); @@ -2330,36 +2214,35 @@ out: static int mp_irqdomain_create(int ioapic) { - size_t size; + struct irq_alloc_info info; + struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); - size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic); - ip->pin_info = kzalloc(size, GFP_KERNEL); - if (!ip->pin_info) - return -ENOMEM; - if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_IOAPIC; + info.ioapic_id = mpc_ioapic_id(ioapic); + parent = irq_remapping_get_ir_irq_domain(&info); + if (!parent) + parent = x86_vector_domain; + ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, (void *)(long)ioapic); - if(!ip->irqdomain) { - kfree(ip->pin_info); - ip->pin_info = NULL; + if (!ip->irqdomain) return -ENOMEM; - } + + ip->irqdomain->parent = parent; if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT) ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1); - if (gsi_cfg->gsi_base == 0) - irq_set_default_host(ip->irqdomain); - return 0; } @@ -2369,8 +2252,6 @@ static void ioapic_destroy_irqdomain(int idx) irq_domain_remove(ioapics[idx].irqdomain); ioapics[idx].irqdomain = NULL; } - kfree(ioapics[idx].pin_info); - ioapics[idx].pin_info = NULL; } void __init setup_IO_APIC(void) @@ -2400,20 +2281,6 @@ void __init setup_IO_APIC(void) ioapic_initialized = 1; } -/* - * Called after all the initialization is done. If we didn't find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - static void resume_ioapic_id(int ioapic_idx) { unsigned long flags; @@ -2452,20 +2319,6 @@ static int __init ioapic_init_ops(void) device_initcall(ioapic_init_ops); -static int -io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) -{ - struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); - int ret; - - if (!cfg) - return -EINVAL; - ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); - if (!ret) - setup_ioapic_irq(irq, cfg, attr); - return ret; -} - static int io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; @@ -2669,7 +2522,9 @@ void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; const struct cpumask *mask; + struct irq_desc *desc; struct irq_data *idata; + struct irq_chip *chip; if (skip_ioapic_setup == 1) return; @@ -2683,19 +2538,24 @@ void __init setup_ioapic_dest(void) if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq)) continue; - idata = irq_get_irq_data(irq); + desc = irq_to_desc(irq); + raw_spin_lock_irq(&desc->lock); + idata = irq_desc_get_irq_data(desc); /* * Honour affinities which have been set in early boot */ if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) - mask = idata->affinity; + mask = irq_data_get_affinity_mask(idata); else mask = apic->target_cpus(); - x86_io_apic_ops.set_affinity(idata, mask, false); + chip = irq_data_get_irq_chip(idata); + /* Might be lapic_chip for irq 0 */ + if (chip->irq_set_affinity) + chip->irq_set_affinity(idata, mask, false); + raw_spin_unlock_irq(&desc->lock); } - } #endif @@ -2738,7 +2598,7 @@ static struct resource * __init ioapic_setup_resources(void) return res; } -void __init native_io_apic_init_mappings(void) +void __init 
io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; @@ -2963,7 +2823,6 @@ int mp_unregister_ioapic(u32 gsi_base) { int ioapic, pin; int found = 0; - struct mp_pin_info *pin_info; for_each_ioapic(ioapic) if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { @@ -2976,11 +2835,17 @@ int mp_unregister_ioapic(u32 gsi_base) } for_each_pin(ioapic, pin) { - pin_info = mp_pin_info(ioapic, pin); - if (pin_info->count) { - pr_warn("pin%d on IOAPIC%d is still in use.\n", - pin, ioapic); - return -EBUSY; + u32 gsi = mp_pin_to_gsi(ioapic, pin); + int irq = mp_map_gsi_to_irq(gsi, 0, NULL); + struct mp_chip_data *data; + + if (irq >= 0) { + data = irq_get_chip_data(irq); + if (data && data->count) { + pr_warn("pin%d on IOAPIC%d is still in use.\n", + pin, ioapic); + return -EBUSY; + } } } @@ -3007,108 +2872,145 @@ int mp_ioapic_registered(u32 gsi_base) return 0; } -static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, - int ioapic, int ioapic_pin, - int trigger, int polarity) +static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->ioapic_valid) { + data->trigger = info->ioapic_trigger; + data->polarity = info->ioapic_polarity; + } else if (acpi_get_override_irq(gsi, &data->trigger, + &data->polarity) < 0) { + /* PCI interrupts are always active low level triggered. */ + data->trigger = IOAPIC_LEVEL; + data->polarity = IOAPIC_POL_LOW; + } +} + +static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, + struct IO_APIC_route_entry *entry) { - irq_attr->ioapic = ioapic; - irq_attr->ioapic_pin = ioapic_pin; - irq_attr->trigger = trigger; - irq_attr->polarity = polarity; + memset(entry, 0, sizeof(*entry)); + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->dest = cfg->dest_apicid; + entry->vector = cfg->vector; + entry->trigger = data->trigger; + entry->polarity = data->polarity; + /* + * Mask level triggered irqs. Edge triggered irqs are masked + * by the irq core code in case they fire. + */ + if (data->trigger == IOAPIC_LEVEL) + entry->mask = IOAPIC_MASKED; + else + entry->mask = IOAPIC_UNMASKED; } -int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) +int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - int ioapic = (int)(long)domain->host_data; - struct mp_pin_info *info = mp_pin_info(ioapic, hwirq); - struct io_apic_irq_attr attr; + int ret, ioapic, pin; + struct irq_cfg *cfg; + struct irq_data *irq_data; + struct mp_chip_data *data; + struct irq_alloc_info *info = arg; + unsigned long flags; - /* Get default attribute if not set by caller yet */ - if (!info->set) { - u32 gsi = mp_pin_to_gsi(ioapic, hwirq); + if (!info || nr_irqs > 1) + return -EINVAL; + irq_data = irq_domain_get_irq_data(domain, virq); + if (!irq_data) + return -EINVAL; - if (acpi_get_override_irq(gsi, &info->trigger, - &info->polarity) < 0) { - /* - * PCI interrupts are always polarity one level - * triggered. - */ - info->trigger = 1; - info->polarity = 1; - } - info->node = NUMA_NO_NODE; + ioapic = mp_irqdomain_ioapic_idx(domain); + pin = info->ioapic_pin; + if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) + return -EEXIST; - /* - * setup_IO_APIC_irqs() programs all legacy IRQs with default - * trigger and polarity attributes. 
Don't set the flag for that - * case so the first legacy IRQ user could reprogram the pin - * with real trigger and polarity attributes. - */ - if (virq >= nr_legacy_irqs() || info->count) - info->set = 1; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + info->ioapic_entry = &data->entry; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); + if (ret < 0) { + kfree(data); + return ret; } - set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger, - info->polarity); - return io_apic_setup_irq_pin(virq, info->node, &attr); -} + INIT_LIST_HEAD(&data->irq_2_pin); + irq_data->hwirq = info->ioapic_pin; + irq_data->chip = (domain->parent == x86_vector_domain) ? + &ioapic_chip : &ioapic_ir_chip; + irq_data->chip_data = data; + mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); -void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) -{ - struct irq_data *data = irq_get_irq_data(virq); - struct irq_cfg *cfg = irq_cfg(virq); - int ioapic = (int)(long)domain->host_data; - int pin = (int)data->hwirq; + cfg = irqd_cfg(irq_data); + add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); - ioapic_mask_entry(ioapic, pin); - __remove_pin_from_irq(cfg, ioapic, pin); - WARN_ON(!list_empty(&cfg->irq_2_pin)); - arch_teardown_hwirq(virq); -} + local_irq_save(flags); + if (info->ioapic_entry) + mp_setup_entry(cfg, data, info->ioapic_entry); + mp_register_handler(virq, data->trigger); + if (virq < nr_legacy_irqs()) + legacy_pic->mask(virq); + local_irq_restore(flags); -int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) -{ - int ret = 0; - int ioapic, pin; - struct mp_pin_info *info; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", + ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, + virq, data->trigger, data->polarity, cfg->dest_apicid); - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return -ENODEV; + return 0; +} - pin = mp_find_ioapic_pin(ioapic, gsi); - info = mp_pin_info(ioapic, pin); - trigger = trigger ? 1 : 0; - polarity = polarity ? 
1 : 0; +void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *irq_data; + struct mp_chip_data *data; - mutex_lock(&ioapic_mutex); - if (!info->set) { - info->trigger = trigger; - info->polarity = polarity; - info->node = node; - info->set = 1; - } else if (info->trigger != trigger || info->polarity != polarity) { - ret = -EBUSY; + BUG_ON(nr_irqs != 1); + irq_data = irq_domain_get_irq_data(domain, virq); + if (irq_data && irq_data->chip_data) { + data = irq_data->chip_data; + __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); + WARN_ON(!list_empty(&data->irq_2_pin)); + kfree(irq_data->chip_data); } - mutex_unlock(&ioapic_mutex); - - return ret; + irq_domain_free_irqs_top(domain, virq, nr_irqs); } -/* Enable IOAPIC early just for system timer */ -void __init pre_init_apic_IRQ0(void) +void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; + unsigned long flags; + struct irq_pin_list *entry; + struct mp_chip_data *data = irq_data->chip_data; - printk(KERN_INFO "Early APIC setup for system timer0\n"); -#ifndef CONFIG_SMP - physid_set_mask_of_physid(boot_cpu_physical_apicid, - &phys_cpu_present_map); -#endif - setup_local_APIC(); + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, data->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, data->entry); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} - io_apic_setup_irq_pin(0, 0, &attr); - irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, - "edge"); +void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + /* It won't be called for IRQ with multiple IOAPIC pins associated */ + ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), + (int)irq_data->hwirq); +} + +int mp_irqdomain_ioapic_idx(struct irq_domain *domain) +{ + return (int)(long)domain->host_data; } + +const struct irq_domain_ops mp_ioapic_irqdomain_ops = { + .alloc = mp_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, +}; diff --git a/kernel/arch/x86/kernel/apic/msi.c b/kernel/arch/x86/kernel/apic/msi.c index d6ba2d660..5f1feb685 100644 --- a/kernel/arch/x86/kernel/apic/msi.c +++ b/kernel/arch/x86/kernel/apic/msi.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. 
+ * Jiang Liu <jiang.liu@linux.intel.com> + * Convert to hierarchical irqdomain * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -14,22 +16,23 @@ #include <linux/dmar.h> #include <linux/hpet.h> #include <linux/msi.h> +#include <asm/irqdomain.h> #include <asm/msidef.h> #include <asm/hpet.h> #include <asm/hw_irq.h> #include <asm/apic.h> #include <asm/irq_remapping.h> -void native_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) +static struct irq_domain *msi_default_domain; + +static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_cfg *cfg = irqd_cfg(data); msg->address_hi = MSI_ADDR_BASE_HI; if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest); + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); msg->address_lo = MSI_ADDR_BASE_LO | @@ -39,7 +42,7 @@ void native_compose_msi_msg(struct pci_dev *pdev, ((apic->irq_delivery_mode != dest_LowestPrio) ? MSI_ADDR_REDIRECTION_CPU : MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | @@ -50,180 +53,201 @@ void native_compose_msi_msg(struct pci_dev *pdev, MSI_DATA_VECTOR(cfg->vector); } -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, - struct msi_msg *msg, u8 hpet_id) +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip pci_msi_controller = { + .name = "PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - struct irq_cfg *cfg; - int err; - unsigned dest; + struct irq_domain *domain; + struct irq_alloc_info info; - if (disable_apic) - return -ENXIO; + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_MSI; + info.msi_dev = dev; - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; + domain = irq_remapping_get_irq_domain(&info); + if (domain == NULL) + domain = msi_default_domain; + if (domain == NULL) + return -ENOSYS; - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; + return pci_msi_domain_alloc_irqs(domain, dev, nvec, type); +} - x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id); +void native_teardown_msi_irq(unsigned int irq) +{ + irq_domain_free_irqs(irq, 1); +} - return 0; +static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->msi_hwirq; } -static int -msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) +static int pci_msi_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) { - struct irq_cfg *cfg = irqd_cfg(data); - struct msi_msg msg; - unsigned int dest; - int ret; + struct pci_dev *pdev = to_pci_dev(dev); + struct msi_desc *desc = first_pci_msi_entry(pdev); + + init_irq_alloc_info(arg, NULL); + arg->msi_dev = pdev; + if (desc->msi_attrib.is_msix) { + arg->type = X86_IRQ_ALLOC_TYPE_MSIX; + } else { + arg->type = X86_IRQ_ALLOC_TYPE_MSI; + arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + } - ret = 
apic_set_affinity(data, mask, &dest); - if (ret) - return ret; + return 0; +} - __get_cached_msi_msg(data->msi_desc, &msg); +static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +{ + arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); +} + +static struct msi_domain_ops pci_msi_domain_ops = { + .get_hwirq = pci_msi_get_hwirq, + .msi_prepare = pci_msi_prepare, + .set_desc = pci_msi_set_desc, +}; - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); +static struct msi_domain_info pci_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; - __pci_write_msi_msg(data->msi_desc, &msg); +void arch_init_msi_domain(struct irq_domain *parent) +{ + if (disable_apic) + return; - return IRQ_SET_MASK_OK_NOCOPY; + msi_default_domain = pci_msi_create_irq_domain(NULL, + &pci_msi_domain_info, parent); + if (!msi_default_domain) + pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); } -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", +#ifdef CONFIG_IRQ_REMAP +static struct irq_chip pci_msi_ir_controller = { + .name = "IR-PCI-MSI", .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, - .irq_ack = apic_ack_edge, - .irq_set_affinity = msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, .flags = IRQCHIP_SKIP_SET_WAKE, }; -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset) -{ - struct irq_chip *chip = &msi_chip; - struct msi_msg msg; - unsigned int irq = irq_base + irq_offset; - int ret; - - ret = msi_compose_msg(dev, irq, &msg, -1); - if (ret < 0) - return ret; - - irq_set_msi_desc_off(irq_base, irq_offset, msidesc); - - /* - * MSI-X message is written per-IRQ, the offset is always 0. - * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. 
- */ - if (!irq_offset) - pci_write_msi_msg(irq, &msg); +static struct msi_domain_info pci_msi_ir_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX, + .ops = &pci_msi_domain_ops, + .chip = &pci_msi_ir_controller, + .handler = handle_edge_irq, + .handler_name = "edge", +}; - setup_remapped_irq(irq, irq_cfg(irq), chip); +struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) +{ + return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent); +} +#endif - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); +#ifdef CONFIG_DMAR_TABLE +static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + dmar_msi_write(data->irq, msg); +} - dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq); +static struct irq_chip dmar_msi_controller = { + .name = "DMAR-MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = dmar_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; - return 0; +static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->dmar_id; } -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +static int dmar_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - struct msi_desc *msidesc; - unsigned int irq; - int node, ret; + irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL, + handle_edge_irq, arg->dmar_data, "edge"); - /* Multiple MSI vectors only supported with interrupt remapping */ - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; + return 0; +} - node = dev_to_node(&dev->dev); +static struct msi_domain_ops dmar_msi_domain_ops = { + .get_hwirq = dmar_msi_get_hwirq, + .msi_init = dmar_msi_init, +}; - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = irq_alloc_hwirq(node); - if (!irq) - return -ENOSPC; +static struct msi_domain_info dmar_msi_domain_info = { + .ops = &dmar_msi_domain_ops, + .chip = &dmar_msi_controller, +}; - ret = setup_msi_irq(dev, msidesc, irq, 0); - if (ret < 0) { - irq_free_hwirq(irq); - return ret; - } +static struct irq_domain *dmar_get_irq_domain(void) +{ + static struct irq_domain *dmar_domain; + static DEFINE_MUTEX(dmar_lock); - } - return 0; -} + mutex_lock(&dmar_lock); + if (dmar_domain == NULL) + dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info, + x86_vector_domain); + mutex_unlock(&dmar_lock); -void native_teardown_msi_irq(unsigned int irq) -{ - irq_free_hwirq(irq); + return dmar_domain; } -#ifdef CONFIG_DMAR_TABLE -static int -dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) +int dmar_alloc_hwirq(int id, int node, void *arg) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest, irq = data->irq; - struct msi_msg msg; - int ret; - - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; + struct irq_domain *domain = dmar_get_irq_domain(); + struct irq_alloc_info info; - dmar_msi_read(irq, &msg); + if (!domain) + return -1; - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); + 
init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_DMAR; + info.dmar_id = id; + info.dmar_data = arg; - dmar_msi_write(irq, &msg); - - return IRQ_SET_MASK_OK_NOCOPY; + return irq_domain_alloc_irqs(domain, 1, node, &info); } -static struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .irq_unmask = dmar_msi_unmask, - .irq_mask = dmar_msi_mask, - .irq_ack = apic_ack_edge, - .irq_set_affinity = dmar_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -int arch_setup_dmar_msi(unsigned int irq) +void dmar_free_hwirq(int irq) { - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(NULL, irq, &msg, -1); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; + irq_domain_free_irqs(irq, 1); } #endif @@ -231,56 +255,103 @@ int arch_setup_dmar_msi(unsigned int irq) * MSI message composition */ #ifdef CONFIG_HPET_TIMER +static inline int hpet_dev_id(struct irq_domain *domain) +{ + struct msi_domain_info *info = msi_get_domain_info(domain); + + return (int)(long)info->data; +} -static int hpet_msi_set_affinity(struct irq_data *data, - const struct cpumask *mask, bool force) +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { - struct irq_cfg *cfg = irqd_cfg(data); - struct msi_msg msg; - unsigned int dest; - int ret; + hpet_msi_write(irq_data_get_irq_handler_data(data), msg); +} - ret = apic_set_affinity(data, mask, &dest); - if (ret) - return ret; +static struct irq_chip hpet_msi_controller = { + .name = "HPET-MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_write_msi_msg = hpet_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; - hpet_msi_read(data->handler_data, &msg); +static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->hpet_index; +} - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL, + handle_edge_irq, arg->hpet_data, "edge"); - hpet_msi_write(data->handler_data, &msg); + return 0; +} - return IRQ_SET_MASK_OK_NOCOPY; +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); } -static struct irq_chip hpet_msi_type = { - .name = "HPET_MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, - .irq_ack = apic_ack_edge, - .irq_set_affinity = hpet_msi_set_affinity, - .irq_retrigger = apic_retrigger_irq, - .flags = IRQCHIP_SKIP_SET_WAKE, +static struct msi_domain_ops hpet_msi_domain_ops = { + .get_hwirq = hpet_msi_get_hwirq, + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, }; -int default_setup_hpet_msi(unsigned int irq, unsigned int id) +struct irq_domain *hpet_create_irq_domain(int hpet_id) { - struct irq_chip *chip = 
&hpet_msi_type; - struct msi_msg msg; - int ret; + struct irq_domain *parent; + struct irq_alloc_info info; + struct msi_domain_info *domain_info; + + if (x86_vector_domain == NULL) + return NULL; + + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_id = hpet_id; + parent = irq_remapping_get_ir_irq_domain(&info); + if (parent == NULL) + parent = x86_vector_domain; + else + hpet_msi_controller.name = "IR-HPET-MSI"; + + return msi_create_irq_domain(NULL, domain_info, parent); +} - ret = msi_compose_msg(NULL, irq, &msg, id); - if (ret < 0) - return ret; +int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, + int dev_num) +{ + struct irq_alloc_info info; - hpet_msi_write(irq_get_handler_data(irq), &msg); - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - setup_remapped_irq(irq, irq_cfg(irq), chip); + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.hpet_data = dev; + info.hpet_id = hpet_dev_id(domain); + info.hpet_index = dev_num; - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); - return 0; + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); } #endif diff --git a/kernel/arch/x86/kernel/apic/probe_32.c b/kernel/arch/x86/kernel/apic/probe_32.c index bda488680..7694ae6c1 100644 --- a/kernel/arch/x86/kernel/apic/probe_32.c +++ b/kernel/arch/x86/kernel/apic/probe_32.c @@ -111,7 +111,6 @@ static struct apic apic_default = { .send_IPI_all = default_send_IPI_all, .send_IPI_self = default_send_IPI_self, - .wait_for_init_deassert = true, .inquire_remote_apic = default_inquire_remote_apic, .read = native_apic_mem_read, diff --git a/kernel/arch/x86/kernel/apic/vector.c b/kernel/arch/x86/kernel/apic/vector.c index 6cedd7914..a35f6b547 100644 --- a/kernel/arch/x86/kernel/apic/vector.c +++ b/kernel/arch/x86/kernel/apic/vector.c @@ -3,6 +3,8 @@ * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. 
+ * Jiang Liu <jiang.liu@linux.intel.com> + * Enable support of hierarchical irqdomains * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -11,15 +13,28 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compiler.h> -#include <linux/irqdomain.h> #include <linux/slab.h> +#include <asm/irqdomain.h> #include <asm/hw_irq.h> #include <asm/apic.h> #include <asm/i8259.h> #include <asm/desc.h> #include <asm/irq_remapping.h> +struct apic_chip_data { + struct irq_cfg cfg; + cpumask_var_t domain; + cpumask_var_t old_domain; + u8 move_in_progress : 1; +}; + +struct irq_domain *x86_vector_domain; static DEFINE_RAW_SPINLOCK(vector_lock); +static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask; +static struct irq_chip lapic_controller; +#ifdef CONFIG_X86_IO_APIC +static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; +#endif void lock_vector_lock(void) { @@ -34,71 +49,59 @@ void unlock_vector_lock(void) raw_spin_unlock(&vector_lock); } -struct irq_cfg *irq_cfg(unsigned int irq) +static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data) { - return irq_get_chip_data(irq); + if (!irq_data) + return NULL; + + while (irq_data->parent_data) + irq_data = irq_data->parent_data; + + return irq_data->chip_data; } struct irq_cfg *irqd_cfg(struct irq_data *irq_data) { - return irq_data->chip_data; + struct apic_chip_data *data = apic_chip_data(irq_data); + + return data ? &data->cfg : NULL; } -static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) +struct irq_cfg *irq_cfg(unsigned int irq) { - struct irq_cfg *cfg; + return irqd_cfg(irq_get_irq_data(irq)); +} - cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); - if (!cfg) +static struct apic_chip_data *alloc_apic_chip_data(int node) +{ + struct apic_chip_data *data; + + data = kzalloc_node(sizeof(*data), GFP_KERNEL, node); + if (!data) return NULL; - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) - goto out_cfg; - if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) + if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node)) + goto out_data; + if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node)) goto out_domain; -#ifdef CONFIG_X86_IO_APIC - INIT_LIST_HEAD(&cfg->irq_2_pin); -#endif - return cfg; + return data; out_domain: - free_cpumask_var(cfg->domain); -out_cfg: - kfree(cfg); + free_cpumask_var(data->domain); +out_data: + kfree(data); return NULL; } -struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) +static void free_apic_chip_data(struct apic_chip_data *data) { - int res = irq_alloc_desc_at(at, node); - struct irq_cfg *cfg; - - if (res < 0) { - if (res != -EEXIST) - return NULL; - cfg = irq_cfg(at); - if (cfg) - return cfg; + if (data) { + free_cpumask_var(data->domain); + free_cpumask_var(data->old_domain); + kfree(data); } - - cfg = alloc_irq_cfg(at, node); - if (cfg) - irq_set_chip_data(at, cfg); - else - irq_free_desc(at); - return cfg; -} - -static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) -{ - if (!cfg) - return; - irq_set_chip_data(at, NULL); - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); } -static int -__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int __assign_irq_vector(int irq, struct apic_chip_data *d, + const struct cpumask *mask) { /* * NOTE! 
The local APIC isn't very good at handling @@ -113,38 +116,47 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) */ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int current_offset = VECTOR_OFFSET_START % 16; - int cpu, err; - cpumask_var_t tmp_mask; + int cpu, vector; - if (cfg->move_in_progress) + /* + * If there is still a move in progress or the previous move has not + * been cleaned up completely, tell the caller to come back later. + */ + if (d->move_in_progress || + cpumask_intersects(d->old_domain, cpu_online_mask)) return -EBUSY; - if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) - return -ENOMEM; - /* Only try and allocate irqs on cpus that are present */ - err = -ENOSPC; - cpumask_clear(cfg->old_domain); + cpumask_clear(d->old_domain); + cpumask_clear(searched_cpumask); cpu = cpumask_first_and(mask, cpu_online_mask); while (cpu < nr_cpu_ids) { - int new_cpu, vector, offset; + int new_cpu, offset; + + /* Get the possible target cpus for @mask/@cpu from the apic */ + apic->vector_allocation_domain(cpu, vector_cpumask, mask); - apic->vector_allocation_domain(cpu, tmp_mask, mask); + /* + * Clear the offline cpus from @vector_cpumask for searching + * and verify whether the result overlaps with @mask. If true, + * then the call to apic->cpu_mask_to_apicid_and() will + * succeed as well. If not, no point in trying to find a + * vector in this mask. + */ + cpumask_and(vector_searchmask, vector_cpumask, cpu_online_mask); + if (!cpumask_intersects(vector_searchmask, mask)) + goto next_cpu; - if (cpumask_subset(tmp_mask, cfg->domain)) { - err = 0; - if (cpumask_equal(tmp_mask, cfg->domain)) - break; + if (cpumask_subset(vector_cpumask, d->domain)) { + if (cpumask_equal(vector_cpumask, d->domain)) + goto success; /* - * New cpumask using the vector is a proper subset of - * the current in use mask. So cleanup the vector - * allocation for the members that are not used anymore. + * Mark the cpus which are not longer in the mask for + * cleanup. */ - cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); - cpumask_and(cfg->domain, cfg->domain, tmp_mask); - break; + cpumask_andnot(d->old_domain, d->domain, vector_cpumask); + vector = d->cfg.vector; + goto update; } vector = current_vector; @@ -156,85 +168,211 @@ next: vector = FIRST_EXTERNAL_VECTOR + offset; } - if (unlikely(current_vector == vector)) { - cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); - cpumask_andnot(tmp_mask, mask, cfg->old_domain); - cpu = cpumask_first_and(tmp_mask, cpu_online_mask); - continue; - } + /* If the search wrapped around, try the next cpu */ + if (unlikely(current_vector == vector)) + goto next_cpu; if (test_bit(vector, used_vectors)) goto next; - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { - if (per_cpu(vector_irq, new_cpu)[vector] > - VECTOR_UNDEFINED) + for_each_cpu(new_cpu, vector_searchmask) { + if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector])) goto next; } /* Found one! 
*/ current_vector = vector; current_offset = offset; - if (cfg->vector) { - cpumask_copy(cfg->old_domain, cfg->domain); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); - } - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) - per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cpumask_copy(cfg->domain, tmp_mask); - err = 0; - break; + /* Schedule the old vector for cleanup on all cpus */ + if (d->cfg.vector) + cpumask_copy(d->old_domain, d->domain); + for_each_cpu(new_cpu, vector_searchmask) + per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq); + goto update; + +next_cpu: + /* + * We exclude the current @vector_cpumask from the requested + * @mask and try again with the next online cpu in the + * result. We cannot modify @mask, so we use @vector_cpumask + * as a temporary buffer here as it will be reassigned when + * calling apic->vector_allocation_domain() above. + */ + cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask); + cpumask_andnot(vector_cpumask, mask, searched_cpumask); + cpu = cpumask_first_and(vector_cpumask, cpu_online_mask); + continue; } - free_cpumask_var(tmp_mask); + return -ENOSPC; - return err; +update: + /* + * Exclude offline cpus from the cleanup mask and set the + * move_in_progress flag when the result is not empty. + */ + cpumask_and(d->old_domain, d->old_domain, cpu_online_mask); + d->move_in_progress = !cpumask_empty(d->old_domain); + d->cfg.vector = vector; + cpumask_copy(d->domain, vector_cpumask); +success: + /* + * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail + * as we already established, that mask & d->domain & cpu_online_mask + * is not empty. + */ + BUG_ON(apic->cpu_mask_to_apicid_and(mask, d->domain, + &d->cfg.dest_apicid)); + return 0; } -int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +static int assign_irq_vector(int irq, struct apic_chip_data *data, + const struct cpumask *mask) { int err; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, cfg, mask); + err = __assign_irq_vector(irq, data, mask); raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } -void clear_irq_vector(int irq, struct irq_cfg *cfg) +static int assign_irq_vector_policy(int irq, int node, + struct apic_chip_data *data, + struct irq_alloc_info *info) +{ + if (info && info->mask) + return assign_irq_vector(irq, data, info->mask); + if (node != NUMA_NO_NODE && + assign_irq_vector(irq, data, cpumask_of_node(node)) == 0) + return 0; + return assign_irq_vector(irq, data, apic->target_cpus()); +} + +static void clear_irq_vector(int irq, struct apic_chip_data *data) { + struct irq_desc *desc; int cpu, vector; - unsigned long flags; - raw_spin_lock_irqsave(&vector_lock, flags); - BUG_ON(!cfg->vector); + BUG_ON(!data->cfg.vector); - vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + vector = data->cfg.vector; + for_each_cpu_and(cpu, data->domain, cpu_online_mask) + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - cfg->vector = 0; - cpumask_clear(cfg->domain); + data->cfg.vector = 0; + cpumask_clear(data->domain); - if (likely(!cfg->move_in_progress)) { - raw_spin_unlock_irqrestore(&vector_lock, flags); + /* + * If move is in progress or the old_domain mask is not empty, + * i.e. the cleanup IPI has not been processed yet, we need to remove + * the old references to desc from all cpus vector tables. 
+ */ + if (!data->move_in_progress && cpumask_empty(data->old_domain)) return; - } - for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + desc = irq_to_desc(irq); + for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - if (per_cpu(vector_irq, cpu)[vector] != irq) + if (per_cpu(vector_irq, cpu)[vector] != desc) continue; - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; break; } } - cfg->move_in_progress = 0; - raw_spin_unlock_irqrestore(&vector_lock, flags); + data->move_in_progress = 0; +} + +void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask) +{ + memset(info, 0, sizeof(*info)); + info->mask = mask; +} + +void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) +{ + if (src) + *dst = *src; + else + memset(dst, 0, sizeof(*dst)); +} + +static void x86_vector_free_irqs(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct apic_chip_data *apic_data; + struct irq_data *irq_data; + unsigned long flags; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); + if (irq_data && irq_data->chip_data) { + raw_spin_lock_irqsave(&vector_lock, flags); + clear_irq_vector(virq + i, irq_data->chip_data); + apic_data = irq_data->chip_data; + irq_domain_reset_irq_data(irq_data); + raw_spin_unlock_irqrestore(&vector_lock, flags); + free_apic_chip_data(apic_data); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs()) + legacy_irq_data[virq + i] = NULL; +#endif + } + } } +static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_alloc_info *info = arg; + struct apic_chip_data *data; + struct irq_data *irq_data; + int i, err, node; + + if (disable_apic) + return -ENXIO; + + /* Currently vector allocator can't guarantee contiguous allocations */ + if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) + return -ENOSYS; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + BUG_ON(!irq_data); + node = irq_data_get_node(irq_data); +#ifdef CONFIG_X86_IO_APIC + if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i]) + data = legacy_irq_data[virq + i]; + else +#endif + data = alloc_apic_chip_data(node); + if (!data) { + err = -ENOMEM; + goto error; + } + + irq_data->chip = &lapic_controller; + irq_data->chip_data = data; + irq_data->hwirq = virq + i; + err = assign_irq_vector_policy(virq + i, node, data, info); + if (err) + goto error; + } + + return 0; + +error: + x86_vector_free_irqs(domain, virq, i + 1); + return err; +} + +static const struct irq_domain_ops x86_vector_domain_ops = { + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, +}; + int __init arch_probe_nr_irqs(void) { int nr; @@ -255,57 +393,92 @@ int __init arch_probe_nr_irqs(void) if (nr < nr_irqs) nr_irqs = nr; - return nr_legacy_irqs(); + /* + * We don't know if PIC is present at this point so we need to do + * probe() to get the right number of legacy IRQs. + */ + return legacy_pic->probe(); } +#ifdef CONFIG_X86_IO_APIC +static void init_legacy_irqs(void) +{ + int i, node = cpu_to_node(0); + struct apic_chip_data *data; + + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * ISA_IRQ_VECTOR(i) for all cpu's. 
+ */ + for (i = 0; i < nr_legacy_irqs(); i++) { + data = legacy_irq_data[i] = alloc_apic_chip_data(node); + BUG_ON(!data); + + data->cfg.vector = ISA_IRQ_VECTOR(i); + cpumask_setall(data->domain); + irq_set_chip_data(i, data); + } +} +#else +static void init_legacy_irqs(void) { } +#endif + int __init arch_early_irq_init(void) { + init_legacy_irqs(); + + x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops, + NULL); + BUG_ON(x86_vector_domain == NULL); + irq_set_default_host(x86_vector_domain); + + arch_init_msi_domain(x86_vector_domain); + arch_init_htirq_domain(x86_vector_domain); + + BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL)); + return arch_early_ioapic_init(); } +/* Initialize vector_irq on a new cpu */ static void __setup_vector_irq(int cpu) { - /* Initialize vector_irq on a new cpu */ + struct apic_chip_data *data; + struct irq_desc *desc; int irq, vector; - struct irq_cfg *cfg; - /* - * vector_lock will make sure that we don't run into irq vector - * assignments that might be happening on another cpu in parallel, - * while we setup our initial vector to irq mappings. - */ - raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ - for_each_active_irq(irq) { - cfg = irq_cfg(irq); - if (!cfg) - continue; + for_each_irq_desc(irq, desc) { + struct irq_data *idata = irq_desc_get_irq_data(desc); - if (!cpumask_test_cpu(cpu, cfg->domain)) + data = apic_chip_data(idata); + if (!data || !cpumask_test_cpu(cpu, data->domain)) continue; - vector = cfg->vector; - per_cpu(vector_irq, cpu)[vector] = irq; + vector = data->cfg.vector; + per_cpu(vector_irq, cpu)[vector] = desc; } /* Mark the free vectors */ for (vector = 0; vector < NR_VECTORS; ++vector) { - irq = per_cpu(vector_irq, cpu)[vector]; - if (irq <= VECTOR_UNDEFINED) + desc = per_cpu(vector_irq, cpu)[vector]; + if (IS_ERR_OR_NULL(desc)) continue; - cfg = irq_cfg(irq); - if (!cpumask_test_cpu(cpu, cfg->domain)) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + data = apic_chip_data(irq_desc_get_irq_data(desc)); + if (!cpumask_test_cpu(cpu, data->domain)) + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; } - raw_spin_unlock(&vector_lock); } /* - * Setup the vector to irq mappings. + * Setup the vector to irq mappings. Must be called with vector_lock held. */ void setup_vector_irq(int cpu) { int irq; + lockdep_assert_held(&vector_lock); /* * On most of the platforms, legacy PIC delivers the interrupts on the * boot cpu. 
But there are certain platforms where PIC interrupts are @@ -314,20 +487,20 @@ void setup_vector_irq(int cpu) * legacy vector to irq mapping: */ for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; + per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq); __setup_vector_irq(cpu); } -int apic_retrigger_irq(struct irq_data *data) +static int apic_retrigger_irq(struct irq_data *irq_data) { - struct irq_cfg *cfg = irqd_cfg(data); + struct apic_chip_data *data = apic_chip_data(irq_data); unsigned long flags; int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - cpu = cpumask_first_and(cfg->domain, cpu_online_mask); - apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); + cpu = cpumask_first_and(data->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -340,97 +513,101 @@ void apic_ack_edge(struct irq_data *data) ack_APIC_irq(); } -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. - */ -int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id) +static int apic_set_affinity(struct irq_data *irq_data, + const struct cpumask *dest, bool force) { - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int irq = data->irq; - int err; + struct apic_chip_data *data = irq_data->chip_data; + int err, irq = irq_data->irq; if (!config_enabled(CONFIG_SMP)) return -EPERM; - if (!cpumask_intersects(mask, cpu_online_mask)) + if (!cpumask_intersects(dest, cpu_online_mask)) return -EINVAL; - err = assign_irq_vector(irq, cfg, mask); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); - if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)) - pr_err("Failed to recover vector for irq %d\n", irq); - return err; - } + err = assign_irq_vector(irq, data, dest); + return err ? 
err : IRQ_SET_MASK_OK; +} - cpumask_copy(data->affinity, mask); +static struct irq_chip lapic_controller = { + .irq_ack = apic_ack_edge, + .irq_set_affinity = apic_set_affinity, + .irq_retrigger = apic_retrigger_irq, +}; - return 0; +#ifdef CONFIG_SMP +static void __send_cleanup_vector(struct apic_chip_data *data) +{ + raw_spin_lock(&vector_lock); + cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); + data->move_in_progress = 0; + if (!cpumask_empty(data->old_domain)) + apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR); + raw_spin_unlock(&vector_lock); } -#ifdef CONFIG_SMP void send_cleanup_vector(struct irq_cfg *cfg) { - cpumask_var_t cleanup_mask; - - if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { - unsigned int i; + struct apic_chip_data *data; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - apic->send_IPI_mask(cpumask_of(i), - IRQ_MOVE_CLEANUP_VECTOR); - } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - free_cpumask_var(cleanup_mask); - } - cfg->move_in_progress = 0; + data = container_of(cfg, struct apic_chip_data, cfg); + if (data->move_in_progress) + __send_cleanup_vector(data); } asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; - ack_APIC_irq(); - irq_enter(); - exit_idle(); + entering_ack_irq(); + + /* Prevent vectors vanishing under us */ + raw_spin_lock(&vector_lock); me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - int irq; - unsigned int irr; + struct apic_chip_data *data; struct irq_desc *desc; - struct irq_cfg *cfg; - - irq = __this_cpu_read(vector_irq[vector]); - - if (irq <= VECTOR_UNDEFINED) - continue; + unsigned int irr; - desc = irq_to_desc(irq); - if (!desc) + retry: + desc = __this_cpu_read(vector_irq[vector]); + if (IS_ERR_OR_NULL(desc)) continue; - cfg = irq_cfg(irq); - if (!cfg) - continue; + if (!raw_spin_trylock(&desc->lock)) { + raw_spin_unlock(&vector_lock); + cpu_relax(); + raw_spin_lock(&vector_lock); + goto retry; + } - raw_spin_lock(&desc->lock); + data = apic_chip_data(irq_desc_get_irq_data(desc)); + if (!data) + goto unlock; /* - * Check if the irq migration is in progress. If so, we - * haven't received the cleanup request yet for this irq. + * Nothing to cleanup if irq migration is in progress + * or this cpu is not set in the cleanup mask. */ - if (cfg->move_in_progress) + if (data->move_in_progress || + !cpumask_test_cpu(me, data->old_domain)) goto unlock; - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + /* + * We have two cases to handle here: + * 1) vector is unchanged but the target mask got reduced + * 2) vector and the target mask has changed + * + * #1 is obvious, but in #2 we have two vectors with the same + * irq descriptor: the old and the new vector. So we need to + * make sure that we only cleanup the old vector. The new + * vector has the current @vector number in the config and + * this cpu is part of the target mask. We better leave that + * one alone. 
+ */ + if (vector == data->cfg.vector && + cpumask_test_cpu(me, data->domain)) goto unlock; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -445,25 +622,29 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); goto unlock; } - __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); + cpumask_clear_cpu(me, data->old_domain); unlock: raw_spin_unlock(&desc->lock); } - irq_exit(); + raw_spin_unlock(&vector_lock); + + exiting_irq(); } static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { unsigned me; + struct apic_chip_data *data; - if (likely(!cfg->move_in_progress)) + data = container_of(cfg, struct apic_chip_data, cfg); + if (likely(!data->move_in_progress)) return; me = smp_processor_id(); - - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) - send_cleanup_vector(cfg); + if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain)) + __send_cleanup_vector(data); } void irq_complete_move(struct irq_cfg *cfg) @@ -471,49 +652,50 @@ void irq_complete_move(struct irq_cfg *cfg) __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); } -void irq_force_complete_move(int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - if (!cfg) - return; - - __irq_complete_move(cfg, cfg->vector); -} -#endif - /* - * Dynamic irq allocate and deallocation. Should be replaced by irq domains! + * Called with @desc->lock held and interrupts disabled. */ -int arch_setup_hwirq(unsigned int irq, int node) +void irq_force_complete_move(struct irq_desc *desc) { - struct irq_cfg *cfg; - unsigned long flags; - int ret; + struct irq_data *irqdata = irq_desc_get_irq_data(desc); + struct apic_chip_data *data = apic_chip_data(irqdata); + struct irq_cfg *cfg = data ? &data->cfg : NULL; - cfg = alloc_irq_cfg(irq, node); if (!cfg) - return -ENOMEM; - - raw_spin_lock_irqsave(&vector_lock, flags); - ret = __assign_irq_vector(irq, cfg, apic->target_cpus()); - raw_spin_unlock_irqrestore(&vector_lock, flags); - - if (!ret) - irq_set_chip_data(irq, cfg); - else - free_irq_cfg(irq, cfg); - return ret; -} + return; -void arch_teardown_hwirq(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); + __irq_complete_move(cfg, cfg->vector); - free_remapped_irq(irq); - clear_irq_vector(irq, cfg); - free_irq_cfg(irq, cfg); + /* + * This is tricky. If the cleanup of @data->old_domain has not been + * done yet, then the following setaffinity call will fail with + * -EBUSY. This can leave the interrupt in a stale state. + * + * The cleanup cannot make progress because we hold @desc->lock. So in + * case @data->old_domain is not yet cleaned up, we need to drop the + * lock and acquire it again. @desc cannot go away, because the + * hotplug code holds the sparse irq lock. + */ + raw_spin_lock(&vector_lock); + /* Clean out all offline cpus (including ourself) first. */ + cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); + while (!cpumask_empty(data->old_domain)) { + raw_spin_unlock(&vector_lock); + raw_spin_unlock(&desc->lock); + cpu_relax(); + raw_spin_lock(&desc->lock); + /* + * Reevaluate apic_chip_data. It might have been cleared after + * we dropped @desc->lock. 
+ */ + data = apic_chip_data(irqdata); + if (!data) + return; + raw_spin_lock(&vector_lock); + } + raw_spin_unlock(&vector_lock); } +#endif static void __init print_APIC_field(int base) { diff --git a/kernel/arch/x86/kernel/apic/x2apic_cluster.c b/kernel/arch/x86/kernel/apic/x2apic_cluster.c index ab3219b3f..cc8311c4d 100644 --- a/kernel/arch/x86/kernel/apic/x2apic_cluster.c +++ b/kernel/arch/x86/kernel/apic/x2apic_cluster.c @@ -182,7 +182,7 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) return notifier_from_errno(err); } -static struct notifier_block __refdata x2apic_cpu_notifier = { +static struct notifier_block x2apic_cpu_notifier = { .notifier_call = update_clusterinfo, }; @@ -272,7 +272,6 @@ static struct apic apic_x2apic_cluster = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .wait_for_init_deassert = false, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/kernel/arch/x86/kernel/apic/x2apic_phys.c b/kernel/arch/x86/kernel/apic/x2apic_phys.c index 6fae733e9..662e9150e 100644 --- a/kernel/arch/x86/kernel/apic/x2apic_phys.c +++ b/kernel/arch/x86/kernel/apic/x2apic_phys.c @@ -21,11 +21,13 @@ early_param("x2apic_phys", set_x2apic_phys_mode); static bool x2apic_fadt_phys(void) { +#ifdef CONFIG_ACPI if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { printk(KERN_DEBUG "System requires x2apic physical mode\n"); return true; } +#endif return false; } @@ -126,7 +128,6 @@ static struct apic apic_x2apic_phys = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_self = x2apic_send_IPI_self, - .wait_for_init_deassert = false, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/kernel/arch/x86/kernel/apic/x2apic_uv_x.c b/kernel/arch/x86/kernel/apic/x2apic_uv_x.c index 3d2fbca33..ad2afff02 100644 --- a/kernel/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/kernel/arch/x86/kernel/apic/x2apic_uv_x.c @@ -248,7 +248,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) APIC_DM_STARTUP; uv_write_global_mmr64(pnode, UVH_IPI_INT, val); - atomic_set(&init_deasserted, 1); return 0; } @@ -414,7 +413,6 @@ static struct apic __refdata apic_x2apic_uv_x = { .send_IPI_self = uv_send_IPI_self, .wakeup_secondary_cpu = uv_wakeup_secondary, - .wait_for_init_deassert = false, .inquire_remote_apic = NULL, .read = native_apic_msr_read, diff --git a/kernel/arch/x86/kernel/apm_32.c b/kernel/arch/x86/kernel/apm_32.c index 927ec9235..052c9c302 100644 --- a/kernel/arch/x86/kernel/apm_32.c +++ b/kernel/arch/x86/kernel/apm_32.c @@ -919,7 +919,7 @@ recalc: } else if (jiffies_since_last_check > idle_period) { unsigned int idle_percentage; - idle_percentage = stime - last_stime; + idle_percentage = cputime_to_jiffies(stime - last_stime); idle_percentage *= 100; idle_percentage /= jiffies_since_last_check; use_apm_idle = (idle_percentage > idle_threshold); diff --git a/kernel/arch/x86/kernel/asm-offsets.c b/kernel/arch/x86/kernel/asm-offsets.c index 5701b5075..b7954ddd6 100644 --- a/kernel/arch/x86/kernel/asm-offsets.c +++ b/kernel/arch/x86/kernel/asm-offsets.c @@ -42,6 +42,22 @@ void common(void) { OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + BLANK(); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext_32, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext_32, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext_32, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext_32, dx); + 
OFFSET(IA32_SIGCONTEXT_si, sigcontext_32, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext_32, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext_32, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext_32, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext_32, ip); + + BLANK(); + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); +#endif + #ifdef CONFIG_PARAVIRT BLANK(); OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); @@ -50,7 +66,9 @@ void common(void) { OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); +#ifdef CONFIG_X86_32 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); +#endif OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/kernel/arch/x86/kernel/asm-offsets_32.c b/kernel/arch/x86/kernel/asm-offsets_32.c index 47703aed7..6ce39025f 100644 --- a/kernel/arch/x86/kernel/asm-offsets_32.c +++ b/kernel/arch/x86/kernel/asm-offsets_32.c @@ -17,17 +17,6 @@ void foo(void); void foo(void) { - OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax); - OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx); - OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx); - OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx); - OFFSET(IA32_SIGCONTEXT_si, sigcontext, si); - OFFSET(IA32_SIGCONTEXT_di, sigcontext, di); - OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp); - OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp); - OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip); - BLANK(); - OFFSET(CPUINFO_x86, cpuinfo_x86, x86); OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); @@ -37,10 +26,6 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_sysenter_return, thread_info, sysenter_return); - OFFSET(TI_cpu, thread_info, cpu); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); @@ -60,9 +45,6 @@ void foo(void) OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); - BLANK(); - OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); diff --git a/kernel/arch/x86/kernel/asm-offsets_64.c b/kernel/arch/x86/kernel/asm-offsets_64.c index 5ce6f2da8..d8f42f902 100644 --- a/kernel/arch/x86/kernel/asm-offsets_64.c +++ b/kernel/arch/x86/kernel/asm-offsets_64.c @@ -29,27 +29,6 @@ int main(void) BLANK(); #endif -#ifdef CONFIG_IA32_EMULATION - OFFSET(TI_sysenter_return, thread_info, sysenter_return); - BLANK(); - -#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry) - ENTRY(ax); - ENTRY(bx); - ENTRY(cx); - ENTRY(dx); - ENTRY(si); - ENTRY(di); - ENTRY(bp); - ENTRY(sp); - ENTRY(ip); - BLANK(); -#undef ENTRY - - OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); - BLANK(); -#endif - #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(cx); @@ -87,7 +66,7 @@ int main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); - DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; diff --git a/kernel/arch/x86/kernel/bootflag.c b/kernel/arch/x86/kernel/bootflag.c index 5de7f4c56..52c8e3c77 100644 --- a/kernel/arch/x86/kernel/bootflag.c +++ b/kernel/arch/x86/kernel/bootflag.c @@ -98,4 +98,4 @@ static int __init sbf_init(void) return 0; } -module_init(sbf_init); 
+arch_initcall(sbf_init); diff --git a/kernel/arch/x86/kernel/check.c b/kernel/arch/x86/kernel/check.c index 83a799562..145863d4d 100644 --- a/kernel/arch/x86/kernel/check.c +++ b/kernel/arch/x86/kernel/check.c @@ -1,4 +1,4 @@ -#include <linux/module.h> +#include <linux/init.h> #include <linux/sched.h> #include <linux/kthread.h> #include <linux/workqueue.h> @@ -91,7 +91,8 @@ void __init setup_bios_corruption_check(void) corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) { + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) { start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), PAGE_SIZE, corruption_check_size); end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), @@ -162,6 +163,5 @@ static int start_periodic_check_for_corruption(void) schedule_delayed_work(&bios_check_work, 0); return 0; } - -module_init(start_periodic_check_for_corruption); +device_initcall(start_periodic_check_for_corruption); diff --git a/kernel/arch/x86/kernel/cpu/Makefile b/kernel/arch/x86/kernel/cpu/Makefile index 9bff68798..58031303e 100644 --- a/kernel/arch/x86/kernel/cpu/Makefile +++ b/kernel/arch/x86/kernel/cpu/Makefile @@ -41,11 +41,14 @@ obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ perf_event_intel_uncore_snbep.o \ perf_event_intel_uncore_nhmex.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/kernel/arch/x86/kernel/cpu/amd.c b/kernel/arch/x86/kernel/cpu/amd.c index e4cf63301..a8816b325 100644 --- a/kernel/arch/x86/kernel/cpu/amd.c +++ b/kernel/arch/x86/kernel/cpu/amd.c @@ -11,6 +11,7 @@ #include <asm/cpu.h> #include <asm/smp.h> #include <asm/pci-direct.h> +#include <asm/delay.h> #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> @@ -19,6 +20,13 @@ #include "cpu.h" +/* + * nodes_per_socket: Stores the number of nodes per socket. + * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX + * Node Identifiers[10:8] + */ +static u32 nodes_per_socket = 1; + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -107,7 +115,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) const int K6_BUG_LOOP = 1000000; int n; void (*f_vide)(void); - unsigned long d, d2; + u64 d, d2; printk(KERN_INFO "AMD K6 stepping B detected - "); @@ -118,10 +126,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c) n = K6_BUG_LOOP; f_vide = vide; - rdtscl(d); + d = rdtsc(); while (n--) f_vide(); - rdtscl(d2); + d2 = rdtsc(); d = d2-d; if (d > 20*K6_BUG_LOOP) @@ -288,10 +296,10 @@ static int nearby_node(int apicid) * Assumption: Number of cores in each internal node is the same. 
* (2) AMD processors supporting compute units */ -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { - u32 nodes, cores_per_cu = 1; + u32 cores_per_cu = 1; u8 node_id; int cpu = smp_processor_id(); @@ -300,7 +308,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - nodes = ((ecx >> 8) & 7) + 1; + nodes_per_socket = ((ecx >> 8) & 7) + 1; node_id = ecx & 7; /* get compute unit information */ @@ -311,18 +319,18 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes = ((value >> 3) & 7) + 1; + nodes_per_socket = ((value >> 3) & 7) + 1; node_id = value & 7; } else return; /* fixup multi-node processor information */ - if (nodes > 1) { + if (nodes_per_socket > 1) { u32 cores_per_node; u32 cus_per_node; set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cores_per_node = c->x86_max_cores / nodes; + cores_per_node = c->x86_max_cores / nodes_per_socket; cus_per_node = cores_per_node / cores_per_cu; /* store NodeID, use llc_shared_map to store sibling info */ @@ -341,9 +349,10 @@ static void amd_get_topology(struct cpuinfo_x86 *c) */ static void amd_detect_cmp(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); + unsigned int socket_id, core_complex_id; bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ @@ -353,6 +362,18 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; amd_get_topology(c); + + /* + * Fix percpu cpu_llc_id here as LLC topology is different + * for Fam17h systems. + */ + if (c->x86 != 0x17 || !cpuid_edx(0x80000006)) + return; + + socket_id = (c->apicid >> bits) - 1; + core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3; + + per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id; #endif } @@ -366,6 +387,12 @@ u16 amd_get_nb_id(int cpu) } EXPORT_SYMBOL_GPL(amd_get_nb_id); +u32 amd_get_nodes_per_socket(void) +{ + return nodes_per_socket; +} +EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket); + static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA @@ -420,7 +447,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) static void early_init_amd_mc(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned bits, ecx; /* Multi core CPU? */ @@ -493,6 +520,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) /* A random value per boot for bit slice [12:upper_bit) */ va_align.bits = get_random_int() & va_align.mask; } + + if (cpu_has(c, X86_FEATURE_MWAITX)) + use_mwaitx_delay(); } static void early_init_amd(struct cpuinfo_x86 *c) @@ -520,8 +550,16 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) - /* check CPU config space for extended APIC ID */ - if (cpu_has_apic && c->x86 >= 0xf) { + /* + * ApicID can always be treated as an 8-bit value for AMD APIC versions + * >= 0x10, but even old K8s came out of reset with version 0x10. So, we + * can safely set X86_FEATURE_EXTD_APICID unconditionally for families + * after 16h. 
+ */ + if (cpu_has_apic && c->x86 > 0x16) { + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } else if (cpu_has_apic && c->x86 >= 0xf) { + /* check CPU config space for extended APIC ID */ unsigned int val; val = read_pci_config(0, 24, 0, 0x68); if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) diff --git a/kernel/arch/x86/kernel/cpu/bugs.c b/kernel/arch/x86/kernel/cpu/bugs.c index 03445346e..bd17db15a 100644 --- a/kernel/arch/x86/kernel/cpu/bugs.c +++ b/kernel/arch/x86/kernel/cpu/bugs.c @@ -12,57 +12,11 @@ #include <asm/bugs.h> #include <asm/processor.h> #include <asm/processor-flags.h> -#include <asm/i387.h> +#include <asm/fpu/internal.h> #include <asm/msr.h> #include <asm/paravirt.h> #include <asm/alternative.h> -static double __initdata x = 4195835.0; -static double __initdata y = 3145727.0; - -/* - * This used to check for exceptions.. - * However, it turns out that to support that, - * the XMM trap handlers basically had to - * be buggy. So let's have a correct XMM trap - * handler, and forget about printing out - * some status at boot. - * - * We should really only care about bugs here - * anyway. Not features. - */ -static void __init check_fpu(void) -{ - s32 fdiv_bug; - - kernel_fpu_begin(); - - /* - * trap_init() enabled FXSR and company _before_ testing for FP - * problems here. - * - * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug - */ - __asm__("fninit\n\t" - "fldl %1\n\t" - "fdivl %2\n\t" - "fmull %2\n\t" - "fldl %1\n\t" - "fsubp %%st,%%st(1)\n\t" - "fistpl %0\n\t" - "fwait\n\t" - "fninit" - : "=m" (*&fdiv_bug) - : "m" (*&x), "m" (*&y)); - - kernel_fpu_end(); - - if (fdiv_bug) { - set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); - pr_warn("Hmm, FPU with FDIV bug\n"); - } -} - void __init check_bugs(void) { identify_boot_cpu(); @@ -85,10 +39,5 @@ void __init check_bugs(void) '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); - /* - * kernel_fpu_begin/end() in check_fpu() relies on the patched - * alternative instructions. 
- */ - if (cpu_has_fpu) - check_fpu(); + fpu__init_check_bugs(); } diff --git a/kernel/arch/x86/kernel/cpu/common.c b/kernel/arch/x86/kernel/cpu/common.c index 205e0f3df..c2b7522cb 100644 --- a/kernel/arch/x86/kernel/cpu/common.c +++ b/kernel/arch/x86/kernel/cpu/common.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/percpu.h> #include <linux/string.h> +#include <linux/ctype.h> #include <linux/delay.h> #include <linux/sched.h> #include <linux/init.h> @@ -12,6 +13,7 @@ #include <linux/kgdb.h> #include <linux/smp.h> #include <linux/io.h> +#include <linux/syscore_ops.h> #include <asm/stackprotector.h> #include <asm/perf_event.h> @@ -31,8 +33,7 @@ #include <asm/setup.h> #include <asm/apic.h> #include <asm/desc.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/mtrr.h> #include <linux/numa.h> #include <asm/asm.h> @@ -145,32 +146,21 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); -static int __init x86_xsave_setup(char *s) +static int __init x86_mpx_setup(char *s) { + /* require an exact match without trailing characters */ if (strlen(s)) return 0; - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - return 1; -} -__setup("noxsave", x86_xsave_setup); -static int __init x86_xsaveopt_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - return 1; -} -__setup("noxsaveopt", x86_xsaveopt_setup); + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_MPX)) + return 1; -static int __init x86_xsaves_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_XSAVES); + setup_clear_cpu_cap(X86_FEATURE_MPX); + pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n"); return 1; } -__setup("noxsaves", x86_xsaves_setup); +__setup("nompx", x86_mpx_setup); #ifdef CONFIG_X86_32 static int cachesize_override = -1; @@ -183,14 +173,6 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -static int __init x86_fxsr_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_XMM); - return 1; -} -__setup("nofxsr", x86_fxsr_setup); - static int __init x86_sep_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_SEP); @@ -291,10 +273,9 @@ __setup("nosmap", setup_disable_smap); static __always_inline void setup_smap(struct cpuinfo_x86 *c) { - unsigned long eflags; + unsigned long eflags = native_save_fl(); /* This should have been cleared long ago */ - raw_local_save_flags(eflags); BUG_ON(eflags & X86_EFLAGS_AC); if (cpu_has(c, X86_FEATURE_SMAP)) { @@ -419,7 +400,7 @@ static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; static void get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; - char *p, *q; + char *p, *q, *s; if (c->extended_cpuid_level < 0x80000004) return; @@ -430,19 +411,21 @@ static void get_model_name(struct cpuinfo_x86 *c) cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); c->x86_model_id[48] = 0; - /* - * Intel chips right-justify this string for some dumb reason; - * undo that brain damage: - */ - p = q = &c->x86_model_id[0]; + /* Trim whitespace */ + p = q = s = &c->x86_model_id[0]; + while (*p == ' ') p++; - if (p != q) { - while (*p) - *q++ = *p++; - while (q <= &c->x86_model_id[48]) - *q++ = '\0'; /* Zero-pad the rest */ + + while (*p) { + /* Note the last non-whitespace index */ + if 
(!isspace(*p)) + s = q; + + *q++ = *p++; } + + *(s + 1) = '\0'; } void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) @@ -508,7 +491,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) void detect_ht(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; int index_msb, core_bits; static bool printed; @@ -686,6 +669,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + c->x86_capability[13] = cpuid_ebx(0x80000008); } #ifdef CONFIG_X86_32 else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) @@ -759,7 +743,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); get_cpu_cap(c); - fpu_detect(c); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -771,6 +754,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_bsp_init(c); setup_force_cpu_cap(X86_FEATURE_ALWAYS); + fpu__init_system(c); } void __init early_cpu_init(void) @@ -844,7 +828,7 @@ static void generic_identify(struct cpuinfo_x86 *c) if (c->cpuid_level >= 0x00000001) { c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); # else c->apicid = c->initial_apicid; @@ -1026,7 +1010,7 @@ void enable_sep_cpu(void) (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); out: put_cpu(); @@ -1122,14 +1106,14 @@ void print_cpu_info(struct cpuinfo_x86 *c) printk(KERN_CONT "%s ", vendor); if (c->x86_model_id[0]) - printk(KERN_CONT "%s", strim(c->x86_model_id)); + printk(KERN_CONT "%s", c->x86_model_id); else printk(KERN_CONT "%d86", c->x86); - printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model); + printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model); if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask); + printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask); else printk(KERN_CONT ")\n"); @@ -1155,10 +1139,6 @@ static __init int setup_disablecpuid(char *arg) } __setup("clearcpuid=", setup_disablecpuid); -DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(kernel_stack); - #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, @@ -1183,8 +1163,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); - /* * Special IST stacks which the CPU switches to when it calls * an IST-marked descriptor entry. Up to 7 stacks (hardware @@ -1208,10 +1186,10 @@ void syscall_init(void) * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, ia32_cstar_target); + wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. 
@@ -1220,9 +1198,9 @@ void syscall_init(void) */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else - wrmsrl(MSR_CSTAR, ignore_sysret); + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); @@ -1275,7 +1253,6 @@ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); /* * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find @@ -1439,7 +1416,7 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu_init(); + fpu__init_cpu(); if (is_uv_system()) uv_cpu_init(); @@ -1495,7 +1472,7 @@ void cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - fpu_init(); + fpu__init_cpu(); } #endif @@ -1512,3 +1489,20 @@ inline bool __static_cpu_has_safe(u16 bit) return boot_cpu_has(bit); } EXPORT_SYMBOL_GPL(__static_cpu_has_safe); + +static void bsp_resume(void) +{ + if (this_cpu->c_bsp_resume) + this_cpu->c_bsp_resume(&boot_cpu_data); +} + +static struct syscore_ops cpu_syscore_ops = { + .resume = bsp_resume, +}; + +static int __init init_cpu_syscore(void) +{ + register_syscore_ops(&cpu_syscore_ops); + return 0; +} +core_initcall(init_cpu_syscore); diff --git a/kernel/arch/x86/kernel/cpu/cpu.h b/kernel/arch/x86/kernel/cpu/cpu.h index c37dc37e8..2584265d4 100644 --- a/kernel/arch/x86/kernel/cpu/cpu.h +++ b/kernel/arch/x86/kernel/cpu/cpu.h @@ -13,6 +13,7 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); void (*c_detect_tlb)(struct cpuinfo_x86 *); + void (*c_bsp_resume)(struct cpuinfo_x86 *); int c_x86_vendor; #ifdef CONFIG_X86_32 /* Optional vendor specific routine to obtain the cache size. */ diff --git a/kernel/arch/x86/kernel/cpu/intel.c b/kernel/arch/x86/kernel/cpu/intel.c index 50163fa90..209ac1e7d 100644 --- a/kernel/arch/x86/kernel/cpu/intel.c +++ b/kernel/arch/x86/kernel/cpu/intel.c @@ -97,6 +97,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x27: /* Penwell */ case 0x35: /* Cloverview */ + case 0x4a: /* Merrifield */ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); break; default: @@ -371,6 +372,36 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c) } } +static void init_intel_energy_perf(struct cpuinfo_x86 *c) +{ + u64 epb; + + /* + * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized. + * (x86_energy_perf_policy(8) is available to change it at run-time.) 
+ */ + if (!cpu_has(c, X86_FEATURE_EPB)) + return; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) + return; + + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); + epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); +} + +static void intel_bsp_resume(struct cpuinfo_x86 *c) +{ + /* + * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume, + * so reinitialize it properly like during bootup: + */ + init_intel_energy_perf(c); +} + static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -478,21 +509,7 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); - /* - * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not. - * x86_energy_perf_policy(8) is available to change it at run-time - */ - if (cpu_has(c, X86_FEATURE_EPB)) { - u64 epb; - - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) { - pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); - pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); - epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - } - } + init_intel_energy_perf(c); } #ifdef CONFIG_X86_32 @@ -747,6 +764,7 @@ static const struct cpu_dev intel_cpu_dev = { .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, + .c_bsp_resume = intel_bsp_resume, .c_x86_vendor = X86_VENDOR_INTEL, }; diff --git a/kernel/arch/x86/kernel/cpu/intel_cacheinfo.c b/kernel/arch/x86/kernel/cpu/intel_cacheinfo.c index edcb0e28c..e38d338a6 100644 --- a/kernel/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/kernel/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -157,7 +157,7 @@ struct _cpuid4_info_regs { struct amd_northbridge *nb; }; -unsigned short num_cache_leaves; +static unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: @@ -326,7 +326,7 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb) * * @returns: the disabled index if used or negative value if slot free. 
*/ -int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) { unsigned int reg = 0; @@ -403,8 +403,8 @@ static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, * * @return: 0 on success, error status on failure */ -int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, - unsigned long index) +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned slot, unsigned long index) { int ret = 0; @@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP unsigned int cpu = c->cpu_index; #endif @@ -773,19 +773,19 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l2_id; #endif } if (new_l3) { l3 = new_l3; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l3_id; #endif } -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP /* * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in * turns means that the only possibility is SMT (as indicated in diff --git a/kernel/arch/x86/kernel/cpu/intel_pt.h b/kernel/arch/x86/kernel/cpu/intel_pt.h index 1c338b0eb..336878a5d 100644 --- a/kernel/arch/x86/kernel/cpu/intel_pt.h +++ b/kernel/arch/x86/kernel/cpu/intel_pt.h @@ -25,32 +25,11 @@ */ #define TOPA_PMI_MARGIN 512 -/* - * Table of Physical Addresses bits - */ -enum topa_sz { - TOPA_4K = 0, - TOPA_8K, - TOPA_16K, - TOPA_32K, - TOPA_64K, - TOPA_128K, - TOPA_256K, - TOPA_512K, - TOPA_1MB, - TOPA_2MB, - TOPA_4MB, - TOPA_8MB, - TOPA_16MB, - TOPA_32MB, - TOPA_64MB, - TOPA_128MB, - TOPA_SZ_END, -}; +#define TOPA_SHIFT 12 -static inline unsigned int sizes(enum topa_sz tsz) +static inline unsigned int sizes(unsigned int tsz) { - return 1 << (tsz + 12); + return 1 << (tsz + TOPA_SHIFT); }; struct topa_entry { @@ -66,20 +45,26 @@ struct topa_entry { u64 rsvd4 : 16; }; -#define TOPA_SHIFT 12 -#define PT_CPUID_LEAVES 2 +#define PT_CPUID_LEAVES 2 +#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ enum pt_capabilities { PT_CAP_max_subleaf = 0, PT_CAP_cr3_filtering, + PT_CAP_psb_cyc, + PT_CAP_mtc, PT_CAP_topa_output, PT_CAP_topa_multiple_entries, + PT_CAP_single_range_output, PT_CAP_payloads_lip, + PT_CAP_mtc_periods, + PT_CAP_cycle_thresholds, + PT_CAP_psb_periods, }; struct pt_pmu { struct pmu pmu; - u32 caps[4 * PT_CPUID_LEAVES]; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; }; /** diff --git a/kernel/arch/x86/kernel/cpu/mcheck/Makefile b/kernel/arch/x86/kernel/cpu/mcheck/Makefile index bb34b03af..a3311c886 100644 --- a/kernel/arch/x86/kernel/cpu/mcheck/Makefile +++ b/kernel/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,4 +1,4 @@ -obj-y = mce.o mce-severity.o +obj-y = mce.o mce-severity.o mce-genpool.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce-apei.c b/kernel/arch/x86/kernel/cpu/mcheck/mce-apei.c index a1aef9533..34c89a3e8 100644 --- a/kernel/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/kernel/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -57,7 +57,6 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) m.addr = mem_err->physical_addr; 
mce_log(&m); - mce_notify_irq(); } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/kernel/arch/x86/kernel/cpu/mcheck/mce-genpool.c new file mode 100644 index 000000000..0a850100c --- /dev/null +++ b/kernel/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -0,0 +1,99 @@ +/* + * MCE event pool management in MCE context + * + * Copyright (C) 2015 Intel Corp. + * Author: Chen, Gong <gong.chen@linux.intel.com> + * + * This file is licensed under GPLv2. + */ +#include <linux/smp.h> +#include <linux/mm.h> +#include <linux/genalloc.h> +#include <linux/llist.h> +#include "mce-internal.h" + +/* + * printk() is not safe in MCE context. This is a lock-less memory allocator + * used to save error information organized in a lock-less list. + * + * This memory pool is only to be used to save MCE records in MCE context. + * MCE events are rare, so a fixed size memory pool should be enough. Use + * 2 pages to save MCE events for now (~80 MCE records at most). + */ +#define MCE_POOLSZ (2 * PAGE_SIZE) + +static struct gen_pool *mce_evt_pool; +static LLIST_HEAD(mce_event_llist); +static char gen_pool_buf[MCE_POOLSZ]; + +void mce_gen_pool_process(void) +{ + struct llist_node *head; + struct mce_evt_llist *node; + struct mce *mce; + + head = llist_del_all(&mce_event_llist); + if (!head) + return; + + head = llist_reverse_order(head); + llist_for_each_entry(node, head, llnode) { + mce = &node->mce; + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); + } +} + +bool mce_gen_pool_empty(void) +{ + return llist_empty(&mce_event_llist); +} + +int mce_gen_pool_add(struct mce *mce) +{ + struct mce_evt_llist *node; + + if (!mce_evt_pool) + return -EINVAL; + + node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); + if (!node) { + pr_warn_ratelimited("MCE records pool full!\n"); + return -ENOMEM; + } + + memcpy(&node->mce, mce, sizeof(*mce)); + llist_add(&node->llnode, &mce_event_llist); + + return 0; +} + +static int mce_gen_pool_create(void) +{ + struct gen_pool *tmpp; + int ret = -ENOMEM; + + tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); + if (!tmpp) + goto out; + + ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); + if (ret) { + gen_pool_destroy(tmpp); + goto out; + } + + mce_evt_pool = tmpp; + +out: + return ret; +} + +int mce_gen_pool_init(void) +{ + /* Just init mce_gen_pool once. 
*/ + if (mce_evt_pool) + return 0; + + return mce_gen_pool_create(); +} diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce-internal.h b/kernel/arch/x86/kernel/cpu/mcheck/mce-internal.h index fe32074b8..547720efd 100644 --- a/kernel/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/kernel/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -13,6 +13,8 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +extern struct atomic_notifier_head x86_mce_decoder_chain; + #define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ @@ -24,6 +26,16 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; +struct mce_evt_llist { + struct llist_node llnode; + struct mce mce; +}; + +void mce_gen_pool_process(void); +bool mce_gen_pool_empty(void); +int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_init(void); + extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); @@ -67,3 +79,5 @@ static inline int apei_clear_mce(u64 record_id) return -EINVAL; } #endif + +void mce_inject_log(struct mce *m); diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce.c b/kernel/arch/x86/kernel/cpu/mcheck/mce.c index 9d46f9a13..a080b4939 100644 --- a/kernel/arch/x86/kernel/cpu/mcheck/mce.c +++ b/kernel/arch/x86/kernel/cpu/mcheck/mce.c @@ -54,10 +54,13 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); -#define rcu_dereference_check_mce(p) \ - rcu_dereference_index_check((p), \ - rcu_read_lock_sched_held() || \ - lockdep_is_held(&mce_chrdev_read_mutex)) +#define mce_log_get_idx_check(p) \ +({ \ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious mce_log_get_idx_check() usage"); \ + smp_load_acquire(&(p)); \ +}) #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -109,22 +112,24 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { */ mce_banks_t mce_banks_ce_disabled; -static DEFINE_PER_CPU(struct work_struct, mce_work); +static struct work_struct mce_work; +static struct irq_work mce_irq_work; static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); +static int mce_usable_address(struct mce *m); /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. 
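/*
 * Minimal sketch, illustrative and not part of this patch: how a consumer
 * (e.g. an EDAC decoder) attaches to this chain. With the change below the
 * callback is invoked from the mce_work workqueue that drains the new
 * event pool rather than directly from mce_log(); the callback signature
 * matches the srao_decode_notifier() added further down.
 */
#include <linux/notifier.h>
#include <linux/printk.h>
#include <asm/mce.h>

static int example_mce_decoder(struct notifier_block *nb,
			       unsigned long val, void *data)
{
	struct mce *m = data;

	if (!m)
		return NOTIFY_DONE;

	/* Decode/print the record; keep it cheap, the chain is atomic. */
	pr_info("CPU%d bank %d status %#llx addr %#llx\n",
		m->extcpu, m->bank, m->status, m->addr);

	return NOTIFY_OK;
}

static struct notifier_block example_mce_nb = {
	.notifier_call	= example_mce_decoder,
	/* leave .priority at 0 so the SRAO notifier stays first */
};

/* e.g. from a driver init path: mce_register_decode_chain(&example_mce_nb); */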
*/ -static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - rdtscll(m->tsc); + m->tsc = rdtsc(); /* We hope get_seconds stays lockless */ m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; @@ -156,12 +161,13 @@ void mce_log(struct mce *mce) /* Emit the trace record: */ trace_mce_record(mce); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + if (!mce_gen_pool_add(mce)) + irq_work_queue(&mce_irq_work); mce->finished = 0; wmb(); for (;;) { - entry = rcu_dereference_check_mce(mcelog.next); + entry = mce_log_get_idx_check(mcelog.next); for (;;) { /* @@ -195,48 +201,23 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } -static void drain_mcelog_buffer(void) +void mce_inject_log(struct mce *m) { - unsigned int next, i, prev = 0; - - next = ACCESS_ONCE(mcelog.next); - - do { - struct mce *m; - - /* drain what was logged during boot */ - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - unsigned retries = 1; - - m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2*retries)) - retries++; - - cpu_relax(); - - if (!m->finished && retries >= 4) { - pr_err("skipping error being logged currently!\n"); - break; - } - } - smp_rmb(); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); - } - - memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); + mutex_lock(&mce_chrdev_read_mutex); + mce_log(m); + mutex_unlock(&mce_chrdev_read_mutex); } +EXPORT_SYMBOL_GPL(mce_inject_log); +static struct notifier_block mce_srao_nb; void mce_register_decode_chain(struct notifier_block *nb) { + /* Ensure SRAO notifier has the highest priority in the decode chain. */ + if (nb != &mce_srao_nb && nb->priority == INT_MAX) + nb->priority -= 1; + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); - drain_mcelog_buffer(); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); @@ -460,61 +441,6 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) } } -/* - * Simple lockless ring to communicate PFNs from the exception handler with the - * process context work function. This is vastly simplified because there's - * only a single reader and a single writer. 
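/*
 * Summary of the new logging path, for orientation (it spans the hunks
 * above and the mce_schedule_work()/mce_process_work() changes further
 * down): in MCE context mce_log() now only copies the record into the
 * lock-less gen_pool/llist via mce_gen_pool_add() and queues mce_irq_work;
 * the record is replayed to the x86_mce_decoder_chain notifiers later,
 * from the mce_work workqueue through mce_gen_pool_process(). That is also
 * why drain_mcelog_buffer() can go away: early-boot records simply wait in
 * the pool until mcheck_late_init() schedules the first drain.
 */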
- */ -#define MCE_RING_SIZE 16 /* we use one entry less */ - -struct mce_ring { - unsigned short start; - unsigned short end; - unsigned long ring[MCE_RING_SIZE]; -}; -static DEFINE_PER_CPU(struct mce_ring, mce_ring); - -/* Runs with CPU affinity in workqueue */ -static int mce_ring_empty(void) -{ - struct mce_ring *r = this_cpu_ptr(&mce_ring); - - return r->start == r->end; -} - -static int mce_ring_get(unsigned long *pfn) -{ - struct mce_ring *r; - int ret = 0; - - *pfn = 0; - get_cpu(); - r = this_cpu_ptr(&mce_ring); - if (r->start == r->end) - goto out; - *pfn = r->ring[r->start]; - r->start = (r->start + 1) % MCE_RING_SIZE; - ret = 1; -out: - put_cpu(); - return ret; -} - -/* Always runs in MCE context with preempt off */ -static int mce_ring_add(unsigned long pfn) -{ - struct mce_ring *r = this_cpu_ptr(&mce_ring); - unsigned next; - - next = (r->end + 1) % MCE_RING_SIZE; - if (next == r->start) - return -1; - r->ring[r->end] = pfn; - wmb(); - r->end = next; - return 0; -} - int mce_available(struct cpuinfo_x86 *c) { if (mca_cfg.disabled) @@ -524,12 +450,10 @@ int mce_available(struct cpuinfo_x86 *c) static void mce_schedule_work(void) { - if (!mce_ring_empty()) - schedule_work(this_cpu_ptr(&mce_work)); + if (!mce_gen_pool_empty() && keventd_up()) + schedule_work(&mce_work); } -static DEFINE_PER_CPU(struct irq_work, mce_irq_work); - static void mce_irq_work_cb(struct irq_work *entry) { mce_notify_irq(); @@ -550,9 +474,30 @@ static void mce_report_event(struct pt_regs *regs) return; } - irq_work_queue(this_cpu_ptr(&mce_irq_work)); + irq_work_queue(&mce_irq_work); } +static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned long pfn; + + if (!mce) + return NOTIFY_DONE; + + if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) { + pfn = mce->addr >> PAGE_SHIFT; + memory_failure(pfn, MCE_VECTOR, 0); + } + + return NOTIFY_OK; +} +static struct notifier_block mce_srao_nb = { + .notifier_call = srao_decode_notifier, + .priority = INT_MAX, +}; + /* * Read ADDR and MISC registers. */ @@ -671,8 +616,11 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { if (m.status & MCI_STATUS_ADDRV) { - mce_ring_add(m.addr >> PAGE_SHIFT); - mce_schedule_work(); + m.severity = severity; + m.usable_addr = mce_usable_address(&m); + + if (!mce_gen_pool_add(&m)) + mce_schedule_work(); } } @@ -1028,7 +976,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) { struct mca_config *cfg = &mca_cfg; struct mce m, *final; - enum ctx_state prev_state; int i; int worst = 0; int severity; @@ -1052,8 +999,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) char *msg = "Unknown"; u64 recover_paddr = ~0ull; int flags = MF_ACTION_REQUIRED; + int lmce = 0; - prev_state = ist_enter(regs); + /* If this CPU is offline, just bail out. */ + if (cpu_is_offline(smp_processor_id())) { + u64 mcgstatus; + + mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (mcgstatus & MCG_STATUS_RIPV) { + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + return; + } + } + + ist_enter(regs); this_cpu_inc(mce_exception_count); @@ -1079,11 +1038,20 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; /* - * Go through all the banks in exclusion of the other CPUs. - * This way we don't report duplicated events on shared banks - * because the first one to see it will clear it. 
+ * Check if this MCE is signaled to only this logical processor */ - order = mce_start(&no_way_out); + if (m.mcgstatus & MCG_STATUS_LMCES) + lmce = 1; + else { + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. + * If this is a Local MCE, then no need to perform rendezvous. + */ + order = mce_start(&no_way_out); + } + for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); if (!test_bit(i, valid_banks)) @@ -1132,15 +1100,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_read_aux(&m, i); - /* - * Action optional error. Queue address for later processing. - * When the ring overflows we just ignore the AO error. - * RED-PEN add some logging mechanism when - * usable_address or mce_add_ring fails. - * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 - */ - if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) - mce_ring_add(m.addr >> PAGE_SHIFT); + /* assuming valid severity level != 0 */ + m.severity = severity; + m.usable_addr = mce_usable_address(&m); mce_log(&m); @@ -1160,8 +1122,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Do most of the synchronization with other CPUs. * When there's any problem use only local no_way_out state. */ - if (mce_end(order) < 0) - no_way_out = worst >= MCE_PANIC_SEVERITY; + if (!lmce) { + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } else { + /* + * Local MCE skipped calling mce_reign() + * If we found a fatal error, we need to panic here. + */ + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + mce_panic("Machine check from unknown source", + NULL, NULL); + } /* * At insane "tolerant" levels we take no action. Otherwise @@ -1206,7 +1178,7 @@ out: local_irq_disable(); ist_end_non_atomic(); done: - ist_exit(regs, prev_state); + ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1226,14 +1198,11 @@ int memory_failure(unsigned long pfn, int vector, int flags) /* * Action optional processing happens here (picking up * from the list of faulting pages that do_machine_check() - * placed into the "ring"). + * placed into the genpool). 
*/ static void mce_process_work(struct work_struct *dummy) { - unsigned long pfn; - - while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR, 0); + mce_gen_pool_process(); } #ifdef CONFIG_X86_MCE_INTEL @@ -1655,6 +1624,8 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) winchip_mcheck_init(c); return 1; break; + default: + return 0; } return 0; @@ -1667,9 +1638,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_intel_feature_init(c); mce_adjust_timer = cmci_intel_adjust_timer; break; - case X86_VENDOR_AMD: + + case X86_VENDOR_AMD: { + u32 ebx = cpuid_ebx(0x80000007); + mce_amd_feature_init(c); - mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; + mce_flags.overflow_recov = !!(ebx & BIT(0)); + mce_flags.succor = !!(ebx & BIT(1)); + mce_flags.smca = !!(ebx & BIT(3)); + + break; + } + + default: + break; + } +} + +static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_clear(c); break; default: break; @@ -1730,13 +1720,36 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) return; } + if (mce_gen_pool_init()) { + mca_cfg.disabled = true; + pr_emerg("Couldn't allocate MCE records pool!\n"); + return; + } + machine_check_vector = do_machine_check; __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_timer(); - INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work); - init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb); +} + +/* + * Called for each booted CPU to clear some machine checks opt-ins + */ +void mcheck_cpu_clear(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return; + + if (!mce_available(c)) + return; + + /* + * Possibly to clear general settings generic to x86 + * __mcheck_cpu_clear_generic(c); + */ + __mcheck_cpu_clear_vendor(c); + } /* @@ -1783,7 +1796,7 @@ static void collect_tscs(void *data) { unsigned long *cpu_tsc = (unsigned long *)data; - rdtscll(cpu_tsc[smp_processor_id()]); + cpu_tsc[smp_processor_id()] = rdtsc(); } static int mce_apei_read_done; @@ -1849,7 +1862,7 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, goto out; } - next = rcu_dereference_check_mce(mcelog.next); + next = mce_log_get_idx_check(mcelog.next); /* Only supports full reads right now */ err = -EINVAL; @@ -1915,7 +1928,7 @@ out: static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_chrdev_wait, wait); - if (rcu_access_index(mcelog.next)) + if (READ_ONCE(mcelog.next)) return POLLIN | POLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return POLLIN | POLLRDNORM; @@ -1960,8 +1973,8 @@ void register_mce_write_callback(ssize_t (*fn)(struct file *filp, } EXPORT_SYMBOL_GPL(register_mce_write_callback); -ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) +static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) { if (mce_write) return mce_write(filp, ubuf, usize, off); @@ -2007,6 +2020,7 @@ void mce_disable_bank(int bank) /* * mce=off Disables machine check * mce=no_cmci Disables CMCI + * mce=no_lmce Disables LMCE * mce=dont_log_ce Clears corrected events silently, no log created for CEs. * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 
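/*
 * Illustrative boot-parameter examples (hypothetical values): "mce=no_lmce"
 * keeps the broadcast rendezvous even on LMCE-capable CPUs, while
 * "mce=2,5000" is parsed by the reworked get_option() call below as
 * tolerant=2 with a 5000 microsecond monarch timeout; a bare "mce=2" now
 * simply sets the tolerance level.
 */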
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) @@ -2030,6 +2044,8 @@ static int __init mcheck_enable(char *str) cfg->disabled = true; else if (!strcmp(str, "no_cmci")) cfg->cmci_disabled = true; + else if (!strcmp(str, "no_lmce")) + cfg->lmce_disabled = true; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) @@ -2039,11 +2055,8 @@ static int __init mcheck_enable(char *str) else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; else if (isdigit(str[0])) { - get_option(&str, &(cfg->tolerant)); - if (*str == ',') { - ++str; + if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); - } } else { pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; @@ -2055,8 +2068,12 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mce_register_decode_chain(&mce_srao_nb); mcheck_vendor_init_severity(); + INIT_WORK(&mce_work, mce_process_work); + init_irq_work(&mce_irq_work, mce_irq_work_cb); + return 0; } @@ -2068,7 +2085,7 @@ int __init mcheck_init(void) * Disable machine checks on suspend and shutdown. We can't really handle * them later. */ -static int mce_disable_error_reporting(void) +static void mce_disable_error_reporting(void) { int i; @@ -2078,17 +2095,32 @@ static int mce_disable_error_reporting(void) if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), 0); } - return 0; + return; +} + +static void vendor_disable_error_reporting(void) +{ + /* + * Don't clear on Intel CPUs. Some of these MSRs are socket-wide. + * Disabling them for just a single offlined CPU is bad, since it will + * inhibit reporting for all shared resources on the socket like the + * last level cache (LLC), the integrated memory controller (iMC), etc. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return; + + mce_disable_error_reporting(); } static int mce_syscore_suspend(void) { - return mce_disable_error_reporting(); + vendor_disable_error_reporting(); + return 0; } static void mce_syscore_shutdown(void) { - mce_disable_error_reporting(); + vendor_disable_error_reporting(); } /* @@ -2368,7 +2400,6 @@ static void mce_device_remove(unsigned int cpu) static void mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; - int i; if (!mce_available(raw_cpu_ptr(&cpu_info))) return; @@ -2377,12 +2408,8 @@ static void mce_disable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); - for (i = 0; i < mca_cfg.banks; i++) { - struct mce_bank *b = &mce_banks[i]; - if (b->init) - wrmsrl(MSR_IA32_MCx_CTL(i), 0); - } + vendor_disable_error_reporting(); } static void mce_reenable_cpu(void *h) @@ -2594,5 +2621,20 @@ static int __init mcheck_debugfs_init(void) return 0; } -late_initcall(mcheck_debugfs_init); +#else +static int __init mcheck_debugfs_init(void) { return -EINVAL; } #endif + +static int __init mcheck_late_init(void) +{ + mcheck_debugfs_init(); + + /* + * Flush out everything that has been logged during early boot, now that + * everything has been initialized (workqueues, decoders, ...). + */ + mce_schedule_work(); + + return 0; +} +late_initcall(mcheck_late_init); diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce_amd.c b/kernel/arch/x86/kernel/cpu/mcheck/mce_amd.c index 55ad9b37c..e99b15077 100644 --- a/kernel/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/kernel/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,19 +1,13 @@ /* - * (c) 2005-2012 Advanced Micro Devices, Inc. + * (c) 2005-2015 Advanced Micro Devices, Inc. 
* Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. - * * Maintained by: Borislav Petkov <bp@alien8.de> * - * April 2006 - * - added support for AMD Family 0x10 processors - * May 2012 - * - major scrubbing - * - * All MC4_MISCi registers are shared between multi-cores + * All MC4_MISCi registers are shared between cores on a node. */ #include <linux/interrupt.h> #include <linux/notifier.h> @@ -32,6 +26,7 @@ #include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/trace/irq_vectors.h> #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -47,6 +42,13 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 +/* Deferred error settings */ +#define MSR_CU_DEF_ERR 0xC0000410 +#define MASK_DEF_LVTOFF 0x000000F0 +#define MASK_DEF_INT_TYPE 0x00000006 +#define DEF_LVT_OFF 0x2 +#define DEF_INT_TYPE_APIC 0x2 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -60,6 +62,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); +static void amd_deferred_error_interrupt(void); + +static void default_deferred_error_interrupt(void) +{ + pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR); +} +void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; /* * CPU Initialization @@ -196,7 +205,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) threshold_restart_bank(&tr); }; -static int setup_APIC_mce(int reserved, int new) +static int setup_APIC_mce_threshold(int reserved, int new) { if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0)) @@ -205,6 +214,39 @@ static int setup_APIC_mce(int reserved, int new) return reserved; } +static int setup_APIC_deferred_error(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) +{ + u32 low = 0, high = 0; + int def_offset = -1, def_new; + + if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) + return; + + def_new = (low & MASK_DEF_LVTOFF) >> 4; + if (!(low & MASK_DEF_LVTOFF)) { + pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); + def_new = DEF_LVT_OFF; + low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); + } + + def_offset = setup_APIC_deferred_error(def_offset, def_new); + if ((def_offset == def_new) && + (deferred_error_int_vector != amd_deferred_error_interrupt)) + deferred_error_int_vector = amd_deferred_error_interrupt; + + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + wrmsr(MSR_CU_DEF_ERR, low, high); +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -252,7 +294,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) b.interrupt_enable = 1; new = (high & MASK_LVTOFF_HI) >> 20; - offset = setup_APIC_mce(offset, new); + offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) @@ -262,6 +304,73 @@ init: mce_threshold_block_init(&b, offset); } } + + if (mce_flags.succor) + deferred_error_interrupt_enable(c); +} + +static void __log_error(unsigned int bank, bool 
threshold_err, u64 misc) +{ + struct mce m; + u64 status; + + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + if (!(status & MCI_STATUS_VAL)) + return; + + mce_setup(&m); + + m.status = status; + m.bank = bank; + + if (threshold_err) + m.misc = misc; + + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr); + + mce_log(&m); + wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); +} + +static inline void __smp_deferred_error_interrupt(void) +{ + inc_irq_stat(irq_deferred_error_count); + deferred_error_int_vector(); +} + +asmlinkage __visible void smp_deferred_error_interrupt(void) +{ + entering_irq(); + __smp_deferred_error_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_deferred_error_interrupt(void) +{ + entering_irq(); + trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); + __smp_deferred_error_interrupt(); + trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + exiting_ack_irq(); +} + +/* APIC interrupt handler for deferred errors */ +static void amd_deferred_error_interrupt(void) +{ + u64 status; + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + rdmsrl(MSR_IA32_MCx_STATUS(bank), status); + + if (!(status & MCI_STATUS_VAL) || + !(status & MCI_STATUS_DEFERRED)) + continue; + + __log_error(bank, false, 0); + break; + } } /* @@ -273,12 +382,12 @@ init: * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. */ + static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; int cpu = smp_processor_id(); unsigned int bank, block; - struct mce m; /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { @@ -321,15 +430,7 @@ static void amd_threshold_interrupt(void) return; log: - mce_setup(&m); - rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); - if (!(m.status & MCI_STATUS_VAL)) - return; - m.misc = ((u64)high << 32) | low; - m.bank = bank; - mce_log(&m); - - wrmsrl(MSR_IA32_MCx_STATUS(bank), 0); + __log_error(bank, true, ((u64)high << 32) | low); } /* diff --git a/kernel/arch/x86/kernel/cpu/mcheck/mce_intel.c b/kernel/arch/x86/kernel/cpu/mcheck/mce_intel.c index e166d833c..1e8bb6c94 100644 --- a/kernel/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/kernel/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -91,6 +91,36 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +static bool lmce_supported(void) +{ + u64 tmp; + + if (mca_cfg.lmce_disabled) + return false; + + rdmsrl(MSR_IA32_MCG_CAP, tmp); + + /* + * LMCE depends on recovery support in the processor. Hence both + * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP. + */ + if ((tmp & (MCG_SER_P | MCG_LMCE_P)) != + (MCG_SER_P | MCG_LMCE_P)) + return false; + + /* + * BIOS should indicate support for LMCE by setting bit 20 in + * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will + * generate a #GP fault. 
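/*
 * Background, for illustration: IA32_FEATURE_CONTROL is effectively
 * write-once -- after firmware sets FEATURE_CONTROL_LOCKED, further writes
 * #GP until the next reset -- so LMCE is only usable when the BIOS opted
 * in with FEATURE_CONTROL_LMCE (bit 20) before locking the MSR. That is
 * why the check below insists on seeing both bits set rather than trying
 * to set the enable bit itself.
 */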
+ */ + rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp); + if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) == + (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) + return true; + + return false; +} + bool mce_intel_cmci_poll(void) { if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) @@ -221,7 +251,6 @@ static void intel_threshold_interrupt(void) return; machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); - mce_notify_irq(); } /* @@ -410,8 +439,39 @@ static void intel_init_cmci(void) cmci_recheck(); } +static void intel_init_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + + if (!(val & MCG_EXT_CTL_LMCE_EN)) + wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); +} + +static void intel_clear_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + val &= ~MCG_EXT_CTL_LMCE_EN; + wrmsrl(MSR_IA32_MCG_EXT_CTL, val); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); + intel_init_lmce(); +} + +void mce_intel_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); } diff --git a/kernel/arch/x86/kernel/cpu/mcheck/p5.c b/kernel/arch/x86/kernel/cpu/mcheck/p5.c index 737b0ad4e..12402e10a 100644 --- a/kernel/arch/x86/kernel/cpu/mcheck/p5.c +++ b/kernel/arch/x86/kernel/cpu/mcheck/p5.c @@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; u32 loaddr, hi, lotype; - prev_state = ist_enter(regs); + ist_enter(regs); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs, prev_state); + ist_exit(regs); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/kernel/arch/x86/kernel/cpu/mcheck/therm_throt.c b/kernel/arch/x86/kernel/cpu/mcheck/therm_throt.c index 1af51b158..2c5aaf8c2 100644 --- a/kernel/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/kernel/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -503,14 +503,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } - /* Check whether a vector already exists */ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG - "CPU%d: Thermal LVT vector (%#x) already installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; - } - /* early Pentium M models use different method for enabling TM2 */ if (cpu_has(c, X86_FEATURE_TM2)) { if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { diff --git a/kernel/arch/x86/kernel/cpu/mcheck/winchip.c b/kernel/arch/x86/kernel/cpu/mcheck/winchip.c index 44f138296..01dd87028 100644 --- a/kernel/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/kernel/arch/x86/kernel/cpu/mcheck/winchip.c @@ -15,12 +15,12 @@ /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state = ist_enter(regs); + ist_enter(regs); printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs, prev_state); + ist_exit(regs); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/kernel/arch/x86/kernel/cpu/microcode/Makefile b/kernel/arch/x86/kernel/cpu/microcode/Makefile index 285c85427..220b1a508 100644 --- 
a/kernel/arch/x86/kernel/cpu/microcode/Makefile +++ b/kernel/arch/x86/kernel/cpu/microcode/Makefile @@ -2,6 +2,3 @@ microcode-y := core.o obj-$(CONFIG_MICROCODE) += microcode.o microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o microcode-$(CONFIG_MICROCODE_AMD) += amd.o -obj-$(CONFIG_MICROCODE_EARLY) += core_early.o -obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o -obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o diff --git a/kernel/arch/x86/kernel/cpu/microcode/amd.c b/kernel/arch/x86/kernel/cpu/microcode/amd.c index 12829c3ce..2233f8a76 100644 --- a/kernel/arch/x86/kernel/cpu/microcode/amd.c +++ b/kernel/arch/x86/kernel/cpu/microcode/amd.c @@ -1,5 +1,9 @@ /* * AMD CPU Microcode Update Driver for Linux + * + * This driver allows to upgrade microcode on F10h AMD + * CPUs and later. + * * Copyright (C) 2008-2011 Advanced Micro Devices Inc. * * Author: Peter Oruba <peter.oruba@amd.com> @@ -7,34 +11,31 @@ * Based on work by: * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * - * Maintainers: - * Andreas Herrmann <herrmann.der.user@googlemail.com> - * Borislav Petkov <bp@alien8.de> + * early loader: + * Copyright (C) 2013 Advanced Micro Devices, Inc. * - * This driver allows to upgrade microcode on F10h AMD - * CPUs and later. + * Author: Jacob Shin <jacob.shin@amd.com> + * Fixes: Borislav Petkov <bp@suse.de> * * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ +#define pr_fmt(fmt) "microcode: " fmt -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - +#include <linux/earlycpio.h> #include <linux/firmware.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> +#include <linux/initrd.h> #include <linux/kernel.h> -#include <linux/module.h> #include <linux/pci.h> +#include <asm/microcode_amd.h> #include <asm/microcode.h> #include <asm/processor.h> +#include <asm/setup.h> +#include <asm/cpu.h> #include <asm/msr.h> -#include <asm/microcode_amd.h> - -MODULE_DESCRIPTION("AMD Microcode Update Driver"); -MODULE_AUTHOR("Peter Oruba"); -MODULE_LICENSE("GPL v2"); static struct equiv_cpu_entry *equiv_cpu_table; @@ -47,6 +48,432 @@ struct ucode_patch { static LIST_HEAD(pcache); +/* + * This points to the current valid container of microcode patches which we will + * save from the initrd before jettisoning its contents. + */ +static u8 *container; +static size_t container_size; + +static u32 ucode_new_rev; +u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u16 this_equiv_id; + +static struct cpio_data ucode_cpio; + +/* + * Microcode patch container file is prepended to the initrd in cpio format. + * See Documentation/x86/early-microcode.txt + */ +static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; + +static struct cpio_data __init find_ucode_in_initrd(void) +{ + long offset = 0; + char *path; + void *start; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *p; + + /* + * On 32-bit, early load occurs before paging is turned on so we need + * to use physical addresses. 
+ */ + p = (struct boot_params *)__pa_nodebug(&boot_params); + path = (char *)__pa_nodebug(ucode_path); + start = (void *)p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; +#else + path = ucode_path; + start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); + size = boot_params.hdr.ramdisk_size; +#endif + + return find_cpio_data(path, start, size, &offset); +} + +static size_t compute_container_size(u8 *data, u32 total_size) +{ + size_t size = 0; + u32 *header = (u32 *)data; + + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return size; + + size = header[2] + CONTAINER_HDR_SZ; + total_size -= size; + data += size; + + while (total_size) { + u16 patch_size; + + header = (u32 *)data; + + if (header[0] != UCODE_UCODE_TYPE) + break; + + /* + * Sanity-check patch size. + */ + patch_size = header[1]; + if (patch_size > PATCH_MAX_SIZE) + break; + + size += patch_size + SECTION_HDR_SIZE; + data += patch_size + SECTION_HDR_SIZE; + total_size -= patch_size + SECTION_HDR_SIZE; + } + + return size; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ +static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) +{ + struct equiv_cpu_entry *eq; + size_t *cont_sz; + u32 *header; + u8 *data, **cont; + u8 (*patch)[PATCH_MAX_SIZE]; + u16 eq_id = 0; + int offset, left; + u32 rev, eax, ebx, ecx, edx; + u32 *new_rev; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + cont_sz = (size_t *)__pa_nodebug(&container_size); + cont = (u8 **)__pa_nodebug(&container); + patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); +#else + new_rev = &ucode_new_rev; + cont_sz = &container_size; + cont = &container; + patch = &amd_ucode_patch; +#endif + + data = ucode; + left = size; + header = (u32 *)data; + + /* find equiv cpu table */ + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return; + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + while (left > 0) { + eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); + + *cont = data; + + /* Advance past the container header */ + offset = header[2] + CONTAINER_HDR_SZ; + data += offset; + left -= offset; + + eq_id = find_equiv_id(eq, eax); + if (eq_id) { + this_equiv_id = eq_id; + *cont_sz = compute_container_size(*cont, left + offset); + + /* + * truncate how much we need to iterate over in the + * ucode update loop below + */ + left = *cont_sz - offset; + break; + } + + /* + * support multiple container files appended together. if this + * one does not have a matching equivalent cpu entry, we fast + * forward to the next container file. 
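/*
 * Illustrative layout of one microcode container as walked by
 * compute_container_size() and apply_ucode_in_initrd() above; the code
 * reads raw u32s, so these struct and field names are ad hoc, not kernel
 * types:
 */
#include <linux/types.h>

struct amd_container_hdr {	/* CONTAINER_HDR_SZ bytes                  */
	u32 magic;		/* UCODE_MAGIC                             */
	u32 type;		/* UCODE_EQUIV_CPU_TABLE_TYPE              */
	u32 equiv_table_size;	/* bytes of equiv_cpu_entry[] that follow  */
};

struct amd_section_hdr {	/* SECTION_HDR_SIZE bytes, repeated        */
	u32 type;		/* UCODE_UCODE_TYPE                        */
	u32 patch_size;		/* one struct microcode_amd follows        */
};

/* A further container may be appended directly after the last section. */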
+ */ + while (left > 0) { + header = (u32 *)data; + if (header[0] == UCODE_MAGIC && + header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) + break; + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } + + /* mark where the next microcode container file starts */ + offset = data - (u8 *)ucode; + ucode = data; + } + + if (!eq_id) { + *cont = NULL; + *cont_sz = 0; + return; + } + + if (check_current_patch_level(&rev, true)) + return; + + while (left > 0) { + struct microcode_amd *mc; + + header = (u32 *)data; + if (header[0] != UCODE_UCODE_TYPE || /* type */ + header[1] == 0) /* size */ + break; + + mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); + + if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + + if (!__apply_microcode_amd(mc)) { + rev = mc->hdr.patch_id; + *new_rev = rev; + + if (save_patch) + memcpy(patch, mc, + min_t(u32, header[1], PATCH_MAX_SIZE)); + } + } + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } +} + +static bool __init load_builtin_amd_microcode(struct cpio_data *cp, + unsigned int family) +{ +#ifdef CONFIG_X86_64 + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + + if (family >= 0x15) + snprintf(fw_name, sizeof(fw_name), + "amd-ucode/microcode_amd_fam%.2xh.bin", family); + + return get_builtin_firmware(cp, fw_name); +#else + return false; +#endif +} + +void __init load_ucode_amd_bsp(unsigned int family) +{ + struct cpio_data cp; + void **data; + size_t *size; + +#ifdef CONFIG_X86_32 + data = (void **)__pa_nodebug(&ucode_cpio.data); + size = (size_t *)__pa_nodebug(&ucode_cpio.size); +#else + data = &ucode_cpio.data; + size = &ucode_cpio.size; +#endif + + cp = find_ucode_in_initrd(); + if (!cp.data) { + if (!load_builtin_amd_microcode(&cp, family)) + return; + } + + *data = cp.data; + *size = cp.size; + + apply_ucode_in_initrd(cp.data, cp.size, true); +} + +#ifdef CONFIG_X86_32 +/* + * On 32-bit, since AP's early load occurs before paging is turned on, we + * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during + * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During + * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * which is used upon resume from suspend. + */ +void load_ucode_amd_ap(void) +{ + struct microcode_amd *mc; + size_t *usize; + void **ucode; + + mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); + if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { + __apply_microcode_amd(mc); + return; + } + + ucode = (void *)__pa_nodebug(&container); + usize = (size_t *)__pa_nodebug(&container_size); + + if (!*ucode || !*usize) + return; + + apply_ucode_in_initrd(*ucode, *usize, false); +} + +static void __init collect_cpu_sig_on_bsp(void *arg) +{ + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + uci->cpu_sig.sig = cpuid_eax(0x00000001); +} + +static void __init get_bsp_sig(void) +{ + unsigned int bsp = boot_cpu_data.cpu_index; + struct ucode_cpu_info *uci = ucode_cpu_info + bsp; + + if (!uci->cpu_sig.sig) + smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); +} +#else +void load_ucode_amd_ap(void) +{ + unsigned int cpu = smp_processor_id(); + struct equiv_cpu_entry *eq; + struct microcode_amd *mc; + u32 rev, eax; + u16 eq_id; + + /* Exit if called on the BSP. */ + if (!cpu) + return; + + if (!container) + return; + + /* + * 64-bit runs with paging enabled, thus early==false. 
+ */ + if (check_current_patch_level(&rev, false)) + return; + + eax = cpuid_eax(0x00000001); + eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + + eq_id = find_equiv_id(eq, eax); + if (!eq_id) + return; + + if (eq_id == this_equiv_id) { + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) + ucode_new_rev = mc->hdr.patch_id; + } + + } else { + if (!ucode_cpio.data) + return; + + /* + * AP has a different equivalence ID than BSP, looks like + * mixed-steppings silicon so go through the ucode blob anew. + */ + apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); + } +} +#endif + +int __init save_microcode_in_initrd_amd(void) +{ + unsigned long cont; + int retval = 0; + enum ucode_state ret; + u8 *cont_va; + u32 eax; + + if (!container) + return -EINVAL; + +#ifdef CONFIG_X86_32 + get_bsp_sig(); + cont = (unsigned long)container; + cont_va = __va(container); +#else + /* + * We need the physical address of the container for both bitness since + * boot_params.hdr.ramdisk_image is a physical address. + */ + cont = __pa(container); + cont_va = container; +#endif + + /* + * Take into account the fact that the ramdisk might get relocated and + * therefore we need to recompute the container's position in virtual + * memory space. + */ + if (relocated_ramdisk) + container = (u8 *)(__va(relocated_ramdisk) + + (cont - boot_params.hdr.ramdisk_image)); + else + container = cont_va; + + if (ucode_new_rev) + pr_info("microcode: updated early to new patch_level=0x%08x\n", + ucode_new_rev); + + eax = cpuid_eax(0x00000001); + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + + ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); + if (ret != UCODE_OK) + retval = -EINVAL; + + /* + * This will be freed any msec now, stash patches for the current + * family and switch to patch cache for cpu hotplug, etc later. + */ + container = NULL; + container_size = 0; + + return retval; +} + +void reload_ucode_amd(void) +{ + struct microcode_amd *mc; + u32 rev; + + /* + * early==false because this is a syscore ->resume path and by + * that time paging is long enabled. + */ + if (check_current_patch_level(&rev, false)) + return; + + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + ucode_new_rev = mc->hdr.patch_id; + pr_info("microcode: reload patch_level=0x%08x\n", + ucode_new_rev); + } + } +} static u16 __find_equiv_id(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -177,6 +604,53 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, return patch_size; } +/* + * Those patch levels cannot be updated to newer ones and thus should be final. + */ +static u32 final_levels[] = { + 0x01000098, + 0x0100009f, + 0x010000af, + 0, /* T-101 terminator */ +}; + +/* + * Check the current patch level on this CPU. + * + * @rev: Use it to return the patch level. It is set to 0 in the case of + * error. 
+ * + * Returns: + * - true: if update should stop + * - false: otherwise + */ +bool check_current_patch_level(u32 *rev, bool early) +{ + u32 lvl, dummy, i; + bool ret = false; + u32 *levels; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); + + if (IS_ENABLED(CONFIG_X86_32) && early) + levels = (u32 *)__pa_nodebug(&final_levels); + else + levels = final_levels; + + for (i = 0; levels[i]; i++) { + if (lvl == levels[i]) { + lvl = 0; + ret = true; + break; + } + } + + if (rev) + *rev = lvl; + + return ret; +} + int __apply_microcode_amd(struct microcode_amd *mc_amd) { u32 rev, dummy; @@ -197,7 +671,7 @@ int apply_microcode_amd(int cpu) struct microcode_amd *mc_amd; struct ucode_cpu_info *uci; struct ucode_patch *p; - u32 rev, dummy; + u32 rev; BUG_ON(raw_smp_processor_id() != cpu); @@ -210,7 +684,8 @@ int apply_microcode_amd(int cpu) mc_amd = p->data; uci->mc = p->data; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (check_current_patch_level(&rev, false)) + return -1; /* need to apply patch? */ if (rev >= mc_amd->hdr.patch_id) { @@ -387,7 +862,7 @@ enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t s if (ret != UCODE_OK) cleanup(); -#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) +#ifdef CONFIG_X86_32 /* save BSP's matching patch for early load */ if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { struct ucode_patch *p = find_patch(cpu); @@ -475,7 +950,7 @@ static struct microcode_ops microcode_amd_ops = { struct microcode_ops * __init init_amd_microcode(void) { - struct cpuinfo_x86 *c = &cpu_data(0); + struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { pr_warning("AMD CPU family 0x%x not supported\n", c->x86); diff --git a/kernel/arch/x86/kernel/cpu/microcode/amd_early.c b/kernel/arch/x86/kernel/cpu/microcode/amd_early.c deleted file mode 100644 index 737737edb..000000000 --- a/kernel/arch/x86/kernel/cpu/microcode/amd_early.c +++ /dev/null @@ -1,422 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Jacob Shin <jacob.shin@amd.com> - * Fixes: Borislav Petkov <bp@suse.de> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/earlycpio.h> -#include <linux/initrd.h> - -#include <asm/cpu.h> -#include <asm/setup.h> -#include <asm/microcode_amd.h> - -/* - * This points to the current valid container of microcode patches which we will - * save from the initrd before jettisoning its contents. - */ -static u8 *container; -static size_t container_size; - -static u32 ucode_new_rev; -u8 amd_ucode_patch[PATCH_MAX_SIZE]; -static u16 this_equiv_id; - -static struct cpio_data ucode_cpio; - -/* - * Microcode patch container file is prepended to the initrd in cpio format. - * See Documentation/x86/early-microcode.txt - */ -static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; - -static struct cpio_data __init find_ucode_in_initrd(void) -{ - long offset = 0; - char *path; - void *start; - size_t size; - -#ifdef CONFIG_X86_32 - struct boot_params *p; - - /* - * On 32-bit, early load occurs before paging is turned on so we need - * to use physical addresses. 
- */ - p = (struct boot_params *)__pa_nodebug(&boot_params); - path = (char *)__pa_nodebug(ucode_path); - start = (void *)p->hdr.ramdisk_image; - size = p->hdr.ramdisk_size; -#else - path = ucode_path; - start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); - size = boot_params.hdr.ramdisk_size; -#endif - - return find_cpio_data(path, start, size, &offset); -} - -static size_t compute_container_size(u8 *data, u32 total_size) -{ - size_t size = 0; - u32 *header = (u32 *)data; - - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return size; - - size = header[2] + CONTAINER_HDR_SZ; - total_size -= size; - data += size; - - while (total_size) { - u16 patch_size; - - header = (u32 *)data; - - if (header[0] != UCODE_UCODE_TYPE) - break; - - /* - * Sanity-check patch size. - */ - patch_size = header[1]; - if (patch_size > PATCH_MAX_SIZE) - break; - - size += patch_size + SECTION_HDR_SIZE; - data += patch_size + SECTION_HDR_SIZE; - total_size -= patch_size + SECTION_HDR_SIZE; - } - - return size; -} - -/* - * Early load occurs before we can vmalloc(). So we look for the microcode - * patch container file in initrd, traverse equivalent cpu table, look for a - * matching microcode patch, and update, all in initrd memory in place. - * When vmalloc() is available for use later -- on 64-bit during first AP load, - * and on 32-bit during save_microcode_in_initrd_amd() -- we can call - * load_microcode_amd() to save equivalent cpu table and microcode patches in - * kernel heap memory. - */ -static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) -{ - struct equiv_cpu_entry *eq; - size_t *cont_sz; - u32 *header; - u8 *data, **cont; - u8 (*patch)[PATCH_MAX_SIZE]; - u16 eq_id = 0; - int offset, left; - u32 rev, eax, ebx, ecx, edx; - u32 *new_rev; - -#ifdef CONFIG_X86_32 - new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - cont_sz = (size_t *)__pa_nodebug(&container_size); - cont = (u8 **)__pa_nodebug(&container); - patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); -#else - new_rev = &ucode_new_rev; - cont_sz = &container_size; - cont = &container; - patch = &amd_ucode_patch; -#endif - - data = ucode; - left = size; - header = (u32 *)data; - - /* find equiv cpu table */ - if (header[0] != UCODE_MAGIC || - header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ - header[2] == 0) /* size */ - return; - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - while (left > 0) { - eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); - - *cont = data; - - /* Advance past the container header */ - offset = header[2] + CONTAINER_HDR_SZ; - data += offset; - left -= offset; - - eq_id = find_equiv_id(eq, eax); - if (eq_id) { - this_equiv_id = eq_id; - *cont_sz = compute_container_size(*cont, left + offset); - - /* - * truncate how much we need to iterate over in the - * ucode update loop below - */ - left = *cont_sz - offset; - break; - } - - /* - * support multiple container files appended together. if this - * one does not have a matching equivalent cpu entry, we fast - * forward to the next container file. 
- */ - while (left > 0) { - header = (u32 *)data; - if (header[0] == UCODE_MAGIC && - header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) - break; - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } - - /* mark where the next microcode container file starts */ - offset = data - (u8 *)ucode; - ucode = data; - } - - if (!eq_id) { - *cont = NULL; - *cont_sz = 0; - return; - } - - /* find ucode and update if needed */ - - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); - - while (left > 0) { - struct microcode_amd *mc; - - header = (u32 *)data; - if (header[0] != UCODE_UCODE_TYPE || /* type */ - header[1] == 0) /* size */ - break; - - mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); - - if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { - - if (!__apply_microcode_amd(mc)) { - rev = mc->hdr.patch_id; - *new_rev = rev; - - if (save_patch) - memcpy(patch, mc, - min_t(u32, header[1], PATCH_MAX_SIZE)); - } - } - - offset = header[1] + SECTION_HDR_SIZE; - data += offset; - left -= offset; - } -} - -void __init load_ucode_amd_bsp(void) -{ - struct cpio_data cp; - void **data; - size_t *size; - -#ifdef CONFIG_X86_32 - data = (void **)__pa_nodebug(&ucode_cpio.data); - size = (size_t *)__pa_nodebug(&ucode_cpio.size); -#else - data = &ucode_cpio.data; - size = &ucode_cpio.size; -#endif - - cp = find_ucode_in_initrd(); - if (!cp.data) - return; - - *data = cp.data; - *size = cp.size; - - apply_ucode_in_initrd(cp.data, cp.size, true); -} - -#ifdef CONFIG_X86_32 -/* - * On 32-bit, since AP's early load occurs before paging is turned on, we - * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during - * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During - * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, - * which is used upon resume from suspend. - */ -void load_ucode_amd_ap(void) -{ - struct microcode_amd *mc; - size_t *usize; - void **ucode; - - mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); - if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { - __apply_microcode_amd(mc); - return; - } - - ucode = (void *)__pa_nodebug(&container); - usize = (size_t *)__pa_nodebug(&container_size); - - if (!*ucode || !*usize) - return; - - apply_ucode_in_initrd(*ucode, *usize, false); -} - -static void __init collect_cpu_sig_on_bsp(void *arg) -{ - unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - uci->cpu_sig.sig = cpuid_eax(0x00000001); -} - -static void __init get_bsp_sig(void) -{ - unsigned int bsp = boot_cpu_data.cpu_index; - struct ucode_cpu_info *uci = ucode_cpu_info + bsp; - - if (!uci->cpu_sig.sig) - smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); -} -#else -void load_ucode_amd_ap(void) -{ - unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct equiv_cpu_entry *eq; - struct microcode_amd *mc; - u32 rev, eax; - u16 eq_id; - - /* Exit if called on the BSP. 
*/ - if (!cpu) - return; - - if (!container) - return; - - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); - - uci->cpu_sig.rev = rev; - uci->cpu_sig.sig = eax; - - eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); - - eq_id = find_equiv_id(eq, eax); - if (!eq_id) - return; - - if (eq_id == this_equiv_id) { - mc = (struct microcode_amd *)amd_ucode_patch; - - if (mc && rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) - ucode_new_rev = mc->hdr.patch_id; - } - - } else { - if (!ucode_cpio.data) - return; - - /* - * AP has a different equivalence ID than BSP, looks like - * mixed-steppings silicon so go through the ucode blob anew. - */ - apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); - } -} -#endif - -int __init save_microcode_in_initrd_amd(void) -{ - unsigned long cont; - int retval = 0; - enum ucode_state ret; - u8 *cont_va; - u32 eax; - - if (!container) - return -EINVAL; - -#ifdef CONFIG_X86_32 - get_bsp_sig(); - cont = (unsigned long)container; - cont_va = __va(container); -#else - /* - * We need the physical address of the container for both bitness since - * boot_params.hdr.ramdisk_image is a physical address. - */ - cont = __pa(container); - cont_va = container; -#endif - - /* - * Take into account the fact that the ramdisk might get relocated and - * therefore we need to recompute the container's position in virtual - * memory space. - */ - if (relocated_ramdisk) - container = (u8 *)(__va(relocated_ramdisk) + - (cont - boot_params.hdr.ramdisk_image)); - else - container = cont_va; - - if (ucode_new_rev) - pr_info("microcode: updated early to new patch_level=0x%08x\n", - ucode_new_rev); - - eax = cpuid_eax(0x00000001); - eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); - - ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); - if (ret != UCODE_OK) - retval = -EINVAL; - - /* - * This will be freed any msec now, stash patches for the current - * family and switch to patch cache for cpu hotplug, etc later. - */ - container = NULL; - container_size = 0; - - return retval; -} - -void reload_ucode_amd(void) -{ - struct microcode_amd *mc; - u32 rev, eax; - - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); - - mc = (struct microcode_amd *)amd_ucode_patch; - - if (mc && rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) { - ucode_new_rev = mc->hdr.patch_id; - pr_info("microcode: reload patch_level=0x%08x\n", - ucode_new_rev); - } - } -} diff --git a/kernel/arch/x86/kernel/cpu/microcode/core.c b/kernel/arch/x86/kernel/cpu/microcode/core.c index 36a83617e..b3e94ef46 100644 --- a/kernel/arch/x86/kernel/cpu/microcode/core.c +++ b/kernel/arch/x86/kernel/cpu/microcode/core.c @@ -1,104 +1,57 @@ /* - * Intel CPU Microcode Update Driver for Linux + * CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> - * 2006 Shaohua Li <shaohua.li@intel.com> + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> + * 2013-2015 Borislav Petkov <bp@alien8.de> * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. + * X86 CPU microcode early update for Linux: * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? 
Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * (C) 2015 Borislav Petkov <bp@alien8.de> * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * This driver allows to upgrade microcode on x86 processors. * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. - * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and - * Tigran Aivazian <tigran@veritas.com> - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and - * Tigran Aivazian <tigran@veritas.com>, - * Serialize updates as required on HT processors due to - * speculative nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, - * Jun Nakajima <jun.nakajima@intel.com> - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
*/ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define pr_fmt(fmt) "microcode: " fmt #include <linux/platform_device.h> +#include <linux/syscore_ops.h> #include <linux/miscdevice.h> #include <linux/capability.h> +#include <linux/firmware.h> #include <linux/kernel.h> -#include <linux/module.h> #include <linux/mutex.h> #include <linux/cpu.h> #include <linux/fs.h> #include <linux/mm.h> -#include <linux/syscore_ops.h> -#include <asm/microcode.h> -#include <asm/processor.h> +#include <asm/microcode_intel.h> #include <asm/cpu_device_id.h> +#include <asm/microcode_amd.h> #include <asm/perf_event.h> +#include <asm/microcode.h> +#include <asm/processor.h> +#include <asm/cmdline.h> -MODULE_DESCRIPTION("Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); -MODULE_LICENSE("GPL"); - -#define MICROCODE_VERSION "2.00" +#define MICROCODE_VERSION "2.01" static struct microcode_ops *microcode_ops; -bool dis_ucode_ldr; -module_param(dis_ucode_ldr, bool, 0); +static bool dis_ucode_ldr; + +static int __init disable_loader(char *str) +{ + dis_ucode_ldr = true; + return 1; +} +__setup("dis_ucode_ldr", disable_loader); /* * Synchronization. @@ -126,6 +79,150 @@ struct cpu_info_ctx { int err; }; +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *opt = "dis_ucode_ldr"; + const char *option = (const char *)__pa_nodebug(opt); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = "dis_ucode_ldr"; + bool *res = &dis_ucode_ldr; +#endif + + if (cmdline_find_option_bool(cmdline, option)) + *res = true; + + return *res; +} + +extern struct builtin_fw __start_builtin_fw[]; +extern struct builtin_fw __end_builtin_fw[]; + +bool get_builtin_firmware(struct cpio_data *cd, const char *name) +{ +#ifdef CONFIG_FW_LOADER + struct builtin_fw *b_fw; + + for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { + if (!strcmp(name, b_fw->name)) { + cd->size = b_fw->size; + cd->data = b_fw->data; + return true; + } + } +#endif + return false; +} + +void __init load_ucode_bsp(void) +{ + int vendor; + unsigned int family; + + if (check_loader_disabled_bsp()) + return; + + if (!have_cpuid_p()) + return; + + vendor = x86_vendor(); + family = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (family >= 6) + load_ucode_intel_bsp(); + break; + case X86_VENDOR_AMD: + if (family >= 0x10) + load_ucode_amd_bsp(family); + break; + default: + break; + } +} + +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return *((bool *)__pa_nodebug(&dis_ucode_ldr)); +#else + return dis_ucode_ldr; +#endif +} + +void load_ucode_ap(void) +{ + int vendor, family; + + if (check_loader_disabled_ap()) + return; + + if (!have_cpuid_p()) + return; + + vendor = x86_vendor(); + family = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (family >= 6) + load_ucode_intel_ap(); + break; + case X86_VENDOR_AMD: + if (family >= 0x10) + load_ucode_amd_ap(); + break; + default: + break; + } +} + +int __init save_microcode_in_initrd(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + if (c->x86 >= 6) + save_microcode_in_initrd_intel(); + break; + case X86_VENDOR_AMD: + if (c->x86 >= 0x10) + save_microcode_in_initrd_amd(); + break; + default: + break; + } + + return 0; +} + +void reload_early_microcode(void) +{ + int vendor, 
family; + + vendor = x86_vendor(); + family = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (family >= 6) + reload_ucode_intel(); + break; + case X86_VENDOR_AMD: + if (family >= 0x10) + reload_ucode_amd(); + break; + default: + break; + } +} + static void collect_cpu_info_local(void *arg) { struct cpu_info_ctx *ctx = arg; @@ -268,9 +365,6 @@ static void __exit microcode_dev_exit(void) { misc_deregister(µcode_dev); } - -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -MODULE_ALIAS("devname:cpu/microcode"); #else #define microcode_dev_init() 0 #define microcode_dev_exit() do { } while (0) @@ -435,17 +529,16 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif) return err; } -static int mc_device_remove(struct device *dev, struct subsys_interface *sif) +static void mc_device_remove(struct device *dev, struct subsys_interface *sif) { int cpu = dev->id; if (!cpu_online(cpu)) - return 0; + return; pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); sysfs_remove_group(&dev->kobj, &mc_attr_group); - return 0; } static struct subsys_interface mc_cpu_interface = { @@ -518,24 +611,10 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __refdata mc_cpu_notifier = { +static struct notifier_block mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; -#ifdef MODULE -/* Autoload on Intel and AMD systems */ -static const struct x86_cpu_id __initconst microcode_id[] = { -#ifdef CONFIG_MICROCODE_INTEL - { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, -#endif -#ifdef CONFIG_MICROCODE_AMD - { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, -#endif - {} -}; -MODULE_DEVICE_TABLE(x86cpu, microcode_id); -#endif - static struct attribute *cpu_root_microcode_attrs[] = { &dev_attr_reload.attr, NULL @@ -546,9 +625,9 @@ static struct attribute_group cpu_root_microcode_group = { .attrs = cpu_root_microcode_attrs, }; -static int __init microcode_init(void) +int __init microcode_init(void) { - struct cpuinfo_x86 *c = &cpu_data(0); + struct cpuinfo_x86 *c = &boot_cpu_data; int error; if (paravirt_enabled() || dis_ucode_ldr) @@ -619,35 +698,4 @@ static int __init microcode_init(void) return error; } -module_init(microcode_init); - -static void __exit microcode_exit(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - microcode_dev_exit(); - - unregister_hotcpu_notifier(&mc_cpu_notifier); - unregister_syscore_ops(&mc_syscore_ops); - - sysfs_remove_group(&cpu_subsys.dev_root->kobj, - &cpu_root_microcode_group); - - get_online_cpus(); - mutex_lock(µcode_mutex); - - subsys_interface_unregister(&mc_cpu_interface); - - mutex_unlock(µcode_mutex); - put_online_cpus(); - - platform_device_unregister(microcode_pdev); - - microcode_ops = NULL; - - if (c->x86_vendor == X86_VENDOR_AMD) - exit_amd_microcode(); - - pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); -} -module_exit(microcode_exit); +late_initcall(microcode_init); diff --git a/kernel/arch/x86/kernel/cpu/microcode/core_early.c b/kernel/arch/x86/kernel/cpu/microcode/core_early.c deleted file mode 100644 index a413a69cb..000000000 --- a/kernel/arch/x86/kernel/cpu/microcode/core_early.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * X86 CPU microcode early update for Linux - * - * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> - * H Peter Anvin" <hpa@zytor.com> - * - * This driver allows to early upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. 
- * - * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture - * Software Developer's Manual. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include <linux/module.h> -#include <asm/microcode.h> -#include <asm/microcode_intel.h> -#include <asm/microcode_amd.h> -#include <asm/processor.h> -#include <asm/cmdline.h> - -static bool __init check_loader_disabled_bsp(void) -{ -#ifdef CONFIG_X86_32 - const char *cmdline = (const char *)__pa_nodebug(boot_command_line); - const char *opt = "dis_ucode_ldr"; - const char *option = (const char *)__pa_nodebug(opt); - bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); - -#else /* CONFIG_X86_64 */ - const char *cmdline = boot_command_line; - const char *option = "dis_ucode_ldr"; - bool *res = &dis_ucode_ldr; -#endif - - if (cmdline_find_option_bool(cmdline, option)) - *res = true; - - return *res; -} - -void __init load_ucode_bsp(void) -{ - int vendor, family; - - if (check_loader_disabled_bsp()) - return; - - if (!have_cpuid_p()) - return; - - vendor = x86_vendor(); - family = x86_family(); - - switch (vendor) { - case X86_VENDOR_INTEL: - if (family >= 6) - load_ucode_intel_bsp(); - break; - case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_bsp(); - break; - default: - break; - } -} - -static bool check_loader_disabled_ap(void) -{ -#ifdef CONFIG_X86_32 - return *((bool *)__pa_nodebug(&dis_ucode_ldr)); -#else - return dis_ucode_ldr; -#endif -} - -void load_ucode_ap(void) -{ - int vendor, family; - - if (check_loader_disabled_ap()) - return; - - if (!have_cpuid_p()) - return; - - vendor = x86_vendor(); - family = x86_family(); - - switch (vendor) { - case X86_VENDOR_INTEL: - if (family >= 6) - load_ucode_intel_ap(); - break; - case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_ap(); - break; - default: - break; - } -} - -int __init save_microcode_in_initrd(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - switch (c->x86_vendor) { - case X86_VENDOR_INTEL: - if (c->x86 >= 6) - save_microcode_in_initrd_intel(); - break; - case X86_VENDOR_AMD: - if (c->x86 >= 0x10) - save_microcode_in_initrd_amd(); - break; - default: - break; - } - - return 0; -} - -void reload_early_microcode(void) -{ - int vendor, family; - - vendor = x86_vendor(); - family = x86_family(); - - switch (vendor) { - case X86_VENDOR_INTEL: - if (family >= 6) - reload_ucode_intel(); - break; - case X86_VENDOR_AMD: - if (family >= 0x10) - reload_ucode_amd(); - break; - default: - break; - } -} diff --git a/kernel/arch/x86/kernel/cpu/microcode/intel.c b/kernel/arch/x86/kernel/cpu/microcode/intel.c index a41beadb3..ce47402eb 100644 --- a/kernel/arch/x86/kernel/cpu/microcode/intel.c +++ b/kernel/arch/x86/kernel/cpu/microcode/intel.c @@ -1,91 +1,807 @@ /* - * Intel CPU Microcode Update Driver for Linux + * Intel CPU Microcode Update Driver for Linux * - * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> - * 2006 Shaohua Li <shaohua.li@intel.com> + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. + * Intel CPU microcode early update for Linux * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? 
Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. - * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and - * Tigran Aivazian <tigran@veritas.com> - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and - * Tigran Aivazian <tigran@veritas.com>, - * Serialize updates as required on HT processors due to - * speculative nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, - * Jun Nakajima <jun.nakajima@intel.com> - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
*/ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +/* + * This needs to be before all headers so that pr_debug in printk.h doesn't turn + * printk calls into no_printk(). + * + *#define DEBUG + */ +#define pr_fmt(fmt) "microcode: " fmt +#include <linux/earlycpio.h> #include <linux/firmware.h> #include <linux/uaccess.h> -#include <linux/kernel.h> -#include <linux/module.h> #include <linux/vmalloc.h> +#include <linux/initrd.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/mm.h> #include <asm/microcode_intel.h> #include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/setup.h> #include <asm/msr.h> -MODULE_DESCRIPTION("Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); -MODULE_LICENSE("GPL"); +static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +static struct mc_saved_data { + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; +} mc_saved_data; + +static enum ucode_state +load_microcode_early(struct microcode_intel **saved, + unsigned int num_saved, struct ucode_cpu_info *uci) +{ + struct microcode_intel *ucode_ptr, *new_mc = NULL; + struct microcode_header_intel *mc_hdr; + int new_rev, ret, i; + + new_rev = uci->cpu_sig.rev; + + for (i = 0; i < num_saved; i++) { + ucode_ptr = saved[i]; + mc_hdr = (struct microcode_header_intel *)ucode_ptr; + + ret = has_newer_microcode(ucode_ptr, + uci->cpu_sig.sig, + uci->cpu_sig.pf, + new_rev); + if (!ret) + continue; + + new_rev = mc_hdr->rev; + new_mc = ucode_ptr; + } + + if (!new_mc) + return UCODE_NFOUND; + + uci->mc = (struct microcode_intel *)new_mc; + return UCODE_OK; +} + +static inline void +copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, + unsigned long off, int num_saved) +{ + int i; + + for (i = 0; i < num_saved; i++) + mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); +} + +#ifdef CONFIG_X86_32 +static void +microcode_phys(struct microcode_intel **mc_saved_tmp, + struct mc_saved_data *mc_saved_data) +{ + int i; + struct microcode_intel ***mc_saved; + + mc_saved = (struct microcode_intel ***) + __pa_nodebug(&mc_saved_data->mc_saved); + for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + struct microcode_intel *p; + + p = *(struct microcode_intel **) + __pa_nodebug(mc_saved_data->mc_saved + i); + mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); + } +} +#endif + +static enum ucode_state +load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, + unsigned long initrd_start, struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int count = mc_saved_data->mc_saved_count; + + if (!mc_saved_data->mc_saved) { + copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); + + return load_microcode_early(mc_saved_tmp, count, uci); + } else { +#ifdef CONFIG_X86_32 + microcode_phys(mc_saved_tmp, mc_saved_data); + return load_microcode_early(mc_saved_tmp, count, uci); +#else + return load_microcode_early(mc_saved_data->mc_saved, + count, uci); +#endif + } +} + +/* + * Given CPU signature and a microcode patch, this function finds if the + * microcode patch has matching family and model with the CPU. 
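/*
 * A self-contained restatement of the selection loop in
 * load_microcode_early() above: scan the saved patches and keep the one
 * with the highest revision that matches the CPU and is newer than the
 * revision already in place.  The struct and the equality-based match are
 * simplified placeholders; the real code defers the comparison to
 * has_newer_microcode() and operates on struct microcode_intel.
 */
#include <stdio.h>

struct demo_patch {
	unsigned int sig;	/* CPUID(1).EAX-style signature */
	unsigned int rev;	/* patch revision */
};

static const struct demo_patch *
demo_pick_newest(const struct demo_patch *saved, int num_saved,
		 unsigned int cpu_sig, unsigned int cur_rev)
{
	const struct demo_patch *best = NULL;
	unsigned int new_rev = cur_rev;
	int i;

	for (i = 0; i < num_saved; i++) {
		if (saved[i].sig != cpu_sig)	/* different CPU model */
			continue;
		if (saved[i].rev <= new_rev)	/* not newer than what we have */
			continue;
		new_rev = saved[i].rev;
		best = &saved[i];
	}
	return best;	/* NULL corresponds to UCODE_NFOUND */
}

int main(void)
{
	const struct demo_patch saved[] = {
		{ 0x000306c3, 0x12 },
		{ 0x000306c3, 0x27 },	/* newest match for this CPU */
		{ 0x000406e3, 0xc2 },	/* other model, ignored      */
	};
	const struct demo_patch *p = demo_pick_newest(saved, 3, 0x000306c3, 0x20);

	printf("selected rev: 0x%x\n", p ? p->rev : 0);
	return 0;
}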
+ */ +static enum ucode_state +matching_model_microcode(struct microcode_header_intel *mc_header, + unsigned long sig) +{ + unsigned int fam, model; + unsigned int fam_ucode, model_ucode; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + unsigned long data_size = get_datasize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + fam = __x86_family(sig); + model = x86_model(sig); + + fam_ucode = __x86_family(mc_header->sig); + model_ucode = x86_model(mc_header->sig); + + if (fam == fam_ucode && model == model_ucode) + return UCODE_OK; + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + return UCODE_NFOUND; + + ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + ext_sigcount = ext_header->count; + + for (i = 0; i < ext_sigcount; i++) { + fam_ucode = __x86_family(ext_sig->sig); + model_ucode = x86_model(ext_sig->sig); + + if (fam == fam_ucode && model == model_ucode) + return UCODE_OK; + + ext_sig++; + } + return UCODE_NFOUND; +} + +static int +save_microcode(struct mc_saved_data *mc_saved_data, + struct microcode_intel **mc_saved_src, + unsigned int mc_saved_count) +{ + int i, j; + struct microcode_intel **saved_ptr; + int ret; + + if (!mc_saved_count) + return -EINVAL; + + /* + * Copy new microcode data. + */ + saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); + if (!saved_ptr) + return -ENOMEM; + + for (i = 0; i < mc_saved_count; i++) { + struct microcode_header_intel *mc_hdr; + struct microcode_intel *mc; + unsigned long size; + + if (!mc_saved_src[i]) { + ret = -EINVAL; + goto err; + } + + mc = mc_saved_src[i]; + mc_hdr = &mc->hdr; + size = get_totalsize(mc_hdr); + + saved_ptr[i] = kmalloc(size, GFP_KERNEL); + if (!saved_ptr[i]) { + ret = -ENOMEM; + goto err; + } + + memcpy(saved_ptr[i], mc, size); + } + + /* + * Point to newly saved microcode. + */ + mc_saved_data->mc_saved = saved_ptr; + mc_saved_data->mc_saved_count = mc_saved_count; + + return 0; + +err: + for (j = 0; j <= i; j++) + kfree(saved_ptr[j]); + kfree(saved_ptr); + + return ret; +} + +/* + * A microcode patch in ucode_ptr is saved into mc_saved + * - if it has matching signature and newer revision compared to an existing + * patch mc_saved. + * - or if it is a newly discovered microcode patch. + * + * The microcode patch should have matching model with CPU. + * + * Returns: The updated number @num_saved of saved microcode patches. + */ +static unsigned int _save_mc(struct microcode_intel **mc_saved, + u8 *ucode_ptr, unsigned int num_saved) +{ + struct microcode_header_intel *mc_hdr, *mc_saved_hdr; + unsigned int sig, pf; + int found = 0, i; + + mc_hdr = (struct microcode_header_intel *)ucode_ptr; + + for (i = 0; i < num_saved; i++) { + mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; + sig = mc_saved_hdr->sig; + pf = mc_saved_hdr->pf; + + if (!find_matching_signature(ucode_ptr, sig, pf)) + continue; + + found = 1; + + if (mc_hdr->rev <= mc_saved_hdr->rev) + continue; + + /* + * Found an older ucode saved earlier. Replace it with + * this newer one. + */ + mc_saved[i] = (struct microcode_intel *)ucode_ptr; + break; + } + + /* Newly detected microcode, save it to memory. */ + if (i >= num_saved && !found) + mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; + + return num_saved; +} + +/* + * Get microcode matching with BSP's model. Only CPUs with the same model as + * BSP can stay in the platform. 
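/*
 * matching_model_microcode() above compares the family/model encoded in the
 * CPUID(1).EAX signature of the CPU with that of the patch (and of any
 * extended signatures).  The helper below is an illustrative, user-space
 * restatement of the usual x86 decoding -- extended family bits only apply
 * to family 0xf, extended model bits to family 6 and above -- and is not a
 * copy of the kernel's __x86_family()/x86_model() helpers.
 */
#include <stdio.h>

static unsigned int demo_x86_family(unsigned int sig)
{
	unsigned int fam = (sig >> 8) & 0xf;

	if (fam == 0xf)
		fam += (sig >> 20) & 0xff;	/* add the extended family */
	return fam;
}

static unsigned int demo_x86_model(unsigned int sig)
{
	unsigned int model = (sig >> 4) & 0xf;

	if (demo_x86_family(sig) >= 6)
		model += ((sig >> 16) & 0xf) << 4;	/* add the extended model */
	return model;
}

int main(void)
{
	unsigned int sig = 0x000306c3;	/* arbitrary example signature */

	printf("family 0x%x, model 0x%x, stepping 0x%x\n",
	       demo_x86_family(sig), demo_x86_model(sig), sig & 0xf);
	return 0;
}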
+ */ +static enum ucode_state __init +get_matching_model_microcode(int cpu, unsigned long start, + void *data, size_t size, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + u8 *ucode_ptr = data; + unsigned int leftover = size; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + int i; + + while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + + if (leftover < sizeof(mc_header)) + break; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + + mc_size = get_totalsize(mc_header); + if (!mc_size || mc_size > leftover || + microcode_sanity_check(ucode_ptr, 0) < 0) + break; + + leftover -= mc_size; + + /* + * Since APs with same family and model as the BSP may boot in + * the platform, we need to find and save microcode patches + * with the same family and model as the BSP. + */ + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != + UCODE_OK) { + ucode_ptr += mc_size; + continue; + } + + mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); + + ucode_ptr += mc_size; + } + + if (leftover) { + state = UCODE_ERROR; + goto out; + } + + if (mc_saved_count == 0) { + state = UCODE_NFOUND; + goto out; + } + + for (i = 0; i < mc_saved_count; i++) + mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + + mc_saved_data->mc_saved_count = mc_saved_count; +out: + return state; +} + +static int collect_cpu_info_early(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig; + unsigned int eax, ebx, ecx, edx; + + csig.sig = 0; + csig.pf = 0; + csig.rev = 0; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = __x86_family(csig.sig); + model = x86_model(csig.sig); + + if ((model >= 5) || (family > 6)) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + csig.rev = val[1]; + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} + +static void show_saved_mc(void) +{ +#ifdef DEBUG + int i, j; + unsigned int sig, pf, rev, total_size, data_size, date; + struct ucode_cpu_info uci; + + if (mc_saved_data.mc_saved_count == 0) { + pr_debug("no microcode data saved.\n"); + return; + } + pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + + collect_cpu_info_early(&uci); + + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; + pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); + + for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + struct microcode_header_intel *mc_saved_header; + struct extended_sigtable *ext_header; + int ext_sigcount; + struct extended_signature *ext_sig; + + mc_saved_header = (struct microcode_header_intel *) + mc_saved_data.mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); + date = mc_saved_header->date; + + pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = 
%04x-%02x-%02x\n", + i, sig, pf, rev, total_size, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + continue; + + ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (j = 0; j < ext_sigcount; j++) { + sig = ext_sig->sig; + pf = ext_sig->pf; + + pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", + j, sig, pf); + + ext_sig++; + } + + } +#endif +} + +#ifdef CONFIG_HOTPLUG_CPU +static DEFINE_MUTEX(x86_cpu_microcode_mutex); +/* + * Save this mc into mc_saved_data. So it will be loaded early when a CPU is + * hot added or resumes. + * + * Please make sure this mc should be a valid microcode patch before calling + * this function. + */ +int save_mc_for_early(u8 *mc) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count_init; + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; + int ret = 0; + int i; + + /* + * Hold hotplug lock so mc_saved_data is not accessed by a CPU in + * hotplug. + */ + mutex_lock(&x86_cpu_microcode_mutex); + + mc_saved_count_init = mc_saved_data.mc_saved_count; + mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved = mc_saved_data.mc_saved; + + if (mc_saved && mc_saved_count) + memcpy(mc_saved_tmp, mc_saved, + mc_saved_count * sizeof(struct microcode_intel *)); + /* + * Save the microcode patch mc in mc_save_tmp structure if it's a newer + * version. + */ + mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); + + /* + * Save the mc_save_tmp in global mc_saved_data. + */ + ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + if (ret) { + pr_err("Cannot save microcode patch.\n"); + goto out; + } + + show_saved_mc(); + + /* + * Free old saved microcode data. + */ + if (mc_saved) { + for (i = 0; i < mc_saved_count_init; i++) + kfree(mc_saved[i]); + kfree(mc_saved); + } + +out: + mutex_unlock(&x86_cpu_microcode_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(save_mc_for_early); +#endif + +static bool __init load_builtin_intel_microcode(struct cpio_data *cp) +{ +#ifdef CONFIG_X86_64 + unsigned int eax = 0x00000001, ebx, ecx = 0, edx; + unsigned int family, model, stepping; + char name[30]; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + family = __x86_family(eax); + model = x86_model(eax); + stepping = eax & 0xf; + + sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping); + + return get_builtin_firmware(cp, name); +#else + return false; +#endif +} + +static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; +static __init enum ucode_state +scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, + unsigned long start, unsigned long size, + struct ucode_cpu_info *uci) +{ + struct cpio_data cd; + long offset = 0; +#ifdef CONFIG_X86_32 + char *p = (char *)__pa_nodebug(ucode_name); +#else + char *p = ucode_name; +#endif + + cd.data = NULL; + cd.size = 0; + + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) { + if (!load_builtin_intel_microcode(&cd)) + return UCODE_ERROR; + } + + return get_matching_model_microcode(0, start, cd.data, cd.size, + mc_saved_data, initrd, uci); +} + +/* + * Print ucode update info. 
+ */ +static void +print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +{ + int cpu = smp_processor_id(); + + pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + cpu, + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); +} + +#ifdef CONFIG_X86_32 + +static int delay_ucode_info; +static int current_mc_date; + +/* + * Print early updated ucode info after printk works. This is delayed info dump. + */ +void show_ucode_info_early(void) +{ + struct ucode_cpu_info uci; + + if (delay_ucode_info) { + collect_cpu_info_early(&uci); + print_ucode_info(&uci, current_mc_date); + delay_ucode_info = 0; + } +} + +/* + * At this point, we can not call printk() yet. Keep microcode patch number in + * mc_saved_data.mc_saved and delay printing microcode info in + * show_ucode_info_early() until printk() works. + */ +static void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + int *delay_ucode_info_p; + int *current_mc_date_p; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); + current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); + + *delay_ucode_info_p = 1; + *current_mc_date_p = mc_intel->hdr.date; +} +#else + +/* + * Flush global tlb. We only do this in x86_64 where paging has been enabled + * already and PGE should be enabled as well. + */ +static inline void flush_tlb_early(void) +{ + __native_flush_tlb_global_irq_disabled(); +} + +static inline void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + print_ucode_info(uci, mc_intel->hdr.date); +} +#endif + +static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) +{ + struct microcode_intel *mc_intel; + unsigned int val[2]; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return 0; + + /* write microcode via MSR 0x79 */ + native_wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + if (val[1] != mc_intel->hdr.rev) + return -1; + +#ifdef CONFIG_X86_64 + /* Flush global tlb. This is precaution. */ + flush_tlb_early(); +#endif + uci->cpu_sig.rev = val[1]; + + if (early) + print_ucode(uci); + else + print_ucode_info(uci, mc_intel->hdr.date); + + return 0; +} + +/* + * This function converts microcode patch offsets previously stored in + * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. 
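/*
 * print_ucode_info() above prints the patch date with "%04x-%02x-%02x",
 * i.e. it treats the header's date field as BCD-coded MMDDYYYY.  A tiny
 * stand-alone illustration with a made-up date value:
 */
#include <stdio.h>

int main(void)
{
	unsigned int date = 0x06122014;	/* month 06, day 12, year 2014 */

	printf("%04x-%02x-%02x\n",
	       date & 0xffff,		/* year  */
	       date >> 24,		/* month */
	       (date >> 16) & 0xff);	/* day   */
	return 0;
}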
+ */ +int __init save_microcode_in_initrd_intel(void) +{ + unsigned int count = mc_saved_data.mc_saved_count; + struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; + int ret = 0; + + if (count == 0) + return ret; + + copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); + ret = save_microcode(&mc_saved_data, mc_saved, count); + if (ret) + pr_err("Cannot save microcode patches from initrd.\n"); + + show_saved_mc(); + + return ret; +} + +static void __init +_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, + unsigned long *initrd, + unsigned long start, unsigned long size) +{ + struct ucode_cpu_info uci; + enum ucode_state ret; + + collect_cpu_info_early(&uci); + + ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); + if (ret != UCODE_OK) + return; + + ret = load_microcode(mc_saved_data, initrd, start, &uci); + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, true); +} + +void __init load_ucode_intel_bsp(void) +{ + u64 start, size; +#ifdef CONFIG_X86_32 + struct boot_params *p; + + p = (struct boot_params *)__pa_nodebug(&boot_params); + start = p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; + + _load_ucode_intel_bsp( + (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + start, size); +#else + start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; + size = boot_params.hdr.ramdisk_size; + + _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); +#endif +} + +void load_ucode_intel_ap(void) +{ + struct mc_saved_data *mc_saved_data_p; + struct ucode_cpu_info uci; + unsigned long *mc_saved_in_initrd_p; + unsigned long initrd_start_addr; + enum ucode_state ret; +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p; + + mc_saved_in_initrd_p = + (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); +#else + mc_saved_data_p = &mc_saved_data; + mc_saved_in_initrd_p = mc_saved_in_initrd; + initrd_start_addr = initrd_start; +#endif + + /* + * If there is no valid ucode previously saved in memory, no need to + * update ucode on this AP. 
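/*
 * Note how the early-boot paths above avoid storing raw pointers into the
 * ramdisk image: get_matching_model_microcode() records offsets relative to
 * the ramdisk start, and copy_initrd_ptrs() re-adds whatever base address is
 * valid when the patches are finally used.  A minimal sketch of that
 * offset/rebase idea, with a heap buffer standing in for the initrd:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *initrd = calloc(1, 64);	/* original image   */
	char *moved  = calloc(1, 64);	/* "relocated" copy */
	unsigned long off;
	char *patch;

	if (!initrd || !moved)
		return 1;

	strcpy(initrd + 40, "patch");
	/* what gets saved is an offset, not an address */
	off = (unsigned long)(initrd + 40) - (unsigned long)initrd;

	memcpy(moved, initrd, 64);	/* the image moves */
	free(initrd);

	patch = moved + off;		/* rebase onto the new location */
	printf("%s found again at offset %lu\n", patch, off);

	free(moved);
	return 0;
}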
+ */ + if (mc_saved_data_p->mc_saved_count == 0) + return; + + collect_cpu_info_early(&uci); + ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + initrd_start_addr, &uci); + + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, true); +} + +void reload_ucode_intel(void) +{ + struct ucode_cpu_info uci; + enum ucode_state ret; + + if (!mc_saved_data.mc_saved_count) + return; + + collect_cpu_info_early(&uci); + + ret = load_microcode_early(mc_saved_data.mc_saved, + mc_saved_data.mc_saved_count, &uci); + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, false); +} static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { @@ -124,7 +840,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) cpf = cpu_sig.pf; crev = cpu_sig.rev; - return get_matching_microcode(csig, cpf, crev, mc_intel); + return has_newer_microcode(mc_intel, csig, cpf, crev); } static int apply_microcode_intel(int cpu) @@ -226,7 +942,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, csig = uci->cpu_sig.sig; cpf = uci->cpu_sig.pf; - if (get_matching_microcode(csig, cpf, new_rev, mc)) { + if (has_newer_microcode(mc, csig, cpf, new_rev)) { vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; @@ -325,7 +1041,7 @@ static struct microcode_ops microcode_intel_ops = { struct microcode_ops * __init init_intel_microcode(void) { - struct cpuinfo_x86 *c = &cpu_data(0); + struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || cpu_has(c, X86_FEATURE_IA64)) { diff --git a/kernel/arch/x86/kernel/cpu/microcode/intel_early.c b/kernel/arch/x86/kernel/cpu/microcode/intel_early.c deleted file mode 100644 index 2f49ab4ac..000000000 --- a/kernel/arch/x86/kernel/cpu/microcode/intel_early.c +++ /dev/null @@ -1,786 +0,0 @@ -/* - * Intel CPU microcode early update for Linux - * - * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> - * H Peter Anvin" <hpa@zytor.com> - * - * This allows to early upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture - * Software Developer's Manual. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -/* - * This needs to be before all headers so that pr_debug in printk.h doesn't turn - * printk calls into no_printk(). 
- * - *#define DEBUG - */ - -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/earlycpio.h> -#include <linux/initrd.h> -#include <linux/cpu.h> -#include <asm/msr.h> -#include <asm/microcode_intel.h> -#include <asm/processor.h> -#include <asm/tlbflush.h> -#include <asm/setup.h> - -#undef pr_fmt -#define pr_fmt(fmt) "microcode: " fmt - -static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; -static struct mc_saved_data { - unsigned int mc_saved_count; - struct microcode_intel **mc_saved; -} mc_saved_data; - -static enum ucode_state -load_microcode_early(struct microcode_intel **saved, - unsigned int num_saved, struct ucode_cpu_info *uci) -{ - struct microcode_intel *ucode_ptr, *new_mc = NULL; - struct microcode_header_intel *mc_hdr; - int new_rev, ret, i; - - new_rev = uci->cpu_sig.rev; - - for (i = 0; i < num_saved; i++) { - ucode_ptr = saved[i]; - mc_hdr = (struct microcode_header_intel *)ucode_ptr; - - ret = get_matching_microcode(uci->cpu_sig.sig, - uci->cpu_sig.pf, - new_rev, - ucode_ptr); - if (!ret) - continue; - - new_rev = mc_hdr->rev; - new_mc = ucode_ptr; - } - - if (!new_mc) - return UCODE_NFOUND; - - uci->mc = (struct microcode_intel *)new_mc; - return UCODE_OK; -} - -static inline void -copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, - unsigned long off, int num_saved) -{ - int i; - - for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); -} - -#ifdef CONFIG_X86_32 -static void -microcode_phys(struct microcode_intel **mc_saved_tmp, - struct mc_saved_data *mc_saved_data) -{ - int i; - struct microcode_intel ***mc_saved; - - mc_saved = (struct microcode_intel ***) - __pa_nodebug(&mc_saved_data->mc_saved); - for (i = 0; i < mc_saved_data->mc_saved_count; i++) { - struct microcode_intel *p; - - p = *(struct microcode_intel **) - __pa_nodebug(mc_saved_data->mc_saved + i); - mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); - } -} -#endif - -static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, - unsigned long initrd_start, struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data->mc_saved_count; - - if (!mc_saved_data->mc_saved) { - copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); - - return load_microcode_early(mc_saved_tmp, count, uci); - } else { -#ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mc_saved_data); - return load_microcode_early(mc_saved_tmp, count, uci); -#else - return load_microcode_early(mc_saved_data->mc_saved, - count, uci); -#endif - } -} - -/* - * Given CPU signature and a microcode patch, this function finds if the - * microcode patch has matching family and model with the CPU. - */ -static enum ucode_state -matching_model_microcode(struct microcode_header_intel *mc_header, - unsigned long sig) -{ - unsigned int fam, model; - unsigned int fam_ucode, model_ucode; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - unsigned long data_size = get_datasize(mc_header); - int ext_sigcount, i; - struct extended_signature *ext_sig; - - fam = __x86_family(sig); - model = x86_model(sig); - - fam_ucode = __x86_family(mc_header->sig); - model_ucode = x86_model(mc_header->sig); - - if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; - - /* Look for ext. 
headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - return UCODE_NFOUND; - - ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - ext_sigcount = ext_header->count; - - for (i = 0; i < ext_sigcount; i++) { - fam_ucode = __x86_family(ext_sig->sig); - model_ucode = x86_model(ext_sig->sig); - - if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; - - ext_sig++; - } - return UCODE_NFOUND; -} - -static int -save_microcode(struct mc_saved_data *mc_saved_data, - struct microcode_intel **mc_saved_src, - unsigned int mc_saved_count) -{ - int i, j; - struct microcode_intel **saved_ptr; - int ret; - - if (!mc_saved_count) - return -EINVAL; - - /* - * Copy new microcode data. - */ - saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); - if (!saved_ptr) - return -ENOMEM; - - for (i = 0; i < mc_saved_count; i++) { - struct microcode_header_intel *mc_hdr; - struct microcode_intel *mc; - unsigned long size; - - if (!mc_saved_src[i]) { - ret = -EINVAL; - goto err; - } - - mc = mc_saved_src[i]; - mc_hdr = &mc->hdr; - size = get_totalsize(mc_hdr); - - saved_ptr[i] = kmalloc(size, GFP_KERNEL); - if (!saved_ptr[i]) { - ret = -ENOMEM; - goto err; - } - - memcpy(saved_ptr[i], mc, size); - } - - /* - * Point to newly saved microcode. - */ - mc_saved_data->mc_saved = saved_ptr; - mc_saved_data->mc_saved_count = mc_saved_count; - - return 0; - -err: - for (j = 0; j <= i; j++) - kfree(saved_ptr[j]); - kfree(saved_ptr); - - return ret; -} - -/* - * A microcode patch in ucode_ptr is saved into mc_saved - * - if it has matching signature and newer revision compared to an existing - * patch mc_saved. - * - or if it is a newly discovered microcode patch. - * - * The microcode patch should have matching model with CPU. - * - * Returns: The updated number @num_saved of saved microcode patches. - */ -static unsigned int _save_mc(struct microcode_intel **mc_saved, - u8 *ucode_ptr, unsigned int num_saved) -{ - struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - unsigned int sig, pf, new_rev; - int found = 0, i; - - mc_hdr = (struct microcode_header_intel *)ucode_ptr; - - for (i = 0; i < num_saved; i++) { - mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; - sig = mc_saved_hdr->sig; - pf = mc_saved_hdr->pf; - new_rev = mc_hdr->rev; - - if (!get_matching_sig(sig, pf, new_rev, ucode_ptr)) - continue; - - found = 1; - - if (!revision_is_newer(mc_hdr, new_rev)) - continue; - - /* - * Found an older ucode saved earlier. Replace it with - * this newer one. - */ - mc_saved[i] = (struct microcode_intel *)ucode_ptr; - break; - } - - /* Newly detected microcode, save it to memory. */ - if (i >= num_saved && !found) - mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; - - return num_saved; -} - -/* - * Get microcode matching with BSP's model. Only CPUs with the same model as - * BSP can stay in the platform. 
- */ -static enum ucode_state __init -get_matching_model_microcode(int cpu, unsigned long start, - void *data, size_t size, - struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - struct ucode_cpu_info *uci) -{ - u8 *ucode_ptr = data; - unsigned int leftover = size; - enum ucode_state state = UCODE_OK; - unsigned int mc_size; - struct microcode_header_intel *mc_header; - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mc_saved_data->mc_saved_count; - int i; - - while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { - - if (leftover < sizeof(mc_header)) - break; - - mc_header = (struct microcode_header_intel *)ucode_ptr; - - mc_size = get_totalsize(mc_header); - if (!mc_size || mc_size > leftover || - microcode_sanity_check(ucode_ptr, 0) < 0) - break; - - leftover -= mc_size; - - /* - * Since APs with same family and model as the BSP may boot in - * the platform, we need to find and save microcode patches - * with the same family and model as the BSP. - */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != - UCODE_OK) { - ucode_ptr += mc_size; - continue; - } - - mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); - - ucode_ptr += mc_size; - } - - if (leftover) { - state = UCODE_ERROR; - goto out; - } - - if (mc_saved_count == 0) { - state = UCODE_NFOUND; - goto out; - } - - for (i = 0; i < mc_saved_count; i++) - mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; - - mc_saved_data->mc_saved_count = mc_saved_count; -out: - return state; -} - -static int collect_cpu_info_early(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig; - unsigned int eax, ebx, ecx, edx; - - csig.sig = 0; - csig.pf = 0; - csig.rev = 0; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = __x86_family(csig.sig); - model = x86_model(csig.sig); - - if ((model >= 5) || (family > 6)) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - csig.rev = val[1]; - - uci->cpu_sig = csig; - uci->valid = 1; - - return 0; -} - -#ifdef DEBUG -static void __ref show_saved_mc(void) -{ - int i, j; - unsigned int sig, pf, rev, total_size, data_size, date; - struct ucode_cpu_info uci; - - if (mc_saved_data.mc_saved_count == 0) { - pr_debug("no microcode data saved.\n"); - return; - } - pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); - - collect_cpu_info_early(&uci); - - sig = uci.cpu_sig.sig; - pf = uci.cpu_sig.pf; - rev = uci.cpu_sig.rev; - pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - - for (i = 0; i < mc_saved_data.mc_saved_count; i++) { - struct microcode_header_intel *mc_saved_header; - struct extended_sigtable *ext_header; - int ext_sigcount; - struct extended_signature *ext_sig; - - mc_saved_header = (struct microcode_header_intel *) - mc_saved_data.mc_saved[i]; - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - rev = mc_saved_header->rev; - total_size = get_totalsize(mc_saved_header); - data_size = get_datasize(mc_saved_header); - date = mc_saved_header->date; - - pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, 
date = %04x-%02x-%02x\n", - i, sig, pf, rev, total_size, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); - - /* Look for ext. headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - continue; - - ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - - for (j = 0; j < ext_sigcount; j++) { - sig = ext_sig->sig; - pf = ext_sig->pf; - - pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", - j, sig, pf); - - ext_sig++; - } - - } -} -#else -static inline void show_saved_mc(void) -{ -} -#endif - -#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) -static DEFINE_MUTEX(x86_cpu_microcode_mutex); -/* - * Save this mc into mc_saved_data. So it will be loaded early when a CPU is - * hot added or resumes. - * - * Please make sure this mc should be a valid microcode patch before calling - * this function. - */ -int save_mc_for_early(u8 *mc) -{ - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count_init; - unsigned int mc_saved_count; - struct microcode_intel **mc_saved; - int ret = 0; - int i; - - /* - * Hold hotplug lock so mc_saved_data is not accessed by a CPU in - * hotplug. - */ - mutex_lock(&x86_cpu_microcode_mutex); - - mc_saved_count_init = mc_saved_data.mc_saved_count; - mc_saved_count = mc_saved_data.mc_saved_count; - mc_saved = mc_saved_data.mc_saved; - - if (mc_saved && mc_saved_count) - memcpy(mc_saved_tmp, mc_saved, - mc_saved_count * sizeof(struct microcode_intel *)); - /* - * Save the microcode patch mc in mc_save_tmp structure if it's a newer - * version. - */ - mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); - - /* - * Save the mc_save_tmp in global mc_saved_data. - */ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); - if (ret) { - pr_err("Cannot save microcode patch.\n"); - goto out; - } - - show_saved_mc(); - - /* - * Free old saved microcode data. - */ - if (mc_saved) { - for (i = 0; i < mc_saved_count_init; i++) - kfree(mc_saved[i]); - kfree(mc_saved); - } - -out: - mutex_unlock(&x86_cpu_microcode_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(save_mc_for_early); -#endif - -static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; -static __init enum ucode_state -scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, - unsigned long start, unsigned long size, - struct ucode_cpu_info *uci) -{ - struct cpio_data cd; - long offset = 0; -#ifdef CONFIG_X86_32 - char *p = (char *)__pa_nodebug(ucode_name); -#else - char *p = ucode_name; -#endif - - cd.data = NULL; - cd.size = 0; - - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) - return UCODE_ERROR; - - return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, initrd, uci); -} - -/* - * Print ucode update info. - */ -static void -print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) -{ - int cpu = smp_processor_id(); - - pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, - uci->cpu_sig.rev, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); -} - -#ifdef CONFIG_X86_32 - -static int delay_ucode_info; -static int current_mc_date; - -/* - * Print early updated ucode info after printk works. This is delayed info dump. 
- */ -void show_ucode_info_early(void) -{ - struct ucode_cpu_info uci; - - if (delay_ucode_info) { - collect_cpu_info_early(&uci); - print_ucode_info(&uci, current_mc_date); - delay_ucode_info = 0; - } -} - -/* - * At this point, we can not call printk() yet. Keep microcode patch number in - * mc_saved_data.mc_saved and delay printing microcode info in - * show_ucode_info_early() until printk() works. - */ -static void print_ucode(struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_intel; - int *delay_ucode_info_p; - int *current_mc_date_p; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return; - - delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); - current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); - - *delay_ucode_info_p = 1; - *current_mc_date_p = mc_intel->hdr.date; -} -#else - -/* - * Flush global tlb. We only do this in x86_64 where paging has been enabled - * already and PGE should be enabled as well. - */ -static inline void flush_tlb_early(void) -{ - __native_flush_tlb_global_irq_disabled(); -} - -static inline void print_ucode(struct ucode_cpu_info *uci) -{ - struct microcode_intel *mc_intel; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return; - - print_ucode_info(uci, mc_intel->hdr.date); -} -#endif - -static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) -{ - struct microcode_intel *mc_intel; - unsigned int val[2]; - - mc_intel = uci->mc; - if (mc_intel == NULL) - return 0; - - /* write microcode via MSR 0x79 */ - native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) - return -1; - -#ifdef CONFIG_X86_64 - /* Flush global tlb. This is precaution. */ - flush_tlb_early(); -#endif - uci->cpu_sig.rev = val[1]; - - if (early) - print_ucode(uci); - else - print_ucode_info(uci, mc_intel->hdr.date); - - return 0; -} - -/* - * This function converts microcode patch offsets previously stored in - * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. 
- */ -int __init save_microcode_in_initrd_intel(void) -{ - unsigned int count = mc_saved_data.mc_saved_count; - struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; - int ret = 0; - - if (count == 0) - return ret; - - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); - ret = save_microcode(&mc_saved_data, mc_saved, count); - if (ret) - pr_err("Cannot save microcode patches from initrd.\n"); - - show_saved_mc(); - - return ret; -} - -static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *initrd, - unsigned long start, unsigned long size) -{ - struct ucode_cpu_info uci; - enum ucode_state ret; - - collect_cpu_info_early(&uci); - - ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); - if (ret != UCODE_OK) - return; - - ret = load_microcode(mc_saved_data, initrd, start, &uci); - if (ret != UCODE_OK) - return; - - apply_microcode_early(&uci, true); -} - -void __init load_ucode_intel_bsp(void) -{ - u64 start, size; -#ifdef CONFIG_X86_32 - struct boot_params *p; - - p = (struct boot_params *)__pa_nodebug(&boot_params); - start = p->hdr.ramdisk_image; - size = p->hdr.ramdisk_size; - - _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - start, size); -#else - start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; - size = boot_params.hdr.ramdisk_size; - - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); -#endif -} - -void load_ucode_intel_ap(void) -{ - struct mc_saved_data *mc_saved_data_p; - struct ucode_cpu_info uci; - unsigned long *mc_saved_in_initrd_p; - unsigned long initrd_start_addr; - enum ucode_state ret; -#ifdef CONFIG_X86_32 - unsigned long *initrd_start_p; - - mc_saved_in_initrd_p = - (unsigned long *)__pa_nodebug(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); -#else - mc_saved_data_p = &mc_saved_data; - mc_saved_in_initrd_p = mc_saved_in_initrd; - initrd_start_addr = initrd_start; -#endif - - /* - * If there is no valid ucode previously saved in memory, no need to - * update ucode on this AP. 
- */ - if (mc_saved_data_p->mc_saved_count == 0) - return; - - collect_cpu_info_early(&uci); - ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); - - if (ret != UCODE_OK) - return; - - apply_microcode_early(&uci, true); -} - -void reload_ucode_intel(void) -{ - struct ucode_cpu_info uci; - enum ucode_state ret; - - if (!mc_saved_data.mc_saved_count) - return; - - collect_cpu_info_early(&uci); - - ret = load_microcode_early(mc_saved_data.mc_saved, - mc_saved_data.mc_saved_count, &uci); - if (ret != UCODE_OK) - return; - - apply_microcode_early(&uci, false); -} diff --git a/kernel/arch/x86/kernel/cpu/microcode/intel_lib.c b/kernel/arch/x86/kernel/cpu/microcode/intel_lib.c index cd47a510a..b96896bcb 100644 --- a/kernel/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/kernel/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -25,17 +25,23 @@ #include <linux/firmware.h> #include <linux/uaccess.h> #include <linux/kernel.h> -#include <linux/module.h> #include <asm/microcode_intel.h> #include <asm/processor.h> #include <asm/msr.h> -static inline int -update_match_cpu(unsigned int csig, unsigned int cpf, - unsigned int sig, unsigned int pf) +static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) { - return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; + if (s1 != s2) + return false; + + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; + + /* ... or they intersect. */ + return p1 & p2; } int microcode_sanity_check(void *mc, int print_err) @@ -124,27 +130,25 @@ EXPORT_SYMBOL_GPL(microcode_sanity_check); /* * Returns 1 if update has been found, 0 otherwise. */ -int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc) +int find_matching_signature(void *mc, unsigned int csig, int cpf) { - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - int ext_sigcount, i; + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; struct extended_signature *ext_sig; + int i; - if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf)) + if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) return 1; /* Look for ext. headers: */ - if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) return 0; - ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - for (i = 0; i < ext_sigcount; i++) { - if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf)) + for (i = 0; i < ext_hdr->count; i++) { + if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) return 1; ext_sig++; } @@ -154,13 +158,13 @@ int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc) /* * Returns 1 if update has been found, 0 otherwise. 
*/ -int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc) +int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) { struct microcode_header_intel *mc_hdr = mc; - if (!revision_is_newer(mc_hdr, rev)) + if (mc_hdr->rev <= new_rev) return 0; - return get_matching_sig(csig, cpf, rev, mc); + return find_matching_signature(mc, csig, cpf); } -EXPORT_SYMBOL_GPL(get_matching_microcode); +EXPORT_SYMBOL_GPL(has_newer_microcode); diff --git a/kernel/arch/x86/kernel/cpu/mshyperv.c b/kernel/arch/x86/kernel/cpu/mshyperv.c index 939155ffd..20e242ea1 100644 --- a/kernel/arch/x86/kernel/cpu/mshyperv.c +++ b/kernel/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include <linux/efi.h> #include <linux/interrupt.h> #include <linux/irq.h> +#include <linux/kexec.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv.h> @@ -28,25 +29,26 @@ #include <asm/i8259.h> #include <asm/apic.h> #include <asm/timer.h> +#include <asm/reboot.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); #if IS_ENABLED(CONFIG_HYPERV) static void (*vmbus_handler)(void); +static void (*hv_kexec_handler)(void); +static void (*hv_crash_handler)(struct pt_regs *regs); void hyperv_vector_handler(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - irq_enter(); - exit_idle(); - + entering_irq(); inc_irq_stat(irq_hv_callback_count); if (vmbus_handler) vmbus_handler(); - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); } @@ -69,7 +71,47 @@ void hv_remove_vmbus_irq(void) } EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); -#endif + +void hv_setup_kexec_handler(void (*handler)(void)) +{ + hv_kexec_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_kexec_handler); + +void hv_remove_kexec_handler(void) +{ + hv_kexec_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_kexec_handler); + +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) +{ + hv_crash_handler = handler; +} +EXPORT_SYMBOL_GPL(hv_setup_crash_handler); + +void hv_remove_crash_handler(void) +{ + hv_crash_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_remove_crash_handler); + +#ifdef CONFIG_KEXEC_CORE +static void hv_machine_shutdown(void) +{ + if (kexec_in_progress && hv_kexec_handler) + hv_kexec_handler(); + native_machine_shutdown(); +} + +static void hv_machine_crash_shutdown(struct pt_regs *regs) +{ + if (hv_crash_handler) + hv_crash_handler(regs); + native_machine_crash_shutdown(regs); +} +#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) { @@ -116,6 +158,7 @@ static void __init ms_hyperv_init_platform(void) * Extract the features and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", @@ -143,6 +186,11 @@ static void __init ms_hyperv_init_platform(void) no_timer_check = 1; #endif +#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) + machine_ops.shutdown = hv_machine_shutdown; + machine_ops.crash_shutdown = hv_machine_crash_shutdown; +#endif + mark_tsc_unstable("running on Hyper-V"); } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { diff --git a/kernel/arch/x86/kernel/cpu/mtrr/cleanup.c b/kernel/arch/x86/kernel/cpu/mtrr/cleanup.c index 5f90b85ff..70d7c93f4 100644 --- a/kernel/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/kernel/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -98,7 +98,8 @@ 
x86_get_mtrr_mem_range(struct range *range, int nr_range, continue; base = range_state[i].base_pfn; if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && - (mtrr_state.enabled & 1)) { + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { /* Var MTRR contains UC entry below 1M? Skip it: */ printk(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) diff --git a/kernel/arch/x86/kernel/cpu/mtrr/generic.c b/kernel/arch/x86/kernel/cpu/mtrr/generic.c index 7d74f7b3c..3b533cf37 100644 --- a/kernel/arch/x86/kernel/cpu/mtrr/generic.c +++ b/kernel/arch/x86/kernel/cpu/mtrr/generic.c @@ -102,59 +102,76 @@ static int check_type_overlap(u8 *prev, u8 *curr) return 0; } -/* - * Error/Semi-error returns: - * 0xFF - when MTRR is not enabled - * *repeat == 1 implies [start:end] spanned across MTRR range and type returned - * corresponds only to [start:*partial_end]. - * Caller has to lookup again for [*partial_end:end]. +/** + * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries + * + * Return the MTRR fixed memory type of 'start'. + * + * MTRR fixed entries are divided into the following ways: + * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges + * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges + * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges + * + * Return Values: + * MTRR_TYPE_(type) - Matched memory type + * MTRR_TYPE_INVALID - Unmatched + */ +static u8 mtrr_type_lookup_fixed(u64 start, u64 end) +{ + int idx; + + if (start >= 0x100000) + return MTRR_TYPE_INVALID; + + /* 0x0 - 0x7FFFF */ + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state.fixed_ranges[idx]; + /* 0x80000 - 0xBFFFF */ + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state.fixed_ranges[idx]; + } + + /* 0xC0000 - 0xFFFFF */ + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state.fixed_ranges[idx]; +} + +/** + * mtrr_type_lookup_variable - look up memory type in MTRR variable entries + * + * Return Value: + * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched) + * + * Output Arguments: + * repeat - Set to 1 when [start:end] spanned across MTRR range and type + * returned corresponds only to [start:*partial_end]. Caller has + * to lookup again for [*partial_end:end]. + * + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. */ -static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) +static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, + int *repeat, u8 *uniform) { int i; u64 base, mask; u8 prev_match, curr_match; *repeat = 0; - if (!mtrr_state_set) - return 0xFF; - - if (!mtrr_state.enabled) - return 0xFF; + *uniform = 1; - /* Make end inclusive end, instead of exclusive */ + /* Make end inclusive instead of exclusive */ end--; - /* Look in fixed ranges. 
Just return the type as per start */ - if (mtrr_state.have_fixed && (start < 0x100000)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state.fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state.fixed_ranges[idx]; - } else if (start < 0x1000000) { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state.fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ - if (!(mtrr_state.enabled & 2)) - return mtrr_state.def_type; - - prev_match = 0xFF; + prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; + unsigned short start_state, end_state, inclusive; if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) continue; @@ -166,20 +183,29 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) start_state = ((start & mask) == (base & mask)); end_state = ((end & mask) == (base & mask)); + inclusive = ((start < base) && (end > base)); - if (start_state != end_state) { + if ((start_state != end_state) || inclusive) { /* * We have start:end spanning across an MTRR. - * We split the region into - * either - * (start:mtrr_end) (mtrr_end:end) - * or - * (start:mtrr_start) (mtrr_start:end) + * We split the region into either + * + * - start_state:1 + * (start:mtrr_end)(mtrr_end:end) + * - end_state:1 + * (start:mtrr_start)(mtrr_start:end) + * - inclusive:1 + * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end) + * * depending on kind of overlap. - * Return the type for first region and a pointer to - * the start of second region so that caller will - * lookup again on the second region. - * Note: This way we handle multiple overlaps as well. + * + * Return the type of the first region and a pointer + * to the start of next region so that caller will be + * advised to lookup again after having adjusted start + * and end. + * + * Note: This way we handle overlaps with multiple + * entries and the default type properly. */ if (start_state) *partial_end = base + get_mtrr_size(mask); @@ -193,59 +219,94 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) end = *partial_end - 1; /* end is inclusive */ *repeat = 1; + *uniform = 0; } if ((start & mask) != (base & mask)) continue; curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { + if (prev_match == MTRR_TYPE_INVALID) { prev_match = curr_match; continue; } + *uniform = 0; if (check_type_overlap(&prev_match, &curr_match)) return curr_match; } - if (mtrr_tom2) { - if (start >= (1ULL<<32) && (end < mtrr_tom2)) - return MTRR_TYPE_WRBACK; - } - - if (prev_match != 0xFF) + if (prev_match != MTRR_TYPE_INVALID) return prev_match; return mtrr_state.def_type; } -/* - * Returns the effective MTRR type for the region - * Error return: - * 0xFF - when MTRR is not enabled +/** + * mtrr_type_lookup - look up memory type in MTRR + * + * Return Values: + * MTRR_TYPE_(type) - The effective MTRR type for the region + * MTRR_TYPE_INVALID - MTRR is disabled + * + * Output Argument: + * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the + * region is fully covered by a single MTRR entry or the default + * type. 
*/ -u8 mtrr_type_lookup(u64 start, u64 end) +u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { - u8 type, prev_type; + u8 type, prev_type, is_uniform = 1, dummy; int repeat; u64 partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + if (!mtrr_state_set) + return MTRR_TYPE_INVALID; + + if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)) + return MTRR_TYPE_INVALID; + + /* + * Look up the fixed ranges first, which take priority over + * the variable ranges. + */ + if ((start < 0x100000) && + (mtrr_state.have_fixed) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { + is_uniform = 0; + type = mtrr_type_lookup_fixed(start, end); + goto out; + } + + /* + * Look up the variable ranges. Look of multiple ranges matching + * this address and pick type as per MTRR precedence. + */ + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &is_uniform); /* * Common path is with repeat = 0. * However, we can have cases where [start:end] spans across some - * MTRR range. Do repeated lookups for that case here. + * MTRR ranges and/or the default type. Do repeated lookups for + * that case here. */ while (repeat) { prev_type = type; start = partial_end; - type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + is_uniform = 0; + type = mtrr_type_lookup_variable(start, end, &partial_end, + &repeat, &dummy); if (check_type_overlap(&prev_type, &type)) - return type; + goto out; } + if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2)) + type = MTRR_TYPE_WRBACK; + +out: + *uniform = is_uniform; return type; } @@ -347,7 +408,9 @@ static void __init print_mtrr_state(void) mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { pr_debug("MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? + "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, @@ -360,7 +423,7 @@ static void __init print_mtrr_state(void) print_fixed_last(); } pr_debug("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); + mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { @@ -382,7 +445,7 @@ static void __init print_mtrr_state(void) } /* Grab all of the MTRR state for this CPU into *state */ -void __init get_mtrr_state(void) +bool __init get_mtrr_state(void) { struct mtrr_var_range *vrs; unsigned long flags; @@ -426,6 +489,8 @@ void __init get_mtrr_state(void) post_set(); local_irq_restore(flags); + + return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED); } /* Some BIOS's are messed up and don't set all MTRRs the same! 
*/ diff --git a/kernel/arch/x86/kernel/cpu/mtrr/main.c b/kernel/arch/x86/kernel/cpu/mtrr/main.c index ea5f363a1..f891b4750 100644 --- a/kernel/arch/x86/kernel/cpu/mtrr/main.c +++ b/kernel/arch/x86/kernel/cpu/mtrr/main.c @@ -59,6 +59,12 @@ #define MTRR_TO_PHYS_WC_OFFSET 1000 u32 num_var_ranges; +static bool __mtrr_enabled; + +static bool mtrr_enabled(void) +{ + return __mtrr_enabled; +} unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); @@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, int i, replace, error; mtrr_type ltype; - if (!mtrr_if) + if (!mtrr_enabled()) return -ENXIO; error = mtrr_if->validate_add_page(base, size, type); @@ -435,12 +441,13 @@ static int mtrr_check(unsigned long base, unsigned long size) int mtrr_add(unsigned long base, unsigned long size, unsigned int type, bool increment) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } -EXPORT_SYMBOL(mtrr_add); /** * mtrr_del_page - delete a memory type region @@ -463,8 +470,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) unsigned long lbase, lsize; int error = -EINVAL; - if (!mtrr_if) - return -ENXIO; + if (!mtrr_enabled()) + return -ENODEV; max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ @@ -523,11 +530,12 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) */ int mtrr_del(int reg, unsigned long base, unsigned long size) { + if (!mtrr_enabled()) + return -ENODEV; if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } -EXPORT_SYMBOL(mtrr_del); /** * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable @@ -538,6 +546,9 @@ EXPORT_SYMBOL(mtrr_del); * attempts to add a WC MTRR covering size bytes starting at base and * logs an error if this fails. * + * The called should provide a power of two size on an equivalent + * power of two boundary. + * * Drivers must store the return value to pass to mtrr_del_wc_if_needed, * but drivers should not try to interpret that return value. */ @@ -545,7 +556,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size) { int ret; - if (pat_enabled) + if (pat_enabled() || !mtrr_enabled()) return 0; /* Success! (We don't need to do anything.) */ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); @@ -577,7 +588,7 @@ void arch_phys_wc_del(int handle) EXPORT_SYMBOL(arch_phys_wc_del); /* - * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value + * arch_phys_wc_index - translates arch_phys_wc_add's return value * @handle: Return value from arch_phys_wc_add * * This will turn the return value from arch_phys_wc_add into an mtrr @@ -587,14 +598,14 @@ EXPORT_SYMBOL(arch_phys_wc_del); * in printk line. Alas there is an illegitimate use in some ancient * drm ioctls. */ -int phys_wc_to_mtrr_index(int handle) +int arch_phys_wc_index(int handle) { if (handle < MTRR_TO_PHYS_WC_OFFSET) return -1; else return handle - MTRR_TO_PHYS_WC_OFFSET; } -EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index); +EXPORT_SYMBOL_GPL(arch_phys_wc_index); /* * HACK ALERT! 
@@ -734,10 +745,12 @@ void __init mtrr_bp_init(void) } if (mtrr_if) { + __mtrr_enabled = true; set_num_var_ranges(); init_table(); if (use_intel()) { - get_mtrr_state(); + /* BIOS may override */ + __mtrr_enabled = get_mtrr_state(); if (mtrr_cleanup(phys_addr)) { changed_by_mtrr_cleanup = 1; @@ -745,10 +758,16 @@ void __init mtrr_bp_init(void) } } } + + if (!mtrr_enabled()) + pr_info("MTRR: Disabled\n"); } void mtrr_ap_init(void) { + if (!mtrr_enabled()) + return; + if (!use_intel() || mtrr_aps_delayed_init) return; /* @@ -774,6 +793,9 @@ void mtrr_save_state(void) { int first_cpu; + if (!mtrr_enabled()) + return; + get_online_cpus(); first_cpu = cpumask_first(cpu_online_mask); smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); @@ -782,6 +804,8 @@ void mtrr_save_state(void) void set_mtrr_aps_delayed_init(void) { + if (!mtrr_enabled()) + return; if (!use_intel()) return; @@ -793,7 +817,7 @@ void set_mtrr_aps_delayed_init(void) */ void mtrr_aps_init(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; /* @@ -810,7 +834,7 @@ void mtrr_aps_init(void) void mtrr_bp_restore(void) { - if (!use_intel()) + if (!use_intel() || !mtrr_enabled()) return; mtrr_if->set_all(); @@ -818,7 +842,7 @@ void mtrr_bp_restore(void) static int __init mtrr_init_finialize(void) { - if (!mtrr_if) + if (!mtrr_enabled()) return 0; if (use_intel()) { diff --git a/kernel/arch/x86/kernel/cpu/mtrr/mtrr.h b/kernel/arch/x86/kernel/cpu/mtrr/mtrr.h index df5e41f31..951884dcc 100644 --- a/kernel/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/kernel/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); -void get_mtrr_state(void); +bool get_mtrr_state(void); extern void set_mtrr_ops(const struct mtrr_ops *ops); diff --git a/kernel/arch/x86/kernel/cpu/perf_event.c b/kernel/arch/x86/kernel/cpu/perf_event.c index 4cc98a4e8..2bf79d7c9 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event.c +++ b/kernel/arch/x86/kernel/cpu/perf_event.c @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * @@ -135,6 +135,7 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) } static atomic_t active_events; +static atomic_t pmc_refcount; static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC @@ -271,6 +272,7 @@ msr_fail: static void hw_perf_event_destroy(struct perf_event *event) { x86_release_hardware(); + atomic_dec(&active_events); } void hw_perf_lbr_event_destroy(struct perf_event *event) @@ -324,16 +326,16 @@ int x86_reserve_hardware(void) { int err = 0; - if (!atomic_inc_not_zero(&active_events)) { + if (!atomic_inc_not_zero(&pmc_refcount)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_events) == 0) { + if (atomic_read(&pmc_refcount) == 0) { if (!reserve_pmc_hardware()) err = -EBUSY; else reserve_ds_buffers(); } if (!err) - atomic_inc(&active_events); + atomic_inc(&pmc_refcount); mutex_unlock(&pmc_reserve_mutex); } @@ -342,7 +344,7 @@ int x86_reserve_hardware(void) void x86_release_hardware(void) { - if (atomic_dec_and_mutex_lock(&active_events, 
&pmc_reserve_mutex)) { + if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); mutex_unlock(&pmc_reserve_mutex); @@ -355,28 +357,30 @@ void x86_release_hardware(void) */ int x86_add_exclusive(unsigned int what) { - int ret = -EBUSY, i; - - if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) - return 0; + int i; - mutex_lock(&pmc_reserve_mutex); - for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { - if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) - goto out; + if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { + mutex_lock(&pmc_reserve_mutex); + for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { + if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) + goto fail_unlock; + } + atomic_inc(&x86_pmu.lbr_exclusive[what]); + mutex_unlock(&pmc_reserve_mutex); } - atomic_inc(&x86_pmu.lbr_exclusive[what]); - ret = 0; + atomic_inc(&active_events); + return 0; -out: +fail_unlock: mutex_unlock(&pmc_reserve_mutex); - return ret; + return -EBUSY; } void x86_del_exclusive(unsigned int what) { atomic_dec(&x86_pmu.lbr_exclusive[what]); + atomic_dec(&active_events); } int x86_setup_perfctr(struct perf_event *event) @@ -557,6 +561,7 @@ static int __x86_pmu_event_init(struct perf_event *event) if (err) return err; + atomic_inc(&active_events); event->destroy = hw_perf_event_destroy; event->hw.idx = -1; @@ -895,10 +900,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (x86_pmu.commit_scheduling) x86_pmu.commit_scheduling(cpuc, i, assign[i]); } - } - - if (!assign || unsched) { - + } else { for (i = 0; i < n; i++) { e = cpuc->event_list[i]; /* @@ -1111,13 +1113,16 @@ int x86_perf_event_set_period(struct perf_event *event) per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; - /* - * The hw event starts counting from this event offset, - * mark it to be able to extra future deltas: - */ - local64_set(&hwc->prev_count, (u64)-left); + if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) || + local64_read(&hwc->prev_count) != (u64)-left) { + /* + * The hw event starts counting from this event offset, + * mark it to be able to extra future deltas: + */ + local64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + } /* * Due to erratum on certan cpu we need @@ -1170,7 +1175,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. */ - if (cpuc->group_flag & PERF_EVENT_TXN) + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; ret = x86_pmu.schedule_events(cpuc, n, assign); @@ -1321,7 +1326,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) * XXX assumes any ->del() called during a TXN will only be on * an event added during that same TXN. */ - if (cpuc->group_flag & PERF_EVENT_TXN) + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) return; /* @@ -1429,6 +1434,10 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) u64 finish_clock; int ret; + /* + * All PMUs/events that share this PMI handler should make sure to + * increment active_events for their events. 
+ */ if (!atomic_read(&active_events)) return NMI_DONE; @@ -1542,7 +1551,7 @@ static void __init filter_events(struct attribute **attrs) } /* Merge two pointer arrays */ -static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) +__init struct attribute **merge_attr(struct attribute **a, struct attribute **b) { struct attribute **new; int j, i; @@ -1739,11 +1748,22 @@ static inline void x86_pmu_read(struct perf_event *event) * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. */ -static void x86_pmu_start_txn(struct pmu *pmu) +static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ + + cpuc->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); - __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN); __this_cpu_write(cpu_hw_events.n_txn, 0); } @@ -1754,7 +1774,16 @@ static void x86_pmu_start_txn(struct pmu *pmu) */ static void x86_pmu_cancel_txn(struct pmu *pmu) { - __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); + unsigned int txn_flags; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + txn_flags = cpuc->txn_flags; + cpuc->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + /* * Truncate collected array by the number of events added in this * transaction. See x86_pmu_add() and x86_pmu_*_txn(). @@ -1777,6 +1806,13 @@ static int x86_pmu_commit_txn(struct pmu *pmu) int assign[X86_PMC_IDX_MAX]; int n, ret; + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuc->txn_flags = 0; + return 0; + } + n = cpuc->n_events; if (!x86_pmu_initialized()) @@ -1792,7 +1828,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); - cpuc->group_flag &= ~PERF_EVENT_TXN; + cpuc->txn_flags = 0; perf_pmu_enable(pmu); return 0; } @@ -2170,6 +2206,7 @@ static unsigned long get_segment_base(unsigned int segment) int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; if (idx > LDT_ENTRIES) @@ -2181,6 +2218,9 @@ static unsigned long get_segment_base(unsigned int segment) return 0; desc = &ldt->entries[idx]; +#else + return 0; +#endif } else { if (idx > GDT_ENTRIES) return 0; @@ -2191,7 +2231,7 @@ static unsigned long get_segment_base(unsigned int segment) return get_desc_base(desc); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_IA32_EMULATION #include <asm/compat.h> diff --git a/kernel/arch/x86/kernel/cpu/perf_event.h b/kernel/arch/x86/kernel/cpu/perf_event.h index f068695ea..d0e35ebb2 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event.h +++ b/kernel/arch/x86/kernel/cpu/perf_event.h @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., 
Stephane Eranian * @@ -47,6 +47,7 @@ enum extra_reg_type { EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ EXTRA_REG_LBR = 2, /* lbr_select */ EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ + EXTRA_REG_FE = 4, /* fe_* */ EXTRA_REG_MAX /* number of entries needed */ }; @@ -75,6 +76,8 @@ struct event_constraint { #define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ #define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */ struct amd_nb { @@ -88,6 +91,18 @@ struct amd_nb { #define MAX_PEBS_EVENTS 8 /* + * Flags PEBS can handle without an PMI. + * + * TID can only be handled by flushing at context switch. + * + */ +#define PEBS_FREERUNNING_FLAGS \ + (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ + PERF_SAMPLE_TRANSACTION) + +/* * A debug store configuration. * * We only support architectures that use 64bit fields. @@ -133,7 +148,6 @@ enum intel_excl_state_type { }; struct intel_excl_states { - enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; enum intel_excl_state_type state[X86_PMC_IDX_MAX]; bool sched_started; /* true if scheduling has started */ }; @@ -152,7 +166,7 @@ struct intel_excl_cntrs { unsigned core_id; /* per-core: core id */ }; -#define MAX_LBR_ENTRIES 16 +#define MAX_LBR_ENTRIES 32 enum { X86_PERF_KFREE_SHARED = 0, @@ -182,7 +196,7 @@ struct cpu_hw_events { int n_excl; /* the number of exclusive events */ - unsigned int group_flag; + unsigned int txn_flags; int is_fake; /* @@ -373,7 +387,7 @@ struct cpu_hw_events { /* Check flags and event code/umask, and set the HSW N/A flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|INTEL_ARCH_EVENT_MASK, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) @@ -527,10 +541,10 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); - void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); - void (*start_scheduling)(struct cpu_hw_events *cpuc); + void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); + void (*stop_scheduling)(struct cpu_hw_events *cpuc); struct event_constraint *event_constraints; @@ -581,6 +595,7 @@ struct x86_pmu { struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); int max_pebs_events; + unsigned long free_running_flags; /* * Intel LBR @@ -611,6 +626,8 @@ struct x86_pmu { struct x86_perf_task_context { u64 lbr_from[MAX_LBR_ENTRIES]; u64 lbr_to[MAX_LBR_ENTRIES]; + u64 lbr_info[MAX_LBR_ENTRIES]; + int tos; int lbr_callstack_users; int lbr_stack_state; }; @@ -780,6 +797,8 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); +struct attribute **merge_attr(struct attribute **a, struct attribute **b); + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); @@ -795,20 +814,6 @@ static inline int amd_pmu_init(void) #ifdef CONFIG_CPU_SUP_INTEL -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) -{ - /* user explicitly requested branch sampling */ - if (has_branch_stack(event)) - return 
true; - - /* implicit branch sampling to correct PEBS skid */ - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) - return true; - - return false; -} - static inline bool intel_pmu_has_bts(struct perf_event *event) { if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && @@ -860,6 +865,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[]; extern struct event_constraint intel_hsw_pebs_event_constraints[]; +extern struct event_constraint intel_skl_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_enable(struct perf_event *event); @@ -870,6 +877,8 @@ void intel_pmu_pebs_enable_all(void); void intel_pmu_pebs_disable_all(void); +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); + void intel_ds_init(void); void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); @@ -896,6 +905,8 @@ void intel_pmu_lbr_init_snb(void); void intel_pmu_lbr_init_hsw(void); +void intel_pmu_lbr_init_skl(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); @@ -919,6 +930,7 @@ static inline int is_ht_workaround_enabled(void) { return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); } + #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel.c b/kernel/arch/x86/kernel/cpu/perf_event_intel.c index 2813ea0f1..e2a430021 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event_intel.c +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel.c @@ -12,7 +12,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/export.h> -#include <linux/watchdog.h> +#include <linux/nmi.h> #include <asm/cpufeature.h> #include <asm/hardirq.h> @@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +struct event_constraint intel_skl_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + EVENT_CONSTRAINT_END +}; + static struct extra_reg intel_snb_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), @@ -193,6 +201,18 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_skl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + /* + * Note the low 8 bits eventsel code is not a continuous field, containing + * some #GPing bits. These are masked out. 
+ */ + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -212,7 +232,7 @@ static struct event_constraint intel_hsw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ @@ -235,7 +255,7 @@ struct event_constraint intel_bdw_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */ + INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ EVENT_CONSTRAINT_END }; @@ -244,6 +264,200 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +/* + * Notes on the events: + * - data reads do not include code reads (comparable to earlier tables) + * - data counts include speculative execution (except L1 write, dtlb, bpu) + * - remote node access includes remote memory, remote cache, remote mmio. + * - prefetches are not included in the counts. + * - icache miss does not include decoded icache + */ + +#define SKL_DEMAND_DATA_RD BIT_ULL(0) +#define SKL_DEMAND_RFO BIT_ULL(1) +#define SKL_ANY_RESPONSE BIT_ULL(16) +#define SKL_SUPPLIER_NONE BIT_ULL(17) +#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26) +#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27) +#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28) +#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29) +#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \ + SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) +#define SKL_SPL_HIT BIT_ULL(30) +#define SKL_SNOOP_NONE BIT_ULL(31) +#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32) +#define SKL_SNOOP_MISS BIT_ULL(33) +#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34) +#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35) +#define SKL_SNOOP_HITM BIT_ULL(36) +#define SKL_SNOOP_NON_DRAM BIT_ULL(37) +#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM) +#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD +#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SPL_HIT) +#define SKL_DEMAND_WRITE SKL_DEMAND_RFO +#define SKL_LLC_ACCESS SKL_ANY_RESPONSE +#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) + +static __initconst const u64 skl_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* 
MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 skl_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + #define SNB_DMND_DATA_RD (1ULL << 0) #define SNB_DMND_RFO (1ULL << 1) 
#define SNB_DMND_IFETCH (1ULL << 2) @@ -1114,7 +1328,7 @@ static struct extra_reg intel_slm_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1), EVENT_EXTRA_END }; @@ -1594,6 +1808,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) loops = 0; again: + intel_pmu_lbr_read(); intel_pmu_ack_status(status); if (++loops > 100) { static bool warned = false; @@ -1608,16 +1823,16 @@ again: inc_irq_stat(apic_perf_irqs); - intel_pmu_lbr_read(); /* - * CondChgd bit 63 doesn't mean any overflow status. Ignore - * and clear the bit. + * Ignore a range of extra bits in status that do not indicate + * overflow by themselves. */ - if (__test_and_clear_bit(63, (unsigned long *)&status)) { - if (!status) - goto done; - } + status &= ~(GLOBAL_STATUS_COND_CHG | + GLOBAL_STATUS_ASIF | + GLOBAL_STATUS_LBRS_FROZEN); + if (!status) + goto done; /* * PEBS overflow sets bit 62 in the global status register @@ -1699,18 +1914,22 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static int intel_alt_er(int idx) +static int intel_alt_er(int idx, u64 config) { + int alt_idx; if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) return idx; if (idx == EXTRA_REG_RSP_0) - return EXTRA_REG_RSP_1; + alt_idx = EXTRA_REG_RSP_1; if (idx == EXTRA_REG_RSP_1) - return EXTRA_REG_RSP_0; + alt_idx = EXTRA_REG_RSP_0; + + if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) + return idx; - return idx; + return alt_idx; } static void intel_fixup_er(struct perf_event *event, int idx) @@ -1799,7 +2018,7 @@ again: */ c = NULL; } else { - idx = intel_alt_er(idx); + idx = intel_alt_er(idx, reg->config); if (idx != reg->idx) { raw_spin_unlock_irqrestore(&era->lock, flags); goto again; @@ -1903,9 +2122,8 @@ static void intel_start_scheduling(struct cpu_hw_events *cpuc) { struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl, *xlo; + struct intel_excl_states *xl; int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; /* sibling thread */ /* * nothing needed if in group validation mode @@ -1916,10 +2134,9 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) /* * no exclusion needed */ - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return; - xlo = &excl_cntrs->states[o_tid]; xl = &excl_cntrs->states[tid]; xl->sched_started = true; @@ -1928,22 +2145,41 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) * in stop_event_scheduling() * makes scheduling appear as a transaction */ - WARN_ON_ONCE(!irqs_disabled()); raw_spin_lock(&excl_cntrs->lock); +} - /* - * save initial state of sibling thread - */ - memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state)); +static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct event_constraint *c = cpuc->event_constraint[idx]; + struct intel_excl_states *xl; + int tid = cpuc->excl_thread_id; + + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return; + + if (WARN_ON_ONCE(!excl_cntrs)) + return; + + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) + return; + + xl = &excl_cntrs->states[tid]; + + lockdep_assert_held(&excl_cntrs->lock); + + if (c->flags & PERF_X86_EVENT_EXCL) + xl->state[cntr] = INTEL_EXCL_EXCLUSIVE; + else + xl->state[cntr] = INTEL_EXCL_SHARED; } static void intel_stop_scheduling(struct 
cpu_hw_events *cpuc) { struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl, *xlo; + struct intel_excl_states *xl; int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; /* sibling thread */ /* * nothing needed if in group validation mode @@ -1953,17 +2189,11 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc) /* * no exclusion needed */ - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return; - xlo = &excl_cntrs->states[o_tid]; xl = &excl_cntrs->states[tid]; - /* - * make new sibling thread state visible - */ - memcpy(xlo->state, xlo->init_state, sizeof(xlo->state)); - xl->sched_started = false; /* * release shared state lock (acquired in intel_start_scheduling()) @@ -1975,12 +2205,10 @@ static struct event_constraint * intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, int idx, struct event_constraint *c) { - struct event_constraint *cx; struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl, *xlo; - int is_excl, i; + struct intel_excl_states *xlo; int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; /* alternate */ + int is_excl, i; /* * validating a group does not require @@ -1992,27 +2220,8 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, /* * no exclusion needed */ - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return c; - /* - * event requires exclusive counter access - * across HT threads - */ - is_excl = c->flags & PERF_X86_EVENT_EXCL; - if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { - event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; - if (!cpuc->n_excl++) - WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); - } - - /* - * xl = state of current HT - * xlo = state of sibling HT - */ - xl = &excl_cntrs->states[tid]; - xlo = &excl_cntrs->states[o_tid]; - - cx = c; /* * because we modify the constraint, we need @@ -2023,10 +2232,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * been cloned (marked dynamic) */ if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { - - /* sanity check */ - if (idx < 0) - return &emptyconstraint; + struct event_constraint *cx; /* * grab pre-allocated constraint entry @@ -2037,13 +2243,14 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * initialize dynamic constraint * with static constraint */ - memcpy(cx, c, sizeof(*cx)); + *cx = *c; /* * mark constraint as dynamic, so we * can free it later on */ cx->flags |= PERF_X86_EVENT_DYNAMIC; + c = cx; } /* @@ -2054,6 +2261,22 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, */ /* + * state of sibling HT + */ + xlo = &excl_cntrs->states[tid ^ 1]; + + /* + * event requires exclusive counter access + * across HT threads + */ + is_excl = c->flags & PERF_X86_EVENT_EXCL; + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; + if (!cpuc->n_excl++) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); + } + + /* * Modify static constraint with current dynamic * state of thread * @@ -2061,46 +2284,49 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * SHARED : sibling counter measuring non-exclusive event * UNUSED : sibling counter unused */ - for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { /* * exclusive event in sibling counter * our corresponding counter cannot be used * regardless of our event */ - if (xl->state[i] == 
INTEL_EXCL_EXCLUSIVE) - __clear_bit(i, cx->idxmsk); + if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) + __clear_bit(i, c->idxmsk); /* * if measuring an exclusive event, sibling * measuring non-exclusive, then counter cannot * be used */ - if (is_excl && xl->state[i] == INTEL_EXCL_SHARED) - __clear_bit(i, cx->idxmsk); + if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) + __clear_bit(i, c->idxmsk); } /* * recompute actual bit weight for scheduling algorithm */ - cx->weight = hweight64(cx->idxmsk64); + c->weight = hweight64(c->idxmsk64); /* * if we return an empty mask, then switch * back to static empty constraint to avoid * the cost of freeing later on */ - if (cx->weight == 0) - cx = &emptyconstraint; + if (c->weight == 0) + c = &emptyconstraint; - return cx; + return c; } static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { - struct event_constraint *c1 = cpuc->event_constraint[idx]; + struct event_constraint *c1 = NULL; struct event_constraint *c2; + if (idx >= 0) /* fake does < 0 */ + c1 = cpuc->event_constraint[idx]; + /* * first time only * - static constraint: no change across incremental scheduling calls @@ -2124,10 +2350,8 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, { struct hw_perf_event *hwc = &event->hw; struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xlo, *xl; - unsigned long flags = 0; /* keep compiler happy */ int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; + struct intel_excl_states *xl; /* * nothing needed if in group validation mode @@ -2135,13 +2359,9 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, if (cpuc->is_fake) return; - WARN_ON_ONCE(!excl_cntrs); - - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return; - xl = &excl_cntrs->states[tid]; - xlo = &excl_cntrs->states[o_tid]; if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; if (!--cpuc->n_excl) @@ -2149,22 +2369,25 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, } /* - * put_constraint may be called from x86_schedule_events() - * which already has the lock held so here make locking - * conditional + * If event was actually assigned, then mark the counter state as + * unused now. */ - if (!xl->sched_started) - raw_spin_lock_irqsave(&excl_cntrs->lock, flags); + if (hwc->idx >= 0) { + xl = &excl_cntrs->states[tid]; - /* - * if event was actually assigned, then mark the - * counter state as unused now - */ - if (hwc->idx >= 0) - xlo->state[hwc->idx] = INTEL_EXCL_UNUSED; + /* + * put_constraint may be called from x86_schedule_events() + * which already has the lock held so here make locking + * conditional. 
+ */ + if (!xl->sched_started) + raw_spin_lock(&excl_cntrs->lock); + + xl->state[hwc->idx] = INTEL_EXCL_UNUSED; - if (!xl->sched_started) - raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags); + if (!xl->sched_started) + raw_spin_unlock(&excl_cntrs->lock); + } } static void @@ -2196,41 +2419,6 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, intel_put_excl_constraints(cpuc, event); } -static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct event_constraint *c = cpuc->event_constraint[idx]; - struct intel_excl_states *xlo, *xl; - int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; - int is_excl; - - if (cpuc->is_fake || !c) - return; - - is_excl = c->flags & PERF_X86_EVENT_EXCL; - - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) - return; - - WARN_ON_ONCE(!excl_cntrs); - - if (!excl_cntrs) - return; - - xl = &excl_cntrs->states[tid]; - xlo = &excl_cntrs->states[o_tid]; - - WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock)); - - if (cntr >= 0) { - if (is_excl) - xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; - else - xlo->init_state[cntr] = INTEL_EXCL_SHARED; - } -} - static void intel_pebs_aliases_core2(struct perf_event *event) { if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { @@ -2287,6 +2475,15 @@ static void intel_pebs_aliases_snb(struct perf_event *event) } } +static unsigned long intel_pmu_free_running_flags(struct perf_event *event) +{ + unsigned long flags = x86_pmu.free_running_flags; + + if (event->attr.use_clockid) + flags &= ~PERF_SAMPLE_TIME; + return flags; +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -2294,8 +2491,16 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; - if (event->attr.precise_ip && x86_pmu.pebs_aliases) - x86_pmu.pebs_aliases(event); + if (event->attr.precise_ip) { + if (!event->attr.freq) { + event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; + if (!(event->attr.sample_type & + ~intel_pmu_free_running_flags(event))) + event->hw.flags |= PERF_X86_EVENT_FREERUNNING; + } + if (x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); + } if (needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); @@ -2544,19 +2749,11 @@ struct intel_shared_regs *allocate_shared_regs(int cpu) static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) { struct intel_excl_cntrs *c; - int i; c = kzalloc_node(sizeof(struct intel_excl_cntrs), GFP_KERNEL, cpu_to_node(cpu)); if (c) { raw_spin_lock_init(&c->lock); - for (i = 0; i < X86_PMC_IDX_MAX; i++) { - c->states[0].state[i] = INTEL_EXCL_UNUSED; - c->states[0].init_state[i] = INTEL_EXCL_UNUSED; - - c->states[1].state[i] = INTEL_EXCL_UNUSED; - c->states[1].init_state[i] = INTEL_EXCL_UNUSED; - } c->core_id = -1; } return c; @@ -2569,7 +2766,7 @@ static int intel_pmu_cpu_prepare(int cpu) if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { cpuc->shared_regs = allocate_shared_regs(cpu); if (!cpuc->shared_regs) - return NOTIFY_BAD; + goto err; } if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { @@ -2577,18 +2774,27 @@ static int intel_pmu_cpu_prepare(int cpu) cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); if (!cpuc->constraint_list) - return NOTIFY_BAD; + goto err_shared_regs; cpuc->excl_cntrs = allocate_excl_cntrs(cpu); - if (!cpuc->excl_cntrs) { - kfree(cpuc->constraint_list); - kfree(cpuc->shared_regs); - return NOTIFY_BAD; - } + if (!cpuc->excl_cntrs) + goto err_constraint_list; + cpuc->excl_thread_id = 0; } return NOTIFY_OK; 
+ +err_constraint_list: + kfree(cpuc->constraint_list); + cpuc->constraint_list = NULL; + +err_shared_regs: + kfree(cpuc->shared_regs); + cpuc->shared_regs = NULL; + +err: + return NOTIFY_BAD; } static void intel_pmu_cpu_starting(int cpu) @@ -2611,7 +2817,7 @@ static void intel_pmu_cpu_starting(int cpu) if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; - for_each_cpu(i, topology_thread_cpumask(cpu)) { + for_each_cpu(i, topology_sibling_cpumask(cpu)) { struct intel_shared_regs *pc; pc = per_cpu(cpu_hw_events, i).shared_regs; @@ -2629,7 +2835,7 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { - for_each_cpu(i, topology_thread_cpumask(cpu)) { + for_each_cpu(i, topology_sibling_cpumask(cpu)) { struct intel_excl_cntrs *c; c = per_cpu(cpu_hw_events, i).excl_cntrs; @@ -2677,10 +2883,21 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } +static void intel_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (x86_pmu.pebs_active) + intel_pmu_pebs_sched_task(ctx, sched_in); + if (x86_pmu.lbr_nr) + intel_pmu_lbr_sched_task(ctx, sched_in); +} + PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); +PMU_FORMAT_ATTR(frontend, "config1:0-23"); + static struct attribute *intel_arch3_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -2697,6 +2914,11 @@ static struct attribute *intel_arch3_formats_attr[] = { NULL, }; +static struct attribute *skl_format_attr[] = { + &format_attr_frontend.attr, + NULL, +}; + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, @@ -2711,6 +2933,8 @@ static __initconst const struct x86_pmu core_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, + /* * Intel PMCs cannot be accessed sanely above 32-bit width, * so we install an artificial 1<<31 period regardless of @@ -2749,6 +2973,7 @@ static __initconst const struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -2766,7 +2991,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, - .sched_task = intel_pmu_lbr_sched_task, + .sched_task = intel_pmu_sched_task, }; static __init void intel_clovertown_quirk(void) @@ -2939,8 +3164,8 @@ static __init void intel_ht_bug(void) { x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED; - x86_pmu.commit_scheduling = intel_commit_scheduling; x86_pmu.start_scheduling = intel_start_scheduling; + x86_pmu.commit_scheduling = intel_commit_scheduling; x86_pmu.stop_scheduling = intel_stop_scheduling; } @@ -3286,6 +3511,30 @@ __init int intel_pmu_init(void) pr_cont("Broadwell events, "); break; + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_skl_event_constraints; + 
x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; + x86_pmu.extra_regs = intel_skl_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, + skl_format_attr); + WARN_ON(!x86_pmu.format_attrs); + x86_pmu.cpu_events = hsw_events_attrs; + pr_cont("Skylake events, "); + break; + default: switch (x86_pmu.version) { case 1: @@ -3355,7 +3604,7 @@ __init int intel_pmu_init(void) */ if (x86_pmu.extra_regs) { for (er = x86_pmu.extra_regs; er->msr; er++) { - er->extra_msr_access = check_msr(er->msr, 0x1ffUL); + er->extra_msr_access = check_msr(er->msr, 0x11UL); /* Disable LBR select mapping */ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) x86_pmu.lbr_sel_map = NULL; @@ -3388,21 +3637,24 @@ static __init int fixup_ht_bug(void) if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) return 0; - w = cpumask_weight(topology_thread_cpumask(cpu)); + w = cpumask_weight(topology_sibling_cpumask(cpu)); if (w > 1) { pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); return 0; } - watchdog_nmi_disable_all(); + if (lockup_detector_suspend() != 0) { + pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); + return 0; + } x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); - x86_pmu.commit_scheduling = NULL; x86_pmu.start_scheduling = NULL; + x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - watchdog_nmi_enable_all(); + lockup_detector_resume(); get_online_cpus(); diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_bts.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_bts.c index 7795f3f8b..2cad71d1b 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -62,9 +62,6 @@ struct bts_buffer { struct pmu bts_pmu; -void intel_pmu_enable_bts(u64 config); -void intel_pmu_disable_bts(void); - static size_t buf_size(struct page *page) { return 1 << (PAGE_SHIFT + page_private(page)); @@ -225,6 +222,7 @@ static void __bts_event_start(struct perf_event *event) if (!buf || bts_buffer_is_full(buf, bts)) return; + event->hw.itrace_started = 1; event->hw.state = 0; if (!buf->snapshot) @@ -497,6 +495,19 @@ static int bts_event_init(struct perf_event *event) if (x86_add_exclusive(x86_lbr_exclusive_bts)) return -EBUSY; + /* + * BTS leaks kernel addresses even when CPL0 tracing is + * disabled, so disallow intel_bts driver for unprivileged + * users on paranoid systems since it provides trace data + * to the user in a zero-copy fashion. + * + * Note that the default paranoia setting permits unprivileged + * users to profile the kernel. 
+ */ + if (event->attr.exclude_kernel && perf_paranoid_kernel() && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + ret = x86_reserve_hardware(); if (ret) { x86_del_exclusive(x86_lbr_exclusive_bts); @@ -530,5 +541,4 @@ static __init int bts_init(void) return perf_pmu_register(&bts_pmu, "intel_bts", -1); } - -module_init(bts_init); +arch_initcall(bts_init); diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_cqm.c index cb77b11bc..a316ca96f 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -13,16 +13,35 @@ #define MSR_IA32_QM_CTR 0x0c8e #define MSR_IA32_QM_EVTSEL 0x0c8d -static unsigned int cqm_max_rmid = -1; +static u32 cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ -struct intel_cqm_state { - raw_spinlock_t lock; - int rmid; - int cnt; +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @rmid: The cached Resource Monitoring ID + * @closid: The cached Class Of Service ID + * @rmid_usecnt: The usage counter for rmid + * + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 rmid; + u32 closid; + int rmid_usecnt; }; -static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state); +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Both functions which modify the state + * (intel_cqm_event_start and intel_cqm_event_stop) are called with + * interrupts disabled, which is sufficient for the protection. + */ +static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); /* * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. @@ -57,7 +76,7 @@ static cpumask_t cqm_cpumask; * near-zero occupancy value, i.e. no cachelines are tagged with this * RMID, once __intel_cqm_rmid_rotate() returns. */ -static unsigned int intel_cqm_rotation_rmid; +static u32 intel_cqm_rotation_rmid; #define INVALID_RMID (-1) @@ -69,7 +88,7 @@ static unsigned int intel_cqm_rotation_rmid; * Likewise, an rmid value of -1 is used to indicate "no rmid currently * assigned" and is used as part of the rotation code. */ -static inline bool __rmid_valid(unsigned int rmid) +static inline bool __rmid_valid(u32 rmid) { if (!rmid || rmid == INVALID_RMID) return false; @@ -77,7 +96,7 @@ static inline bool __rmid_valid(unsigned int rmid) return true; } -static u64 __rmid_read(unsigned int rmid) +static u64 __rmid_read(u32 rmid) { u64 val; @@ -102,7 +121,7 @@ enum rmid_recycle_state { }; struct cqm_rmid_entry { - unsigned int rmid; + u32 rmid; enum rmid_recycle_state state; struct list_head list; unsigned long queue_time; @@ -147,7 +166,7 @@ static LIST_HEAD(cqm_rmid_limbo_lru); */ static struct cqm_rmid_entry **cqm_rmid_ptrs; -static inline struct cqm_rmid_entry *__rmid_entry(int rmid) +static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) { struct cqm_rmid_entry *entry; @@ -162,7 +181,7 @@ static inline struct cqm_rmid_entry *__rmid_entry(int rmid) * * We expect to be called with cache_mutex held. 
*/ -static int __get_rmid(void) +static u32 __get_rmid(void) { struct cqm_rmid_entry *entry; @@ -177,7 +196,7 @@ static int __get_rmid(void) return entry->rmid; } -static void __put_rmid(unsigned int rmid) +static void __put_rmid(u32 rmid) { struct cqm_rmid_entry *entry; @@ -279,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target); + return perf_cgroup_from_task(event->hw.target, event->ctx); return event->cgrp; } @@ -372,7 +391,7 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b) } struct rmid_read { - unsigned int rmid; + u32 rmid; atomic64_t value; }; @@ -381,12 +400,11 @@ static void __intel_cqm_event_count(void *info); /* * Exchange the RMID of a group of events. */ -static unsigned int -intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid) +static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) { struct perf_event *event; - unsigned int old_rmid = group->hw.cqm_rmid; struct list_head *head = &group->hw.cqm_group_entry; + u32 old_rmid = group->hw.cqm_rmid; lockdep_assert_held(&cache_mutex); @@ -451,7 +469,7 @@ static void intel_cqm_stable(void *arg) * If we have group events waiting for an RMID that don't conflict with * events already running, assign @rmid. */ -static bool intel_cqm_sched_in_event(unsigned int rmid) +static bool intel_cqm_sched_in_event(u32 rmid) { struct perf_event *leader, *event; @@ -598,7 +616,7 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available) static void __intel_cqm_pick_and_rotate(struct perf_event *next) { struct perf_event *rotor; - unsigned int rmid; + u32 rmid; lockdep_assert_held(&cache_mutex); @@ -626,7 +644,7 @@ static void __intel_cqm_pick_and_rotate(struct perf_event *next) static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) { struct perf_event *group, *g; - unsigned int rmid; + u32 rmid; lockdep_assert_held(&cache_mutex); @@ -828,8 +846,8 @@ static void intel_cqm_setup_event(struct perf_event *event, struct perf_event **group) { struct perf_event *iter; - unsigned int rmid; bool conflict = false; + u32 rmid; list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { rmid = iter->hw.cqm_rmid; @@ -860,7 +878,7 @@ static void intel_cqm_setup_event(struct perf_event *event, static void intel_cqm_event_read(struct perf_event *event) { unsigned long flags; - unsigned int rmid; + u32 rmid; u64 val; /* @@ -969,55 +987,48 @@ out: static void intel_cqm_event_start(struct perf_event *event, int mode) { - struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); - unsigned int rmid = event->hw.cqm_rmid; - unsigned long flags; + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 rmid = event->hw.cqm_rmid; if (!(event->hw.cqm_state & PERF_HES_STOPPED)) return; event->hw.cqm_state &= ~PERF_HES_STOPPED; - raw_spin_lock_irqsave(&state->lock, flags); - - if (state->cnt++) - WARN_ON_ONCE(state->rmid != rmid); - else + if (state->rmid_usecnt++) { + if (!WARN_ON_ONCE(state->rmid != rmid)) + return; + } else { WARN_ON_ONCE(state->rmid); + } state->rmid = rmid; - wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid); - - raw_spin_unlock_irqrestore(&state->lock, flags); + wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); } static void intel_cqm_event_stop(struct perf_event *event, int mode) { - struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); - unsigned long flags; + struct 
intel_pqr_state *state = this_cpu_ptr(&pqr_state); if (event->hw.cqm_state & PERF_HES_STOPPED) return; event->hw.cqm_state |= PERF_HES_STOPPED; - raw_spin_lock_irqsave(&state->lock, flags); intel_cqm_event_read(event); - if (!--state->cnt) { + if (!--state->rmid_usecnt) { state->rmid = 0; - wrmsrl(MSR_IA32_PQR_ASSOC, 0); + wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); } else { WARN_ON_ONCE(!state->rmid); } - - raw_spin_unlock_irqrestore(&state->lock, flags); } static int intel_cqm_event_add(struct perf_event *event, int mode) { unsigned long flags; - unsigned int rmid; + u32 rmid; raw_spin_lock_irqsave(&cache_lock, flags); @@ -1032,11 +1043,6 @@ static int intel_cqm_event_add(struct perf_event *event, int mode) return 0; } -static void intel_cqm_event_del(struct perf_event *event, int mode) -{ - intel_cqm_event_stop(event, mode); -} - static void intel_cqm_event_destroy(struct perf_event *event) { struct perf_event *group_other = NULL; @@ -1065,7 +1071,7 @@ static void intel_cqm_event_destroy(struct perf_event *event) list_replace(&event->hw.cqm_groups_entry, &group_other->hw.cqm_groups_entry); } else { - unsigned int rmid = event->hw.cqm_rmid; + u32 rmid = event->hw.cqm_rmid; if (__rmid_valid(rmid)) __put_rmid(rmid); @@ -1229,7 +1235,7 @@ static struct pmu intel_cqm_pmu = { .task_ctx_nr = perf_sw_context, .event_init = intel_cqm_event_init, .add = intel_cqm_event_add, - .del = intel_cqm_event_del, + .del = intel_cqm_event_stop, .start = intel_cqm_event_start, .stop = intel_cqm_event_stop, .read = intel_cqm_event_read, @@ -1249,14 +1255,14 @@ static inline void cqm_pick_event_reader(int cpu) cpumask_set_cpu(cpu, &cqm_cpumask); } -static void intel_cqm_cpu_prepare(unsigned int cpu) +static void intel_cqm_cpu_starting(unsigned int cpu) { - struct intel_cqm_state *state = &per_cpu(cqm_state, cpu); + struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); struct cpuinfo_x86 *c = &cpu_data(cpu); - raw_spin_lock_init(&state->lock); state->rmid = 0; - state->cnt = 0; + state->closid = 0; + state->rmid_usecnt = 0; WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); @@ -1290,13 +1296,11 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb, unsigned int cpu = (unsigned long)hcpu; switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - intel_cqm_cpu_prepare(cpu); - break; case CPU_DOWN_PREPARE: intel_cqm_cpu_exit(cpu); break; case CPU_STARTING: + intel_cqm_cpu_starting(cpu); cqm_pick_event_reader(cpu); break; } @@ -1367,7 +1371,7 @@ static int __init intel_cqm_init(void) goto out; for_each_online_cpu(i) { - intel_cqm_cpu_prepare(i); + intel_cqm_cpu_starting(i); cqm_pick_event_reader(i); } diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_cstate.c new file mode 100644 index 000000000..75a38b5a2 --- /dev/null +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_cstate.c @@ -0,0 +1,694 @@ +/* + * perf_event_intel_cstate.c: support cstate residency counters + * + * Copyright (C) 2015, Intel Corp. + * Author: Kan Liang (kan.liang@intel.com) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + */ + +/* + * This file export cstate related free running (read-only) counters + * for perf. These counters may be use simultaneously by other tools, + * such as turbostat. However, it still make sense to implement them + * in perf. Because we can conveniently collect them together with + * other events, and allow to use them from tools without special MSR + * access code. + * + * The events only support system-wide mode counting. There is no + * sampling support because it is not supported by the hardware. + * + * According to counters' scope and category, two PMUs are registered + * with the perf_event core subsystem. + * - 'cstate_core': The counter is available for each physical core. + * The counters include CORE_C*_RESIDENCY. + * - 'cstate_pkg': The counter is available for each physical package. + * The counters include PKG_C*_RESIDENCY. + * + * All of these counters are specified in the Intel® 64 and IA-32 + * Architectures Software Developer.s Manual Vol3b. + * + * Model specific counters: + * MSR_CORE_C1_RES: CORE C1 Residency Counter + * perf code: 0x00 + * Available model: SLM,AMT + * Scope: Core (each processor core has a MSR) + * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter + * perf code: 0x03 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. + * perf code: 0x00 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. + * perf code: 0x03 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. + * perf code: 0x04 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. + * perf code: 0x05 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. 
+ * perf code: 0x06 + * Available model: HSW ULT only + * Scope: Package (physical package) + * + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <asm/cpu_device_id.h> +#include "perf_event.h" + +#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __cstate_##_var##_show, NULL) + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf); + +struct perf_cstate_msr { + u64 msr; + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); +}; + + +/* cstate_core PMU */ + +static struct pmu cstate_core_pmu; +static bool has_cstate_core; + +enum perf_cstate_core_id { + /* + * cstate_core events + */ + PERF_CSTATE_CORE_C1_RES = 0, + PERF_CSTATE_CORE_C3_RES, + PERF_CSTATE_CORE_C6_RES, + PERF_CSTATE_CORE_C7_RES, + + PERF_CSTATE_CORE_EVENT_MAX, +}; + +bool test_core(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C1_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); + +static struct perf_cstate_msr core_msr[] = { + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, +}; + +static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group core_events_attr_group = { + .name = "events", + .attrs = core_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(core_event, event, 
"config:0-63"); +static struct attribute *core_format_attrs[] = { + &format_attr_core_event.attr, + NULL, +}; + +static struct attribute_group core_format_attr_group = { + .name = "format", + .attrs = core_format_attrs, +}; + +static cpumask_t cstate_core_cpu_mask; +static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); + +static struct attribute *cstate_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group cpumask_attr_group = { + .attrs = cstate_cpumask_attrs, +}; + +static const struct attribute_group *core_attr_groups[] = { + &core_events_attr_group, + &core_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_core PMU end */ + + +/* cstate_pkg PMU */ + +static struct pmu cstate_pkg_pmu; +static bool has_cstate_pkg; + +enum perf_cstate_pkg_id { + /* + * cstate_pkg events + */ + PERF_CSTATE_PKG_C2_RES = 0, + PERF_CSTATE_PKG_C3_RES, + PERF_CSTATE_PKG_C6_RES, + PERF_CSTATE_PKG_C7_RES, + PERF_CSTATE_PKG_C8_RES, + PERF_CSTATE_PKG_C9_RES, + PERF_CSTATE_PKG_C10_RES, + + PERF_CSTATE_PKG_EVENT_MAX, +}; + +bool test_pkg(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 69: /* 22nm Haswell ULT */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES || + idx == PERF_CSTATE_PKG_C8_RES || + idx == PERF_CSTATE_PKG_C9_RES || + idx == PERF_CSTATE_PKG_C10_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03"); +PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04"); +PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); +PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); + +static struct perf_cstate_msr pkg_msr[] = { + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, 
}, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, +}; + +static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group pkg_events_attr_group = { + .name = "events", + .attrs = pkg_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); +static struct attribute *pkg_format_attrs[] = { + &format_attr_pkg_event.attr, + NULL, +}; +static struct attribute_group pkg_format_attr_group = { + .name = "format", + .attrs = pkg_format_attrs, +}; + +static cpumask_t cstate_pkg_cpu_mask; + +static const struct attribute_group *pkg_attr_groups[] = { + &pkg_events_attr_group, + &pkg_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_pkg PMU end*/ + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu == &cstate_core_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); + else if (pmu == &cstate_pkg_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); + else + return 0; +} + +static int cstate_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + int ret = 0; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + if (event->pmu == &cstate_core_pmu) { + if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) + return -EINVAL; + if (!core_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = core_msr[cfg].msr; + } else if (event->pmu == &cstate_pkg_pmu) { + if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) + return -EINVAL; + if (!pkg_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = pkg_msr[cfg].msr; + } else + return -ENOENT; + + /* must be done before validate_group */ + event->hw.config = cfg; + event->hw.idx = -1; + + return ret; +} + +static inline u64 cstate_pmu_read_counter(struct perf_event *event) +{ + u64 val; + + rdmsrl(event->hw.event_base, val); + return val; +} + +static void cstate_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = cstate_pmu_read_counter(event); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + local64_add(new_raw_count - prev_raw_count, &event->count); +} + +static void cstate_pmu_event_start(struct perf_event *event, int mode) +{ + local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event)); +} + +static void cstate_pmu_event_stop(struct perf_event *event, int mode) +{ + cstate_pmu_event_update(event); +} + +static void cstate_pmu_event_del(struct perf_event *event, int mode) +{ + cstate_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int cstate_pmu_event_add(struct perf_event *event, int mode) +{ + if (mode & 
PERF_EF_START) + cstate_pmu_event_start(event, mode); + + return 0; +} + +static void cstate_cpu_exit(int cpu) +{ + int i, id, target; + + /* cpu exit for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_core_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_core_cpu_mask); + WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); + } + + /* cpu exit for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_physical_package_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_pkg_cpu_mask); + WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); + } +} + +static void cstate_cpu_init(int cpu) +{ + int i, id; + + /* cpu init for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + for_each_cpu(i, &cstate_core_cpu_mask) { + if (id == topology_core_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_core_cpu_mask); + } + + /* cpu init for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + for_each_cpu(i, &cstate_pkg_cpu_mask) { + if (id == topology_physical_package_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); + } +} + +static int cstate_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + break; + case CPU_STARTING: + cstate_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + break; + case CPU_ONLINE: + case CPU_DEAD: + break; + case CPU_DOWN_PREPARE: + cstate_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +/* + * Probe the cstate events and insert the available one into sysfs attrs + * Return false if there is no available events. + */ +static bool cstate_probe_msr(struct perf_cstate_msr *msr, + struct attribute **events_attrs, + int max_event_nr) +{ + int i, j = 0; + u64 val; + + /* Probe the cstate events. */ + for (i = 0; i < max_event_nr; i++) { + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; + } + + /* List remaining events in the sysfs attrs. */ + for (i = 0; i < max_event_nr; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; + } + events_attrs[j] = NULL; + + return (j > 0) ? true : false; +} + +static int __init cstate_init(void) +{ + /* SLM has different MSR for PKG C6 */ + switch (boot_cpu_data.x86_model) { + case 55: + case 76: + case 77: + pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; + } + + if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) + has_cstate_core = true; + + if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) + has_cstate_pkg = true; + + return (has_cstate_core || has_cstate_pkg) ? 
0 : -ENODEV; +} + +static void __init cstate_cpumask_init(void) +{ + int cpu; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) + cstate_cpu_init(cpu); + + __perf_cpu_notifier(cstate_cpu_notifier); + + cpu_notifier_register_done(); +} + +static struct pmu cstate_core_pmu = { + .attr_groups = core_attr_groups, + .name = "cstate_core", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static struct pmu cstate_pkg_pmu = { + .attr_groups = pkg_attr_groups, + .name = "cstate_pkg", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static void __init cstate_pmus_register(void) +{ + int err; + + if (has_cstate_core) { + err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_core_pmu.name, err); + } + + if (has_cstate_pkg) { + err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_pkg_pmu.name, err); + } +} + +static int __init cstate_pmu_init(void) +{ + int err; + + if (cpu_has_hypervisor) + return -ENODEV; + + err = cstate_init(); + if (err) + return err; + + cstate_cpumask_init(); + + cstate_pmus_register(); + + return 0; +} + +device_initcall(cstate_pmu_init); diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_ds.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7f73b3553..5db1c7755 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -11,7 +11,7 @@ #define BTS_RECORD_SIZE 24 #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE PAGE_SIZE +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) #define PEBS_FIXUP_SIZE PAGE_SIZE /* @@ -224,6 +224,19 @@ union hsw_tsx_tuning { #define PEBS_HSW_TSX_FLAGS 0xff00000000ULL +/* Same as HSW, plus TSC */ + +struct pebs_record_skl { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; + u64 tsc; +}; + void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -250,7 +263,7 @@ static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; int node = cpu_to_node(cpu); - int max, thresh = 1; /* always use a single PEBS record */ + int max; void *buffer, *ibuffer; if (!x86_pmu.pebs) @@ -280,9 +293,6 @@ static int alloc_pebs_buffer(int cpu) ds->pebs_absolute_maximum = ds->pebs_buffer_base + max * x86_pmu.pebs_record_size; - ds->pebs_interrupt_threshold = ds->pebs_buffer_base + - thresh * x86_pmu.pebs_record_size; - return 0; } @@ -500,10 +510,11 @@ int intel_pmu_drain_bts_buffer(void) u64 flags; }; struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; + struct bts_record *at, *base, *top; struct perf_output_handle handle; struct perf_event_header header; struct perf_sample_data data; + unsigned long skip = 0; struct 
pt_regs regs; if (!event) @@ -512,10 +523,10 @@ int intel_pmu_drain_bts_buffer(void) if (!x86_pmu.bts_active) return 0; - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; + base = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; - if (top <= at) + if (top <= base) return 0; memset(&regs, 0, sizeof(regs)); @@ -525,16 +536,43 @@ int intel_pmu_drain_bts_buffer(void) perf_sample_data_init(&data, 0, event->hw.last_period); /* + * BTS leaks kernel addresses in branches across the cpl boundary, + * such as traps or system calls, so unless the user is asking for + * kernel tracing (and right now it's not possible), we'd need to + * filter them out. But first we need to count how many of those we + * have in the current batch. This is an extra O(n) pass, however, + * it's much faster than the other one especially considering that + * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the + * alloc_bts_buffer()). + */ + for (at = base; at < top; at++) { + /* + * Note that right now *this* BTS code only works if + * attr::exclude_kernel is set, but let's keep this extra + * check here in case that changes. + */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + skip++; + } + + /* * Prepare a generic sample, i.e. fill in the invariant fields. * We will overwrite the from and to address before we output * the sample. */ perf_prepare_sample(&header, &data, event, &regs); - if (perf_output_begin(&handle, event, header.size * (top - at))) + if (perf_output_begin(&handle, event, header.size * + (top - base - skip))) return 1; - for (; at < top; at++) { + for (at = base; at < top; at++) { + /* Filter out any records that contain kernel addresses. */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + continue; + data.ip = at->from; data.addr = at->to; @@ -549,6 +587,19 @@ int intel_pmu_drain_bts_buffer(void) return 1; } +static inline void intel_pmu_drain_pebs_buffer(void) +{ + struct pt_regs regs; + + x86_pmu.drain_pebs(&regs); +} + +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + if (!sched_in) + intel_pmu_drain_pebs_buffer(); +} + /* * PEBS */ @@ -665,6 +716,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_skl_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). 
*/ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -684,25 +757,71 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) return &emptyconstraint; } +static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc) +{ + return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1)); +} + void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; + bool first_pebs; + u64 threshold; hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + first_pebs = !pebs_is_enabled(cpuc); cpuc->pebs_enabled |= 1ULL << hwc->idx; if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled |= 1ULL << 63; + + /* + * When the event is constrained enough we can use a larger + * threshold and run the event with less frequent PMI. 
+ */ + if (hwc->flags & PERF_X86_EVENT_FREERUNNING) { + threshold = ds->pebs_absolute_maximum - + x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; + + if (first_pebs) + perf_sched_cb_inc(event->ctx->pmu); + } else { + threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; + + /* + * If not all events can use larger buffer, + * roll back to threshold = 1 + */ + if (!first_pebs && + (ds->pebs_interrupt_threshold > threshold)) + perf_sched_cb_dec(event->ctx->pmu); + } + + /* Use auto-reload if possible to save a MSR write in the PMI */ + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { + ds->pebs_event_reset[hwc->idx] = + (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; + } + + if (first_pebs || ds->pebs_interrupt_threshold > threshold) + ds->pebs_interrupt_threshold = threshold; } void intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; + bool large_pebs = ds->pebs_interrupt_threshold > + ds->pebs_buffer_base + x86_pmu.pebs_record_size; + + if (large_pebs) + intel_pmu_drain_pebs_buffer(); cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -711,6 +830,9 @@ void intel_pmu_pebs_disable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); + if (large_pebs && !pebs_is_enabled(cpuc)) + perf_sched_cb_dec(event->ctx->pmu); + if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); @@ -827,7 +949,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 0; } -static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) +static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) { if (pebs->tsx_tuning) { union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; @@ -836,7 +958,7 @@ static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) return 0; } -static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) +static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) { u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; @@ -846,8 +968,10 @@ static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) return txn; } -static void __intel_pmu_pebs_event(struct perf_event *event, - struct pt_regs *iregs, void *__pebs) +static void setup_pebs_sample_data(struct perf_event *event, + struct pt_regs *iregs, void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) { #define PERF_X86_EVENT_PEBS_HSW_PREC \ (PERF_X86_EVENT_PEBS_ST_HSW | \ @@ -858,14 +982,12 @@ static void __intel_pmu_pebs_event(struct perf_event *event, * unconditionally access the 'extra' entries. 
*/ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct pebs_record_hsw *pebs = __pebs; - struct perf_sample_data data; - struct pt_regs regs; + struct pebs_record_skl *pebs = __pebs; u64 sample_type; int fll, fst, dsrc; int fl = event->hw.flags; - if (!intel_pmu_save_and_restart(event)) + if (pebs == NULL) return; sample_type = event->attr.sample_type; @@ -874,15 +996,15 @@ static void __intel_pmu_pebs_event(struct perf_event *event, fll = fl & PERF_X86_EVENT_PEBS_LDLAT; fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); - perf_sample_data_init(&data, 0, event->hw.last_period); + perf_sample_data_init(data, 0, event->hw.last_period); - data.period = event->hw.last_period; + data->period = event->hw.last_period; /* * Use latency for weight (only avail with PEBS-LL) */ if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data.weight = pebs->lat; + data->weight = pebs->lat; /* * data.data_src encodes the data source @@ -895,7 +1017,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, val = precise_datala_hsw(event, pebs->dse); else if (fst) val = precise_store_data(pebs->dse); - data.data_src.val = val; + data->data_src.val = val; } /* @@ -908,61 +1030,133 @@ static void __intel_pmu_pebs_event(struct perf_event *event, * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. * A possible PERF_SAMPLE_REGS will have to transfer all regs. */ - regs = *iregs; - regs.flags = pebs->flags; - set_linear_ip(&regs, pebs->ip); - regs.bp = pebs->bp; - regs.sp = pebs->sp; + *regs = *iregs; + regs->flags = pebs->flags; + set_linear_ip(regs, pebs->ip); + regs->bp = pebs->bp; + regs->sp = pebs->sp; if (sample_type & PERF_SAMPLE_REGS_INTR) { - regs.ax = pebs->ax; - regs.bx = pebs->bx; - regs.cx = pebs->cx; - regs.dx = pebs->dx; - regs.si = pebs->si; - regs.di = pebs->di; - regs.bp = pebs->bp; - regs.sp = pebs->sp; - - regs.flags = pebs->flags; + regs->ax = pebs->ax; + regs->bx = pebs->bx; + regs->cx = pebs->cx; + regs->dx = pebs->dx; + regs->si = pebs->si; + regs->di = pebs->di; + regs->bp = pebs->bp; + regs->sp = pebs->sp; + + regs->flags = pebs->flags; #ifndef CONFIG_X86_32 - regs.r8 = pebs->r8; - regs.r9 = pebs->r9; - regs.r10 = pebs->r10; - regs.r11 = pebs->r11; - regs.r12 = pebs->r12; - regs.r13 = pebs->r13; - regs.r14 = pebs->r14; - regs.r15 = pebs->r15; + regs->r8 = pebs->r8; + regs->r9 = pebs->r9; + regs->r10 = pebs->r10; + regs->r11 = pebs->r11; + regs->r12 = pebs->r12; + regs->r13 = pebs->r13; + regs->r14 = pebs->r14; + regs->r15 = pebs->r15; #endif } if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { - regs.ip = pebs->real_ip; - regs.flags |= PERF_EFLAGS_EXACT; - } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs)) - regs.flags |= PERF_EFLAGS_EXACT; + regs->ip = pebs->real_ip; + regs->flags |= PERF_EFLAGS_EXACT; + } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs)) + regs->flags |= PERF_EFLAGS_EXACT; else - regs.flags &= ~PERF_EFLAGS_EXACT; + regs->flags &= ~PERF_EFLAGS_EXACT; if ((sample_type & PERF_SAMPLE_ADDR) && x86_pmu.intel_cap.pebs_format >= 1) - data.addr = pebs->dla; + data->addr = pebs->dla; if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. 
*/ if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data.weight = intel_hsw_weight(pebs); + data->weight = intel_hsw_weight(pebs); if (sample_type & PERF_SAMPLE_TRANSACTION) - data.txn = intel_hsw_transaction(pebs); + data->txn = intel_hsw_transaction(pebs); } + /* + * v3 supplies an accurate time stamp, so we use that + * for the time stamp. + * + * We can only do this for the default trace clock. + */ + if (x86_pmu.intel_cap.pebs_format >= 3 && + event->attr.use_clockid == 0) + data->time = native_sched_clock_from_tsc(pebs->tsc); + if (has_branch_stack(event)) - data.br_stack = &cpuc->lbr_stack; + data->br_stack = &cpuc->lbr_stack; +} + +static inline void * +get_next_pebs_record_by_bit(void *base, void *top, int bit) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + void *at; + u64 pebs_status; + + if (base == NULL) + return NULL; + + for (at = base; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; + + if (test_bit(bit, (unsigned long *)&p->status)) { + /* PEBS v3 has accurate status bits */ + if (x86_pmu.intel_cap.pebs_format >= 3) + return at; + + if (p->status == (1 << bit)) + return at; + + /* clear non-PEBS bit and re-check */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + if (pebs_status == (1 << bit)) + return at; + } + } + return NULL; +} + +static void __intel_pmu_pebs_event(struct perf_event *event, + struct pt_regs *iregs, + void *base, void *top, + int bit, int count) +{ + struct perf_sample_data data; + struct pt_regs regs; + void *at = get_next_pebs_record_by_bit(base, top, bit); + + if (!intel_pmu_save_and_restart(event) && + !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) + return; + + while (count > 1) { + setup_pebs_sample_data(event, iregs, at, &data, &regs); + perf_event_output(event, &data, &regs); + at += x86_pmu.pebs_record_size; + at = get_next_pebs_record_by_bit(at, top, bit); + count--; + } + + setup_pebs_sample_data(event, iregs, at, &data, &regs); - if (perf_event_overflow(event, &data, &regs)) + /* + * All but the last records are processed. + * The last one is left to be able to call the overflow handler. + */ + if (perf_event_overflow(event, &data, &regs)) { x86_pmu_stop(event, 0); + return; + } + } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) @@ -992,72 +1186,99 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) if (!event->attr.precise_ip) return; - n = top - at; + n = (top - at) / x86_pmu.pebs_record_size; if (n <= 0) return; - /* - * Should not happen, we program the threshold at 1 and do not - * set a reset value. 
- */ - WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); - at += n - 1; - - __intel_pmu_pebs_event(event, iregs, at); + __intel_pmu_pebs_event(event, iregs, at, top, 0, n); } static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; - struct perf_event *event = NULL; - void *at, *top; - u64 status = 0; - int bit; + struct perf_event *event; + void *base, *at, *top; + short counts[MAX_PEBS_EVENTS] = {}; + short error[MAX_PEBS_EVENTS] = {}; + int bit, i; if (!x86_pmu.pebs_active) return; - at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; + base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; ds->pebs_index = ds->pebs_buffer_base; - if (unlikely(at > top)) + if (unlikely(base >= top)) return; - /* - * Should not happen, we program the threshold at 1 and do not - * set a reset value. - */ - WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size, - "Unexpected number of pebs records %ld\n", - (long)(top - at) / x86_pmu.pebs_record_size); - - for (; at < top; at += x86_pmu.pebs_record_size) { + for (at = base; at < top; at += x86_pmu.pebs_record_size) { struct pebs_record_nhm *p = at; + u64 pebs_status; - for_each_set_bit(bit, (unsigned long *)&p->status, - x86_pmu.max_pebs_events) { - event = cpuc->events[bit]; - if (!test_bit(bit, cpuc->active_mask)) - continue; + /* PEBS v3 has accurate status bits */ + if (x86_pmu.intel_cap.pebs_format >= 3) { + for_each_set_bit(bit, (unsigned long *)&p->status, + MAX_PEBS_EVENTS) + counts[bit]++; - WARN_ON_ONCE(!event); + continue; + } - if (!event->attr.precise_ip) - continue; + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; - if (__test_and_set_bit(bit, (unsigned long *)&status)) - continue; + bit = find_first_bit((unsigned long *)&pebs_status, + x86_pmu.max_pebs_events); + if (WARN(bit >= x86_pmu.max_pebs_events, + "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx", + (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled, + *(unsigned long long *)cpuc->active_mask)) + continue; - break; + /* + * The PEBS hardware does not deal well with the situation + * when events happen near to each other and multiple bits + * are set. But it should happen rarely. + * + * If these events include one PEBS and multiple non-PEBS + * events, it doesn't impact PEBS record. The record will + * be handled normally. (slow path) + * + * If these events include two or more PEBS events, the + * records for the events can be collapsed into a single + * one, and it's not possible to reconstruct all events + * that caused the PEBS record. It's called collision. + * If collision happened, the record will be dropped. 
+ */ + if (p->status != (1ULL << bit)) { + for_each_set_bit(i, (unsigned long *)&pebs_status, + x86_pmu.max_pebs_events) + error[i]++; + continue; } - if (!event || bit >= x86_pmu.max_pebs_events) + counts[bit]++; + } + + for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { + if ((counts[bit] == 0) && (error[bit] == 0)) continue; - __intel_pmu_pebs_event(event, iregs, at); + event = cpuc->events[bit]; + WARN_ON_ONCE(!event); + WARN_ON_ONCE(!event->attr.precise_ip); + + /* log dropped samples number */ + if (error[bit]) + perf_log_lost_samples(event, error[bit]); + + if (counts[bit]) { + __intel_pmu_pebs_event(event, iregs, base, + top, bit, counts[bit]); + } } } @@ -1098,6 +1319,14 @@ void __init intel_ds_init(void) x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; break; + case 3: + pr_cont("PEBS fmt3%c, ", pebs_type); + x86_pmu.pebs_record_size = + sizeof(struct pebs_record_skl); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; + break; + default: printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 94e5b506c..659f01e16 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -13,7 +13,8 @@ enum { LBR_FORMAT_EIP = 0x02, LBR_FORMAT_EIP_FLAGS = 0x03, LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, }; static enum { @@ -96,6 +97,7 @@ enum { X86_BR_NO_TX = 1 << 14,/* not in transaction */ X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ X86_BR_CALL_STACK = 1 << 16,/* call stack */ + X86_BR_IND_JMP = 1 << 17,/* indirect jump */ }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -113,6 +115,7 @@ enum { X86_BR_IRQ |\ X86_BR_ABORT |\ X86_BR_IND_CALL |\ + X86_BR_IND_JMP |\ X86_BR_ZERO_CALL) #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) @@ -138,13 +141,20 @@ static void __intel_pmu_lbr_enable(bool pmi) u64 debugctl, lbr_select = 0, orig_debugctl; /* + * No need to unfreeze manually, as v4 can do that as part + * of the GLOBAL_STATUS ack. + */ + if (pmi && x86_pmu.version >= 4) + return; + + /* * No need to reprogram LBR_SELECT in a PMI, as it * did not change. 
*/ - if (cpuc->lbr_sel && !pmi) { + if (cpuc->lbr_sel) lbr_select = cpuc->lbr_sel->config; + if (!pmi) wrmsrl(MSR_LBR_SELECT, lbr_select); - } rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; @@ -184,6 +194,8 @@ static void intel_pmu_lbr_reset_64(void) for (i = 0; i < x86_pmu.lbr_nr; i++) { wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + i, 0); } } @@ -227,12 +239,15 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) } mask = x86_pmu.lbr_nr - 1; - tos = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { + tos = task_ctx->tos; + for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + wrmsrl(x86_pmu.lbr_tos, tos); task_ctx->lbr_stack_state = LBR_NONE; } @@ -249,11 +264,14 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { + for (i = 0; i < tos; i++) { lbr_idx = (tos - i) & mask; rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + task_ctx->tos = tos; task_ctx->lbr_stack_state = LBR_VALID; } @@ -262,9 +280,6 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx; - if (!x86_pmu.lbr_nr) - return; - /* * If LBR callstack feature is enabled and the stack was saved when * the task was scheduled out, restore the stack. 
Otherwise flush @@ -412,16 +427,31 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; + int num = x86_pmu.lbr_nr; - for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (cpuc->lbr_sel->config & LBR_CALL_STACK) + num = tos; + + for (i = 0; i < num; i++) { unsigned long lbr_idx = (tos - i) & mask; u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; int skip = 0; + u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); + if (lbr_format == LBR_FORMAT_INFO) { + u64 info; + + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + mis = !!(info & LBR_INFO_MISPRED); + pred = !mis; + in_tx = !!(info & LBR_INFO_IN_TX); + abort = !!(info & LBR_INFO_ABORT); + cycles = (info & LBR_INFO_CYCLES); + } if (lbr_flags & LBR_EIP_FLAGS) { mis = !!(from & LBR_FROM_FLAG_MISPRED); pred = !mis; @@ -451,6 +481,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_entries[out].predicted = pred; cpuc->lbr_entries[out].in_tx = in_tx; cpuc->lbr_entries[out].abort = abort; + cpuc->lbr_entries[out].cycles = cycles; cpuc->lbr_entries[out].reserved = 0; out++; } @@ -523,6 +554,11 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) X86_BR_CALL_STACK; } + if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) + mask |= X86_BR_IND_JMP; + + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -736,7 +772,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) break; case 4: case 5: - ret = X86_BR_JMP; + ret = X86_BR_IND_JMP; break; } break; @@ -844,6 +880,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { */ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, }; static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -856,6 +893,8 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { | LBR_FAR, [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -870,6 +909,8 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; /* core */ @@ -942,6 +983,26 @@ void intel_pmu_lbr_init_hsw(void) pr_cont("16-deep LBR, "); } +/* skylake */ +__init void intel_pmu_lbr_init_skl(void) +{ + x86_pmu.lbr_nr = 32; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + + /* + * SW branch filter usage: + * - support syscall, sysret capture. 
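With the new LBR_FORMAT_INFO format, the read loop above pulls the mispredict, in-transaction and abort flags plus a cycle count out of a single packed MSR_LBR_INFO_x value. A stand-alone sketch of that unpacking; the bit positions mirror the kernel's LBR_INFO_* constants (flags in the top bits, cycles in the low 16 bits), but the MSR value here is fabricated:

    #include <stdint.h>
    #include <stdio.h>

    #define INFO_MISPRED  (1ULL << 63)
    #define INFO_IN_TX    (1ULL << 62)
    #define INFO_ABORT    (1ULL << 61)
    #define INFO_CYCLES   0xffffULL

    int main(void)
    {
        uint64_t info = INFO_MISPRED | 0x2a;    /* fabricated MSR contents */

        int mis    = !!(info & INFO_MISPRED);
        int pred   = !mis;
        int in_tx  = !!(info & INFO_IN_TX);
        int abort_ = !!(info & INFO_ABORT);
        unsigned int cycles = info & INFO_CYCLES;

        printf("mispred=%d pred=%d in_tx=%d abort=%d cycles=%u\n",
               mis, pred, in_tx, abort_, cycles);
        return 0;
    }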
+ * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("32-deep LBR, "); +} + /* atom */ void __init intel_pmu_lbr_init_atom(void) { diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_pt.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_pt.c index 123ff1bb2..868e11943 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -65,15 +65,21 @@ static struct pt_cap_desc { } pt_caps[] = { PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), + PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), + PT_CAP(mtc, 0, CR_EBX, BIT(3)), PT_CAP(topa_output, 0, CR_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), + PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), + PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), + PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), + PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), }; static u32 pt_cap_get(enum pt_capabilities cap) { struct pt_cap_desc *cd = &pt_caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg]; + u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; unsigned int shift = __ffs(cd->mask); return (c & cd->mask) >> shift; @@ -94,12 +100,22 @@ static struct attribute_group pt_cap_group = { .name = "caps", }; +PMU_FORMAT_ATTR(cyc, "config:1" ); +PMU_FORMAT_ATTR(mtc, "config:9" ); PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); +PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); +PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); +PMU_FORMAT_ATTR(psb_period, "config:24-27" ); static struct attribute *pt_formats_attr[] = { + &format_attr_cyc.attr, + &format_attr_mtc.attr, &format_attr_tsc.attr, &format_attr_noretcomp.attr, + &format_attr_mtc_period.attr, + &format_attr_cyc_thresh.attr, + &format_attr_psb_period.attr, NULL, }; @@ -123,16 +139,13 @@ static int __init pt_pmu_hw_init(void) long i; attrs = NULL; - ret = -ENODEV; - if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) - goto fail; for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i*4], - &pt_pmu.caps[CR_EBX + i*4], - &pt_pmu.caps[CR_ECX + i*4], - &pt_pmu.caps[CR_EDX + i*4]); + &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); } ret = -ENOMEM; @@ -170,15 +183,65 @@ fail: return ret; } -#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC) +#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \ + RTIT_CTL_CYC_THRESH | \ + RTIT_CTL_PSB_FREQ) + +#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ + RTIT_CTL_MTC_RANGE) + +#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ + RTIT_CTL_DISRETC | \ + RTIT_CTL_CYC_PSB | \ + RTIT_CTL_MTC) static bool pt_event_valid(struct perf_event *event) { u64 config = event->attr.config; + u64 allowed, requested; if ((config & PT_CONFIG_MASK) != config) return false; + if (config & RTIT_CTL_CYC_PSB) { + if (!pt_cap_get(PT_CAP_psb_cyc)) + return false; + + allowed = pt_cap_get(PT_CAP_psb_periods); + requested = (config & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + return false; + + allowed = pt_cap_get(PT_CAP_cycle_thresholds); + requested = (config & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + return false; + } + + if (config & RTIT_CTL_MTC) { + /* + * In the unlikely case that CPUID lists valid mtc periods, + * but not 
the mtc capability, drop out here. + * + * Spec says that setting mtc period bits while mtc bit in + * CPUID is 0 will #GP, so better safe than sorry. + */ + if (!pt_cap_get(PT_CAP_mtc)) + return false; + + allowed = pt_cap_get(PT_CAP_mtc_periods); + if (!allowed) + return false; + + requested = (config & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET; + + if (!(allowed & BIT(requested))) + return false; + } + return true; } @@ -187,19 +250,15 @@ static bool pt_event_valid(struct perf_event *event) * These all are cpu affine and operate on a local PT */ -static bool pt_is_running(void) -{ - u64 ctl; - - rdmsrl(MSR_IA32_RTIT_CTL, ctl); - - return !!(ctl & RTIT_CTL_TRACEEN); -} - static void pt_config(struct perf_event *event) { u64 reg; + if (!event->hw.itrace_started) { + event->hw.itrace_started = 1; + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + } + reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; if (!event->attr.exclude_kernel) @@ -609,7 +668,12 @@ static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) * @handle: Current output handle. * * Place INT and STOP marks to prevent overwriting old data that the consumer - * hasn't yet collected. + * hasn't yet collected and waking up the consumer after a certain fraction of + * the buffer has filled up. Only needed and sensible for non-snapshot counters. + * + * This obviously relies on buf::head to figure out buffer markers, so it has + * to be called after pt_buffer_reset_offsets() and before the hardware tracing + * is enabled. */ static int pt_buffer_reset_markers(struct pt_buffer *buf, struct perf_output_handle *handle) @@ -618,9 +682,6 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, unsigned long head = local64_read(&buf->head); unsigned long idx, npages, wakeup; - if (buf->snapshot) - return 0; - /* can't stop in the middle of an output region */ if (buf->output_off + handle->size + 1 < sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) @@ -674,7 +735,7 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf) struct topa *cur = buf->first, *prev = buf->last; struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), *te_prev = TOPA_ENTRY(prev, prev->last - 1); - int pg = 0, idx = 0, ntopa = 0; + int pg = 0, idx = 0; while (pg < buf->nr_pages) { int tidx; @@ -689,9 +750,9 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf) /* advance to next topa table */ idx = 0; cur = list_entry(cur->list.next, struct topa, list); - ntopa++; - } else + } else { idx++; + } te_cur = TOPA_ENTRY(cur, idx); } @@ -703,7 +764,14 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf) * @head: Write pointer (aux_head) from AUX buffer. * * Find the ToPA table and entry corresponding to given @head and set buffer's - * "current" pointers accordingly. + * "current" pointers accordingly. This is done after we have obtained the + * current aux_head position from a successful call to perf_aux_output_begin() + * to make sure the hardware is writing to the right place. + * + * This function modifies buf::{cur,cur_idx,output_off} that will be programmed + * into PT msrs when the tracing is enabled and buf::head and buf::data_size, + * which are used to determine INT and STOP markers' locations by a subsequent + * call to pt_buffer_reset_markers(). 
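pt_event_valid() above treats each CPUID capability word as a bitmap of supported period encodings and accepts a request only if the bit for that encoding is set (per the comment, programming an unadvertised MTC period would #GP). A minimal sketch of that check, with a made-up capability bitmap:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* accept encoding n only if the capability bitmap advertises bit n */
    static bool encoding_allowed(uint32_t allowed_bitmap, unsigned int requested)
    {
        if (requested > 31)
            return false;
        return allowed_bitmap & (1u << requested);
    }

    int main(void)
    {
        uint32_t mtc_periods = 0x0249;  /* pretend CPUID advertises 0, 3, 6, 9 */

        for (unsigned int enc = 0; enc < 12; enc++)
            printf("MTC period encoding %2u: %s\n", enc,
                   encoding_allowed(mtc_periods, enc) ? "ok" : "rejected");
        return 0;
    }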
*/ static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) { @@ -901,6 +969,7 @@ void intel_pt_interrupt(void) } pt_buffer_reset_offsets(buf, pt->handle.head); + /* snapshot counters don't use PMI, so it's safe */ ret = pt_buffer_reset_markers(buf, &pt->handle); if (ret) { perf_aux_output_end(&pt->handle, 0, true); @@ -909,7 +978,6 @@ void intel_pt_interrupt(void) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } } @@ -923,7 +991,7 @@ static void pt_event_start(struct perf_event *event, int mode) struct pt *pt = this_cpu_ptr(&pt_ctx); struct pt_buffer *buf = perf_get_aux(&pt->handle); - if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) { + if (!buf || pt_buffer_is_full(buf, pt)) { event->hw.state = PERF_HES_STOPPED; return; } @@ -933,7 +1001,6 @@ static void pt_event_start(struct perf_event *event, int mode) pt_config_buffer(buf->cur->table, buf->cur_idx, buf->output_off); - wrmsrl(MSR_IA32_RTIT_STATUS, 0); pt_config(event); } @@ -954,7 +1021,6 @@ static void pt_event_stop(struct perf_event *event, int mode) event->hw.state = PERF_HES_STOPPED; if (mode & PERF_EF_UPDATE) { - struct pt *pt = this_cpu_ptr(&pt_ctx); struct pt_buffer *buf = perf_get_aux(&pt->handle); if (!buf) @@ -1061,6 +1127,10 @@ static __init int pt_init(void) int ret, cpu, prior_warn = 0; BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); + + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + return -ENODEV; + get_online_cpus(); for_each_online_cpu(cpu) { u64 ctl; @@ -1106,5 +1176,4 @@ static __init int pt_init(void) return ret; } - -module_init(pt_init); +arch_initcall(pt_init); diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 358c54ad2..ed446bdcb 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -86,6 +86,10 @@ static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { 1<<RAPL_IDX_RAM_NRG_STAT|\ 1<<RAPL_IDX_PP1_NRG_STAT) +/* Knights Landing has PKG, RAM */ +#define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT) + /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved @@ -103,12 +107,6 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ static struct kobj_attribute format_attr_##_var = \ __ATTR(_name, 0444, __rapl_##_var##_show, NULL) -#define RAPL_EVENT_DESC(_name, _config) \ -{ \ - .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ - .config = _config, \ -} - #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ #define RAPL_EVENT_ATTR_STR(_name, v, str) \ @@ -204,9 +202,8 @@ again: static void rapl_start_hrtimer(struct rapl_pmu *pmu) { - __hrtimer_start_range_ns(&pmu->hrtimer, - pmu->timer_interval, 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&pmu->hrtimer, pmu->timer_interval, + HRTIMER_MODE_REL_PINNED); } static void rapl_stop_hrtimer(struct rapl_pmu *pmu) @@ -487,6 +484,18 @@ static struct attribute *rapl_events_hsw_attr[] = { NULL, }; +static struct attribute *rapl_events_knl_attr[] = { + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + static struct attribute_group rapl_pmu_events_group = { .name = "events", .attrs = NULL, /* patched at runtime */ @@ -731,6 +740,10 @@ static int __init rapl_pmu_init(void) rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs 
= rapl_events_srv_attr; break; + case 87: /* Knights Landing */ + rapl_add_quirk(rapl_hsw_server_quirk); + rapl_cntr_mask = RAPL_IDX_KNL; + rapl_pmu_events_group.attrs = rapl_events_knl_attr; default: /* unsupported */ diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 90b7c501c..61215a69b 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; /* pci bus to socket mapping */ -int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, }; +DEFINE_RAW_SPINLOCK(pci2phy_map_lock); +struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; static DEFINE_RAW_SPINLOCK(uncore_box_lock); @@ -20,6 +21,59 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); +int uncore_pcibus_to_physid(struct pci_bus *bus) +{ + struct pci2phy_map *map; + int phys_id = -1; + + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == pci_domain_nr(bus)) { + phys_id = map->pbus_to_physid[bus->number]; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + + return phys_id; +} + +struct pci2phy_map *__find_pci2phy_map(int segment) +{ + struct pci2phy_map *map, *alloc = NULL; + int i; + + lockdep_assert_held(&pci2phy_map_lock); + +lookup: + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == segment) + goto end; + } + + if (!alloc) { + raw_spin_unlock(&pci2phy_map_lock); + alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL); + raw_spin_lock(&pci2phy_map_lock); + + if (!alloc) + return NULL; + + goto lookup; + } + + map = alloc; + alloc = NULL; + map->segment = segment; + for (i = 0; i < 256; i++) + map->pbus_to_physid[i] = -1; + list_add_tail(&map->list, &pci2phy_map_head); + +end: + kfree(alloc); + return map; +} + ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -233,9 +287,8 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) { - __hrtimer_start_range_ns(&box->hrtimer, - ns_to_ktime(box->hrtimer_duration), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), + HRTIMER_MODE_REL_PINNED); } void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) @@ -810,7 +863,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id int phys_id; bool first_box = false; - phys_id = uncore_pcibus_to_physid[pdev->bus->number]; + phys_id = uncore_pcibus_to_physid(pdev->bus); if (phys_id < 0) return -ENODEV; @@ -857,9 +910,10 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); struct intel_uncore_pmu *pmu; - int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number]; + int i, cpu, phys_id; bool last_box = false; + phys_id = uncore_pcibus_to_physid(pdev->bus); box = pci_get_drvdata(pdev); if (!box) { for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { @@ -912,6 +966,9 @@ static int __init uncore_pci_init(void) case 63: /* Haswell-EP */ ret = hswep_uncore_pci_init(); break; + case 86: /* BDX-DE */ + ret = bdx_uncore_pci_init(); + break; case 
42: /* Sandy Bridge */ ret = snb_uncore_pci_init(); break; @@ -922,6 +979,9 @@ static int __init uncore_pci_init(void) case 69: /* Haswell Celeron */ ret = hsw_uncore_pci_init(); break; + case 61: /* Broadwell */ + ret = bdw_uncore_pci_init(); + break; default: return 0; } @@ -1207,6 +1267,11 @@ static int __init uncore_cpu_init(void) break; case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ + case 60: /* Haswell */ + case 69: /* Haswell */ + case 70: /* Haswell */ + case 61: /* Broadwell */ + case 71: /* Broadwell */ snb_uncore_cpu_init(); break; case 45: /* Sandy Bridge-EP */ @@ -1222,6 +1287,9 @@ static int __init uncore_cpu_init(void) case 63: /* Haswell-EP */ hswep_uncore_cpu_init(); break; + case 86: /* BDX-DE */ + bdx_uncore_cpu_init(); + break; default: return 0; } diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.h index ceac8f5dc..2f0a4a98e 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -117,6 +117,15 @@ struct uncore_event_desc { const char *config; }; +struct pci2phy_map { + struct list_head list; + int segment; + int pbus_to_physid[256]; +}; + +int uncore_pcibus_to_physid(struct pci_bus *bus); +struct pci2phy_map *__find_pci2phy_map(int segment); + ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); @@ -317,7 +326,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; extern struct pci_driver *uncore_pci_driver; -extern int uncore_pcibus_to_physid[256]; +extern raw_spinlock_t pci2phy_map_lock; +extern struct list_head pci2phy_map_head; extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; extern struct event_constraint uncore_constraint_empty; @@ -325,6 +335,7 @@ extern struct event_constraint uncore_constraint_empty; int snb_uncore_pci_init(void); int ivb_uncore_pci_init(void); int hsw_uncore_pci_init(void); +int bdw_uncore_pci_init(void); void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); @@ -335,6 +346,8 @@ int ivbep_uncore_pci_init(void); void ivbep_uncore_cpu_init(void); int hswep_uncore_pci_init(void); void hswep_uncore_cpu_init(void); +int bdx_uncore_pci_init(void); +void bdx_uncore_cpu_init(void); /* perf_event_intel_uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index 4562e9e22..845256158 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -7,6 +7,7 @@ #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 #define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 +#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -44,6 +45,11 @@ #define SNB_UNC_CBO_0_PER_CTR0 0x706 #define SNB_UNC_CBO_MSR_OFFSET 0x10 +/* SNB ARB register */ +#define SNB_UNC_ARB_PER_CTR0 0x3b0 +#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2 +#define SNB_UNC_ARB_MSR_OFFSET 0x10 + /* NHM global control register */ #define NHM_UNC_PERF_GLOBAL_CTL 0x391 #define NHM_UNC_FIXED_CTR 0x394 @@ -114,7 +120,7 @@ static struct intel_uncore_ops snb_uncore_msr_ops = { .read_counter = uncore_msr_read_counter, }; -static struct event_constraint 
snb_uncore_cbox_constraints[] = { +static struct event_constraint snb_uncore_arb_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x80, 0x1), UNCORE_EVENT_CONSTRAINT(0x83, 0x1), EVENT_CONSTRAINT_END @@ -133,14 +139,28 @@ static struct intel_uncore_type snb_uncore_cbox = { .single_fixed = 1, .event_mask = SNB_UNC_RAW_EVENT_MASK, .msr_offset = SNB_UNC_CBO_MSR_OFFSET, - .constraints = snb_uncore_cbox_constraints, .ops = &snb_uncore_msr_ops, .format_group = &snb_uncore_format_group, .event_descs = snb_uncore_events, }; +static struct intel_uncore_type snb_uncore_arb = { + .name = "arb", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .perf_ctr = SNB_UNC_ARB_PER_CTR0, + .event_ctl = SNB_UNC_ARB_PERFEVTSEL0, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_ARB_MSR_OFFSET, + .constraints = snb_uncore_arb_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + static struct intel_uncore_type *snb_msr_uncores[] = { &snb_uncore_cbox, + &snb_uncore_arb, NULL, }; @@ -400,15 +420,25 @@ static void snb_uncore_imc_event_del(struct perf_event *event, int flags) static int snb_pci2phy_map_init(int devid) { struct pci_dev *dev = NULL; - int bus; + struct pci2phy_map *map; + int bus, segment; dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); if (!dev) return -ENOTTY; bus = dev->bus->number; - - uncore_pcibus_to_physid[bus] = 0; + segment = pci_domain_nr(dev->bus); + + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + pci_dev_put(dev); + return -ENOMEM; + } + map->pbus_to_physid[bus] = 0; + raw_spin_unlock(&pci2phy_map_lock); pci_dev_put(dev); @@ -486,6 +516,14 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = { { /* end: all zeroes */ }, }; +static const struct pci_device_id bdw_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + static struct pci_driver snb_uncore_pci_driver = { .name = "snb_uncore", .id_table = snb_uncore_pci_ids, @@ -501,6 +539,11 @@ static struct pci_driver hsw_uncore_pci_driver = { .id_table = hsw_uncore_pci_ids, }; +static struct pci_driver bdw_uncore_pci_driver = { + .name = "bdw_uncore", + .id_table = bdw_uncore_pci_ids, +}; + struct imc_uncore_pci_dev { __u32 pci_id; struct pci_driver *driver; @@ -514,6 +557,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ + IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ { /* end marker */ } }; @@ -561,6 +605,11 @@ int hsw_uncore_pci_init(void) return imc_uncore_pci_init(); } +int bdw_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + /* end of Sandy Bridge uncore support */ /* Nehalem uncore support */ diff --git a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 6d6e85dd5..f0f4fcba2 100644 --- a/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/kernel/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -1087,7 +1087,8 @@ static struct pci_driver snbep_uncore_pci_driver = { static int snbep_pci2phy_map_init(int devid) { struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid; 
+ int i, bus, nodeid, segment; + struct pci2phy_map *map; int err = 0; u32 config = 0; @@ -1106,16 +1107,27 @@ static int snbep_pci2phy_map_init(int devid) err = pci_read_config_dword(ubox_dev, 0x54, &config); if (err) break; + + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } + /* * every three bits in the Node ID mapping register maps * to a particular node. */ for (i = 0; i < 8; i++) { if (nodeid == ((config >> (3 * i)) & 0x7)) { - uncore_pcibus_to_physid[bus] = i; + map->pbus_to_physid[bus] = i; break; } } + raw_spin_unlock(&pci2phy_map_lock); } if (!err) { @@ -1123,13 +1135,17 @@ static int snbep_pci2phy_map_init(int devid) * For PCI bus with no UBOX device, find the next bus * that has UBOX device and use its mapping. */ - i = -1; - for (bus = 255; bus >= 0; bus--) { - if (uncore_pcibus_to_physid[bus] >= 0) - i = uncore_pcibus_to_physid[bus]; - else - uncore_pcibus_to_physid[bus] = i; + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + i = -1; + for (bus = 255; bus >= 0; bus--) { + if (map->pbus_to_physid[bus] >= 0) + i = map->pbus_to_physid[bus]; + else + map->pbus_to_physid[bus] = i; + } } + raw_spin_unlock(&pci2phy_map_lock); } pci_dev_put(ubox_dev); @@ -2215,7 +2231,7 @@ static struct intel_uncore_type *hswep_pci_uncores[] = { NULL, }; -static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = { +static const struct pci_device_id hswep_uncore_pci_ids[] = { { /* Home Agent 0 */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30), .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0), @@ -2321,3 +2337,167 @@ int hswep_uncore_pci_init(void) return 0; } /* end of Haswell-EP uncore support */ + +/* BDX-DE uncore support */ + +static struct intel_uncore_type bdx_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_ubox_format_group, +}; + +static struct event_constraint bdx_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x09, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = bdx_uncore_cbox_constraints, + .ops = &hswep_uncore_cbox_ops, + .format_group = &hswep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type *bdx_msr_uncores[] = { + &bdx_uncore_ubox, + &bdx_uncore_cbox, + &hswep_uncore_pcu, + NULL, +}; + +void bdx_uncore_cpu_init(void) +{ + if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = bdx_msr_uncores; +} + +static struct intel_uncore_type bdx_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct 
intel_uncore_type bdx_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 2, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = hswep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &hswep_uncore_irp_ops, + .format_group = &snbep_uncore_format_group, +}; + + +static struct event_constraint bdx_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x1), + UNCORE_EVENT_CONSTRAINT(0x25, 0x1), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = bdx_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + BDX_PCI_UNCORE_HA, + BDX_PCI_UNCORE_IMC, + BDX_PCI_UNCORE_IRP, + BDX_PCI_UNCORE_R2PCIE, +}; + +static struct intel_uncore_type *bdx_pci_uncores[] = { + [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha, + [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc, + [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp, + [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie, + NULL, +}; + +static const struct pci_device_id bdx_uncore_pci_ids[] = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver bdx_uncore_pci_driver = { + .name = "bdx_uncore", + .id_table = bdx_uncore_pci_ids, +}; + +int bdx_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x6f1e); + + if (ret) + return ret; + uncore_pci_uncores = bdx_pci_uncores; + uncore_pci_driver = &bdx_uncore_pci_driver; + return 0; +} + +/* end of BDX-DE uncore support */ diff --git a/kernel/arch/x86/kernel/cpu/perf_event_msr.c b/kernel/arch/x86/kernel/cpu/perf_event_msr.c new file mode 100644 index 000000000..ec863b9a9 --- /dev/null +++ b/kernel/arch/x86/kernel/cpu/perf_event_msr.c @@ -0,0 +1,241 @@ +#include <linux/perf_event.h> + +enum perf_msr_id { + PERF_MSR_TSC = 0, + PERF_MSR_APERF = 1, + PERF_MSR_MPERF = 2, + PERF_MSR_PPERF = 3, + PERF_MSR_SMI = 4, + + PERF_MSR_EVENT_MAX, +}; + +static bool test_aperfmperf(int idx) +{ + return boot_cpu_has(X86_FEATURE_APERFMPERF); +} + +static bool test_intel(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + + case 42: /* 
32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_MSR_SMI) + return true; + break; + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) + return true; + break; + } + + return false; +} + +struct perf_msr { + u64 msr; + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); +}; + +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); + +static struct perf_msr msr[] = { + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, +}; + +static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group events_attr_group = { + .name = "events", + .attrs = events_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); +static struct attribute *format_attrs[] = { + &format_attr_event.attr, + NULL, +}; +static struct attribute_group format_attr_group = { + .name = "format", + .attrs = format_attrs, +}; + +static const struct attribute_group *attr_groups[] = { + &events_attr_group, + &format_attr_group, + NULL, +}; + +static int msr_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (cfg >= PERF_MSR_EVENT_MAX) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + if (!msr[cfg].attr) + return -EINVAL; + + event->hw.idx = -1; + event->hw.event_base = msr[cfg].msr; + event->hw.config = cfg; + + return 0; +} + +static inline u64 msr_read_counter(struct perf_event *event) +{ + u64 now; + + if (event->hw.event_base) + rdmsrl(event->hw.event_base, now); + else + rdtscll(now); + + return now; +} +static void msr_event_update(struct perf_event *event) +{ + u64 prev, now; + s64 delta; + + /* Careful, an NMI might modify the previous event value. 
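The sysfs event strings above (event=0x00 through event=0x04) are what user space puts into perf_event_attr.config for this driver, which is registered as the "msr" PMU later in the file and rejects sampling in msr_event_init(). Purely as an illustration of consuming it from user space, with minimal error handling and an assumed one-second measurement window:

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>

    static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
    {
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
        unsigned int type;
        FILE *f = fopen("/sys/bus/event_source/devices/msr/type", "r");

        if (!f || fscanf(f, "%u", &type) != 1) {
            fprintf(stderr, "msr PMU not available\n");
            return 1;
        }
        fclose(f);

        struct perf_event_attr attr;
        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = 0x01;             /* aperf, per the event list above */

        /* counting only: the event_init above rejects sample_period */
        int fd = perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0) {
            perror("perf_event_open");
            return 1;
        }

        sleep(1);

        uint64_t count;
        if (read(fd, &count, sizeof(count)) == sizeof(count))
            printf("APERF ticks over ~1s: %llu\n",
                   (unsigned long long)count);
        return 0;
    }

The same open-then-read pattern applies to the other events in the list; only attr.config changes.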
*/ +again: + prev = local64_read(&event->hw.prev_count); + now = msr_read_counter(event); + + if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) + delta = sign_extend64(delta, 31); + + local64_add(now - prev, &event->count); +} + +static void msr_event_start(struct perf_event *event, int flags) +{ + u64 now; + + now = msr_read_counter(event); + local64_set(&event->hw.prev_count, now); +} + +static void msr_event_stop(struct perf_event *event, int flags) +{ + msr_event_update(event); +} + +static void msr_event_del(struct perf_event *event, int flags) +{ + msr_event_stop(event, PERF_EF_UPDATE); +} + +static int msr_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + msr_event_start(event, flags); + + return 0; +} + +static struct pmu pmu_msr = { + .task_ctx_nr = perf_sw_context, + .attr_groups = attr_groups, + .event_init = msr_event_init, + .add = msr_event_add, + .del = msr_event_del, + .start = msr_event_start, + .stop = msr_event_stop, + .read = msr_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static int __init msr_init(void) +{ + int i, j = 0; + + if (!boot_cpu_has(X86_FEATURE_TSC)) { + pr_cont("no MSR PMU driver.\n"); + return 0; + } + + /* Probe the MSRs. */ + for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { + u64 val; + + /* + * Virt sucks arse; you cannot tell if a R/O MSR is present :/ + */ + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; + } + + /* List remaining MSRs in the sysfs attrs. */ + for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; + } + events_attrs[j] = NULL; + + perf_pmu_register(&pmu_msr, "msr", -1); + + return 0; +} +device_initcall(msr_init); diff --git a/kernel/arch/x86/kernel/cpu/proc.c b/kernel/arch/x86/kernel/cpu/proc.c index e7d8c7608..18ca99f27 100644 --- a/kernel/arch/x86/kernel/cpu/proc.c +++ b/kernel/arch/x86/kernel/cpu/proc.c @@ -12,7 +12,8 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, { #ifdef CONFIG_SMP seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); - seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu))); + seq_printf(m, "siblings\t: %d\n", + cpumask_weight(topology_core_cpumask(cpu))); seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); seq_printf(m, "apicid\t\t: %d\n", c->apicid); diff --git a/kernel/arch/x86/kernel/cpu/scattered.c b/kernel/arch/x86/kernel/cpu/scattered.c index 3d423a101..608fb26c7 100644 --- a/kernel/arch/x86/kernel/cpu/scattered.c +++ b/kernel/arch/x86/kernel/cpu/scattered.c @@ -37,7 +37,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 }, - { X86_FEATURE_HWP_NOITFY, CR_EAX, 8, 0x00000006, 0 }, + { X86_FEATURE_HWP_NOTIFY, CR_EAX, 8, 0x00000006, 0 }, { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, diff --git a/kernel/arch/x86/kernel/cpuid.c b/kernel/arch/x86/kernel/cpuid.c index 83741a715..bd3507da3 100644 --- a/kernel/arch/x86/kernel/cpuid.c +++ b/kernel/arch/x86/kernel/cpuid.c @@ -170,7 +170,7 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb, return notifier_from_errno(err); } -static struct 
notifier_block __refdata cpuid_class_cpu_notifier = +static struct notifier_block cpuid_class_cpu_notifier = { .notifier_call = cpuid_class_cpu_callback, }; diff --git a/kernel/arch/x86/kernel/crash.c b/kernel/arch/x86/kernel/crash.c index c76d3e37c..2c1910f67 100644 --- a/kernel/arch/x86/kernel/crash.c +++ b/kernel/arch/x86/kernel/crash.c @@ -22,6 +22,7 @@ #include <linux/elfcore.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -74,8 +75,6 @@ struct crash_memmap_data { unsigned int type; }; -int in_crash_kexec; - /* * This is used to VMCLEAR all VMCSs loaded on the * processor. And when loading kvm_intel module, the @@ -131,7 +130,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) static void kdump_nmi_shootdown_cpus(void) { - in_crash_kexec = 1; nmi_shootdown_cpus(kdump_nmi_callback); disable_local_APIC(); @@ -184,10 +182,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs) } #ifdef CONFIG_KEXEC_FILE -static int get_nr_ram_ranges_callback(unsigned long start_pfn, - unsigned long nr_pfn, void *arg) +static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) { - int *nr_ranges = arg; + unsigned int *nr_ranges = arg; (*nr_ranges)++; return 0; @@ -213,7 +210,7 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced, ced->image = image; - walk_system_ram_range(0, -1, &nr_ranges, + walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); ced->max_nr_ranges = nr_ranges; diff --git a/kernel/arch/x86/kernel/devicetree.c b/kernel/arch/x86/kernel/devicetree.c index 6367a780c..1f4acd68b 100644 --- a/kernel/arch/x86/kernel/devicetree.c +++ b/kernel/arch/x86/kernel/devicetree.c @@ -4,7 +4,6 @@ #include <linux/bootmem.h> #include <linux/export.h> #include <linux/io.h> -#include <linux/irqdomain.h> #include <linux/interrupt.h> #include <linux/list.h> #include <linux/of.h> @@ -17,6 +16,7 @@ #include <linux/of_pci.h> #include <linux/initrd.h> +#include <asm/irqdomain.h> #include <asm/hpet.h> #include <asm/apic.h> #include <asm/pci_x86.h> @@ -65,7 +65,7 @@ static int __init add_bus_probe(void) return of_platform_bus_probe(NULL, ce4100_ids, NULL); } -module_init(add_bus_probe); +device_initcall(add_bus_probe); #ifdef CONFIG_PCI struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) @@ -196,38 +196,31 @@ static struct of_ioapic_type of_ioapic_type[] = }, }; -static int ioapic_xlate(struct irq_domain *domain, - struct device_node *controller, - const u32 *intspec, u32 intsize, - irq_hw_number_t *out_hwirq, u32 *out_type) +static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { + struct of_phandle_args *irq_data = (void *)arg; struct of_ioapic_type *it; - u32 line, idx, gsi; + struct irq_alloc_info tmp; - if (WARN_ON(intsize < 2)) + if (WARN_ON(irq_data->args_count < 2)) return -EINVAL; - - line = intspec[0]; - - if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) + if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - it = &of_ioapic_type[intspec[1]]; + it = &of_ioapic_type[irq_data->args[1]]; + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); + tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic_pin = irq_data->args[0]; - idx = (u32)(long)domain->host_data; - gsi = mp_pin_to_gsi(idx, line); - if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0))) - return -EBUSY; - - *out_hwirq = line; - *out_type = it->out_type; - return 0; + return 
mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } -const struct irq_domain_ops ioapic_irq_domain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, - .xlate = ioapic_xlate, +static const struct irq_domain_ops ioapic_irq_domain_ops = { + .alloc = dt_irqdomain_alloc, + .free = mp_irqdomain_free, + .activate = mp_irqdomain_activate, + .deactivate = mp_irqdomain_deactivate, }; static void __init dtb_add_ioapic(struct device_node *dn) diff --git a/kernel/arch/x86/kernel/e820.c b/kernel/arch/x86/kernel/e820.c index e2ce85db2..569c1e4f9 100644 --- a/kernel/arch/x86/kernel/e820.c +++ b/kernel/arch/x86/kernel/e820.c @@ -149,6 +149,7 @@ static void __init e820_print_type(u32 type) case E820_UNUSABLE: printk(KERN_CONT "unusable"); break; + case E820_PMEM: case E820_PRAM: printk(KERN_CONT "persistent (type %u)", type); break; @@ -910,7 +911,7 @@ void __init finish_e820_parsing(void) } } -static inline const char *e820_type_to_string(int e820_type) +static const char *e820_type_to_string(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -918,11 +919,32 @@ static inline const char *e820_type_to_string(int e820_type) case E820_ACPI: return "ACPI Tables"; case E820_NVS: return "ACPI Non-volatile Storage"; case E820_UNUSABLE: return "Unusable memory"; - case E820_PRAM: return "Persistent RAM"; + case E820_PRAM: return "Persistent Memory (legacy)"; + case E820_PMEM: return "Persistent Memory"; default: return "reserved"; } } +static bool do_mark_busy(u32 type, struct resource *res) +{ + /* this is the legacy bios/dos rom-shadow + mmio region */ + if (res->start < (1ULL<<20)) + return true; + + /* + * Treat persistent memory like device memory, i.e. reserve it + * for exclusive use of a driver + */ + switch (type) { + case E820_RESERVED: + case E820_PRAM: + case E820_PMEM: + return false; + default: + return true; + } +} + /* * Mark e820 reserved areas as busy for the resource manager. 
*/ @@ -952,9 +974,7 @@ void __init e820_reserve_resources(void) * pci device BAR resource and insert them later in * pcibios_resource_survey() */ - if (((e820.map[i].type != E820_RESERVED) && - (e820.map[i].type != E820_PRAM)) || - res->start < (1ULL<<20)) { + if (do_mark_busy(e820.map[i].type, res)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } @@ -1123,7 +1143,8 @@ void __init memblock_find_dma_reserve(void) nr_pages += end_pfn - start_pfn; } - for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) { + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) { start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); if (start_pfn < end_pfn) diff --git a/kernel/arch/x86/kernel/early-quirks.c b/kernel/arch/x86/kernel/early-quirks.c index fe9f0b79a..db9a675e7 100644 --- a/kernel/arch/x86/kernel/early-quirks.c +++ b/kernel/arch/x86/kernel/early-quirks.c @@ -546,6 +546,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = { INTEL_BDW_D_IDS(&gen8_stolen_funcs), INTEL_CHV_IDS(&chv_stolen_funcs), INTEL_SKL_IDS(&gen9_stolen_funcs), + INTEL_BXT_IDS(&gen9_stolen_funcs), }; static void __init intel_graphics_stolen(int num, int slot, int func) @@ -583,7 +584,7 @@ static void __init intel_graphics_stolen(int num, int slot, int func) static void __init force_disable_hpet(int num, int slot, int func) { #ifdef CONFIG_HPET_TIMER - boot_hpet_disable = 1; + boot_hpet_disable = true; pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n"); #endif } @@ -627,8 +628,12 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, QFLAG_APPLY_ONCE, intel_graphics_stolen }, /* - * HPET on current version of Baytrail platform has accuracy - * problems, disable it for now: + * HPET on the current version of the Baytrail platform has accuracy + * problems: it will halt in deep idle state - so we disable it. 
+ * + * More details can be found in section 18.10.1.3 of the datasheet: + * + * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, diff --git a/kernel/arch/x86/kernel/early_printk.c b/kernel/arch/x86/kernel/early_printk.c index 89427d8d4..21bf92490 100644 --- a/kernel/arch/x86/kernel/early_printk.c +++ b/kernel/arch/x86/kernel/early_printk.c @@ -175,7 +175,9 @@ static __init void early_serial_init(char *s) } if (*s) { - if (kstrtoul(s, 0, &baud) < 0 || baud == 0) + baud = simple_strtoull(s, &e, 0); + + if (baud == 0 || s == e) baud = DEFAULT_BAUD; } @@ -193,14 +195,14 @@ static __init void early_serial_init(char *s) #ifdef CONFIG_PCI static void mem32_serial_out(unsigned long addr, int offset, int value) { - u32 *vaddr = (u32 *)addr; + u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ writel(value, vaddr + offset); } static unsigned int mem32_serial_in(unsigned long addr, int offset) { - u32 *vaddr = (u32 *)addr; + u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ return readl(vaddr + offset); } @@ -314,7 +316,7 @@ static struct console early_serial_console = { .index = -1, }; -static inline void early_console_register(struct console *con, int keep_early) +static void early_console_register(struct console *con, int keep_early) { if (con->index != -1) { printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", diff --git a/kernel/arch/x86/kernel/entry_32.S b/kernel/arch/x86/kernel/entry_32.S deleted file mode 100644 index 8612b314a..000000000 --- a/kernel/arch/x86/kernel/entry_32.S +++ /dev/null @@ -1,1417 +0,0 @@ -/* - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * This also contains the timer-interrupt handler, as well as all interrupts - * and faults that can result in a task-switch. - * - * NOTE: This code handles signal-recognition, which happens every time - * after a timer-interrupt and after each system call. - * - * I changed all the .align's to 4 (16 byte alignment), as that's faster - * on a 486. - * - * Stack layout in 'syscall_exit': - * ptrace needs to have all regs on the stack. - * if the order here is changed, it needs to be - * updated in fork.c:copy_process, signal.c:do_signal, - * ptrace.c and ptrace.h - * - * 0(%esp) - %ebx - * 4(%esp) - %ecx - * 8(%esp) - %edx - * C(%esp) - %esi - * 10(%esp) - %edi - * 14(%esp) - %ebp - * 18(%esp) - %eax - * 1C(%esp) - %ds - * 20(%esp) - %es - * 24(%esp) - %fs - * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS - * 2C(%esp) - orig_eax - * 30(%esp) - %eip - * 34(%esp) - %cs - * 38(%esp) - %eflags - * 3C(%esp) - %oldesp - * 40(%esp) - %oldss - * - * "current" is in register %ebx during any slow entries. - */ - -#include <linux/linkage.h> -#include <linux/err.h> -#include <asm/thread_info.h> -#include <asm/irqflags.h> -#include <asm/errno.h> -#include <asm/segment.h> -#include <asm/smp.h> -#include <asm/page_types.h> -#include <asm/percpu.h> -#include <asm/dwarf2.h> -#include <asm/processor-flags.h> -#include <asm/ftrace.h> -#include <asm/irq_vectors.h> -#include <asm/cpufeature.h> -#include <asm/alternative-asm.h> -#include <asm/asm.h> -#include <asm/smap.h> - -/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. 
*/ -#include <linux/elf-em.h> -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -#define sysenter_audit syscall_trace_entry -#define sysexit_audit syscall_exit_work -#endif - - .section .entry.text, "ax" - -/* - * We use macros for low-level operations which need to be overridden - * for paravirtualization. The following will never clobber any registers: - * INTERRUPT_RETURN (aka. "iret") - * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). - * - * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must - * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). - * Allowing a register to be clobbered can shrink the paravirt replacement - * enough to patch inline, increasing performance. - */ - -#ifdef CONFIG_PREEMPT -#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF -#else -#define preempt_stop(clobbers) -#define resume_kernel restore_all -#endif - -.macro TRACE_IRQS_IRET -#ifdef CONFIG_TRACE_IRQFLAGS - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? - jz 1f - TRACE_IRQS_ON -1: -#endif -.endm - -/* - * User gs save/restore - * - * %gs is used for userland TLS and kernel only uses it for stack - * canary which is required to be at %gs:20 by gcc. Read the comment - * at the top of stackprotector.h for more info. - * - * Local labels 98 and 99 are used. - */ -#ifdef CONFIG_X86_32_LAZY_GS - - /* unfortunately push/pop can't be no-op */ -.macro PUSH_GS - pushl_cfi $0 -.endm -.macro POP_GS pop=0 - addl $(4 + \pop), %esp - CFI_ADJUST_CFA_OFFSET -(4 + \pop) -.endm -.macro POP_GS_EX -.endm - - /* all the rest are no-op */ -.macro PTGS_TO_GS -.endm -.macro PTGS_TO_GS_EX -.endm -.macro GS_TO_REG reg -.endm -.macro REG_TO_PTGS reg -.endm -.macro SET_KERNEL_GS reg -.endm - -#else /* CONFIG_X86_32_LAZY_GS */ - -.macro PUSH_GS - pushl_cfi %gs - /*CFI_REL_OFFSET gs, 0*/ -.endm - -.macro POP_GS pop=0 -98: popl_cfi %gs - /*CFI_RESTORE gs*/ - .if \pop <> 0 - add $\pop, %esp - CFI_ADJUST_CFA_OFFSET -\pop - .endif -.endm -.macro POP_GS_EX -.pushsection .fixup, "ax" -99: movl $0, (%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro PTGS_TO_GS -98: mov PT_GS(%esp), %gs -.endm -.macro PTGS_TO_GS_EX -.pushsection .fixup, "ax" -99: movl $0, PT_GS(%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b,99b) -.endm - -.macro GS_TO_REG reg - movl %gs, \reg - /*CFI_REGISTER gs, \reg*/ -.endm -.macro REG_TO_PTGS reg - movl \reg, PT_GS(%esp) - /*CFI_REL_OFFSET gs, PT_GS*/ -.endm -.macro SET_KERNEL_GS reg - movl $(__KERNEL_STACK_CANARY), \reg - movl \reg, %gs -.endm - -#endif /* CONFIG_X86_32_LAZY_GS */ - -.macro SAVE_ALL - cld - PUSH_GS - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0;*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0;*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0;*/ - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 - movl $(__USER_DS), %edx - movl %edx, %ds - movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs - SET_KERNEL_GS %edx -.endm - -.macro RESTORE_INT_REGS - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebp - CFI_RESTORE ebp - popl_cfi %eax - 
CFI_RESTORE eax -.endm - -.macro RESTORE_REGS pop=0 - RESTORE_INT_REGS -1: popl_cfi %ds - /*CFI_RESTORE ds;*/ -2: popl_cfi %es - /*CFI_RESTORE es;*/ -3: popl_cfi %fs - /*CFI_RESTORE fs;*/ - POP_GS \pop -.pushsection .fixup, "ax" -4: movl $0, (%esp) - jmp 1b -5: movl $0, (%esp) - jmp 2b -6: movl $0, (%esp) - jmp 3b -.popsection - _ASM_EXTABLE(1b,4b) - _ASM_EXTABLE(2b,5b) - _ASM_EXTABLE(3b,6b) - POP_GS_EX -.endm - -.macro RING0_INT_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 3*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_EC_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 4*4 - /*CFI_OFFSET cs, -2*4;*/ - CFI_OFFSET eip, -3*4 -.endm - -.macro RING0_PTREGS_FRAME - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, PT_OLDESP-PT_EBX - /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ - CFI_OFFSET eip, PT_EIP-PT_OLDESP - /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ - /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ - CFI_OFFSET eax, PT_EAX-PT_OLDESP - CFI_OFFSET ebp, PT_EBP-PT_OLDESP - CFI_OFFSET edi, PT_EDI-PT_OLDESP - CFI_OFFSET esi, PT_ESI-PT_OLDESP - CFI_OFFSET edx, PT_EDX-PT_OLDESP - CFI_OFFSET ecx, PT_ECX-PT_OLDESP - CFI_OFFSET ebx, PT_EBX-PT_OLDESP -.endm - -ENTRY(ret_from_fork) - CFI_STARTPROC - pushl_cfi %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # Reset kernel eflags - popfl_cfi - jmp syscall_exit - CFI_ENDPROC -END(ret_from_fork) - -ENTRY(ret_from_kernel_thread) - CFI_STARTPROC - pushl_cfi %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl_cfi %eax - pushl_cfi $0x0202 # Reset kernel eflags - popfl_cfi - movl PT_EBP(%esp),%eax - call *PT_EBX(%esp) - movl $0,PT_EAX(%esp) - jmp syscall_exit - CFI_ENDPROC -ENDPROC(ret_from_kernel_thread) - -/* - * Return to user mode is not as complex as all this looks, - * but we want the default path for a system call return to - * go as quickly as possible which is why some of this is - * less clear than it otherwise should be. - */ - - # userspace resumption stub bypassing syscall exit tracing - ALIGN - RING0_PTREGS_FRAME -ret_from_exception: - preempt_stop(CLBR_ANY) -ret_from_intr: - GET_THREAD_INFO(%ebp) -#ifdef CONFIG_VM86 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax -#else - /* - * We can be coming here from child spawned by kernel_thread(). - */ - movl PT_CS(%esp), %eax - andl $SEGMENT_RPL_MASK, %eax -#endif - cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace - -ENTRY(resume_userspace) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on - # int/exception return? - jne work_pending - jmp restore_all -END(ret_from_exception) - -#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) -need_resched: - # preempt count == 0 + NEED_RS set? - cmpl $0,PER_CPU_VAR(__preempt_count) -#ifndef CONFIG_PREEMPT_LAZY - jnz restore_all -#else - jz test_int_off - - # atleast preempt count == 0 ? - cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count) - jne restore_all - - cmpl $0,TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ? - jnz restore_all - - testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp) - jz restore_all -test_int_off: -#endif - testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? 
- jz restore_all - call preempt_schedule_irq - jmp need_resched -END(resume_kernel) -#endif - CFI_ENDPROC - -/* SYSENTER_RETURN points to after the "sysenter" instruction in - the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ - - # sysenter call handler stub -ENTRY(ia32_sysenter_target) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 0 - CFI_REGISTER esp, ebp - movl TSS_sysenter_sp0(%esp),%esp -sysenter_past_esp: - /* - * Interrupts are disabled here, but we can't trace it until - * enough kernel state to call TRACE_IRQS_OFF can be called - but - * we immediately enable interrupts at that point anyway. - */ - pushl_cfi $__USER_DS - /*CFI_REL_OFFSET ss, 0*/ - pushl_cfi %ebp - CFI_REL_OFFSET esp, 0 - pushfl_cfi - orl $X86_EFLAGS_IF, (%esp) - pushl_cfi $__USER_CS - /*CFI_REL_OFFSET cs, 0*/ - /* - * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary: TI_sysenter_return - * is relative to thread_info, which is at the bottom of the - * kernel stack page. 4*4 means the 4 words pushed above; - * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; - * and THREAD_SIZE takes us to the bottom. - */ - pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - CFI_REL_OFFSET eip, 0 - - pushl_cfi %eax - SAVE_ALL - ENABLE_INTERRUPTS(CLBR_NONE) - -/* - * Load the potential sixth argument from user stack. - * Careful about security. - */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault - ASM_STAC -1: movl (%ebp),%ebp - ASM_CLAC - movl %ebp,PT_EBP(%esp) - _ASM_EXTABLE(1b,syscall_fault) - - GET_THREAD_INFO(%ebp) - - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz sysenter_audit -sysenter_do_call: - cmpl $(NR_syscalls), %eax - jae sysenter_badsys - call *sys_call_table(,%eax,4) -sysenter_after_call: - movl %eax,PT_EAX(%esp) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx - jnz sysexit_audit -sysenter_exit: -/* if something modifies registers it must also disable sysexit */ - movl PT_EIP(%esp), %edx - movl PT_OLDESP(%esp), %ecx - xorl %ebp,%ebp - TRACE_IRQS_ON -1: mov PT_FS(%esp), %fs - PTGS_TO_GS - ENABLE_INTERRUPTS_SYSEXIT - -#ifdef CONFIG_AUDITSYSCALL -sysenter_audit: - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry - /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ - movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ - /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl_cfi PT_ESI(%esp) /* a3: 5th arg */ - pushl_cfi PT_EDX+4(%esp) /* a2: 4th arg */ - call __audit_syscall_entry - popl_cfi %ecx /* get that remapped edx off the stack */ - popl_cfi %ecx /* get that remapped esi off the stack */ - movl PT_EAX(%esp),%eax /* reload syscall number */ - jmp sysenter_do_call - -sysexit_audit: - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - movl %eax,%edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? 
*/ - setbe %al /* 1 if so, 0 if not */ - movzbl %al,%eax /* zero-extend that */ - call __audit_syscall_exit - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - movl PT_EAX(%esp),%eax /* reload syscall return value */ - jmp sysenter_exit -#endif - - CFI_ENDPROC -.pushsection .fixup,"ax" -2: movl $0,PT_FS(%esp) - jmp 1b -.popsection - _ASM_EXTABLE(1b,2b) - PTGS_TO_GS_EX -ENDPROC(ia32_sysenter_target) - - # system call handler stub -ENTRY(system_call) - RING0_INT_FRAME # can't unwind into user space anyway - ASM_CLAC - pushl_cfi %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(NR_syscalls), %eax - jae syscall_badsys -syscall_call: - call *sys_call_table(,%eax,4) -syscall_after_call: - movl %eax,PT_EAX(%esp) # store the return value -syscall_exit: - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx # current->work - jnz syscall_exit_work - -restore_all: - TRACE_IRQS_IRET -restore_all_notrace: -#ifdef CONFIG_X86_ESPFIX32 - movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: PT_OLDSS(%esp) contains the wrong/random values if we - # are returning to the kernel. - # See comments in process.c:copy_thread() for details. - movb PT_OLDSS(%esp), %ah - movb PT_CS(%esp), %al - andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax - cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - CFI_REMEMBER_STATE - je ldt_ss # returning to user-space with LDT SS -#endif -restore_nocheck: - RESTORE_REGS 4 # skip orig_eax/error_code -irq_return: - INTERRUPT_RETURN -.section .fixup,"ax" -ENTRY(iret_exc) - pushl $0 # no error code - pushl $do_iret_error - jmp error_code -.previous - _ASM_EXTABLE(irq_return,iret_exc) - -#ifdef CONFIG_X86_ESPFIX32 - CFI_RESTORE_STATE -ldt_ss: -#ifdef CONFIG_PARAVIRT - /* - * The kernel can't run on a non-flat stack if paravirt mode - * is active. Rather than try to fixup the high bits of - * ESP, bypass this code entirely. This may break DOSemu - * and/or Wine support in a paravirt VM, although the option - * is still available to implement the setting of the high - * 16-bits in the INTERRUPT_RETURN paravirt-op. - */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck -#endif - -/* - * Setup and switch to ESPFIX stack - * - * We're returning to userspace with a 16 bit stack. The CPU will not - * restore the high word of ESP for us on executing iret... This is an - * "official" bug of all the x86-compatible CPUs, which we can work - * around to make dosemu and wine happy. We do this by preloading the - * high word of ESP with the high word of the userspace ESP while - * compensating for the offset by changing to the ESPFIX segment with - * a base address that matches for the difference. 
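The restore_all test above that mixes EFLAGS, SS and CS packs three questions into one comparison: not vm86, SS refers to the LDT, and CS carries user RPL. A standalone restatement (the constants mirror the x86 segment definitions):

#include <stdbool.h>
#include <stdint.h>

#define X86_EFLAGS_VM    0x00020000u
#define SEGMENT_RPL_MASK 0x0003u
#define SEGMENT_TI_MASK  0x0004u   /* selector Table Indicator: 1 = LDT */
#define SEGMENT_LDT      0x0004u
#define USER_RPL         0x0003u

/* true when restore_all must take the ldt_ss (espfix) exit path */
static bool needs_espfix_exit(uint32_t eflags, uint16_t oldss, uint16_t cs)
{
        uint32_t mixed = (eflags & X86_EFLAGS_VM)                   /* upper bits of %eax */
                       | ((uint32_t)(oldss & SEGMENT_TI_MASK) << 8) /* %ah                */
                       | (cs & SEGMENT_RPL_MASK);                   /* %al                */

        return mixed == ((SEGMENT_LDT << 8) | USER_RPL);
}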
- */ -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) - mov %esp, %edx /* load kernel esp */ - mov PT_OLDESP(%esp), %eax /* load userspace esp */ - mov %dx, %ax /* eax: new kernel esp */ - sub %eax, %edx /* offset (low word is 0) */ - shr $16, %edx - mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ - mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl_cfi $__ESPFIX_SS - pushl_cfi %eax /* new kernel esp */ - /* Disable interrupts, but do not irqtrace this section: we - * will soon execute iret and the tracer was already set to - * the irqstate after the iret */ - DISABLE_INTERRUPTS(CLBR_EAX) - lss (%esp), %esp /* switch to espfix segment */ - CFI_ADJUST_CFA_OFFSET -8 - jmp restore_nocheck -#endif - CFI_ENDPROC -ENDPROC(system_call) - - # perform work that needs to be done immediately before resumption - ALIGN - RING0_PTREGS_FRAME # can't unwind into user space anyway -work_pending: - testl $_TIF_NEED_RESCHED_MASK, %ecx - jz work_notifysig -work_resched: - call schedule - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other - # than syscall tracing? - jz restore_all - testl $_TIF_NEED_RESCHED_MASK, %ecx - jnz work_resched - -work_notifysig: # deal with pending signals and - # notify-resume requests -#ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) - movl %esp, %eax - jnz work_notifysig_v86 # returning to kernel-space or - # vm86-space -1: -#else - movl %esp, %eax -#endif - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movb PT_CS(%esp), %bl - andb $SEGMENT_RPL_MASK, %bl - cmpb $USER_RPL, %bl - jb resume_kernel - xorl %edx, %edx - call do_notify_resume - jmp resume_userspace - -#ifdef CONFIG_VM86 - ALIGN -work_notifysig_v86: - pushl_cfi %ecx # save ti_flags for do_notify_resume - call save_v86_state # %eax contains pt_regs pointer - popl_cfi %ecx - movl %eax, %esp - jmp 1b -#endif -END(work_pending) - - # perform syscall exit tracing - ALIGN -syscall_trace_entry: - movl $-ENOSYS,PT_EAX(%esp) - movl %esp, %eax - call syscall_trace_enter - /* What it returned is what we'll actually use. */ - cmpl $(NR_syscalls), %eax - jnae syscall_call - jmp syscall_exit -END(syscall_trace_entry) - - # perform syscall exit tracing - ALIGN -syscall_exit_work: - testl $_TIF_WORK_SYSCALL_EXIT, %ecx - jz work_pending - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call - # schedule() instead - movl %esp, %eax - call syscall_trace_leave - jmp resume_userspace -END(syscall_exit_work) - CFI_ENDPROC - - RING0_INT_FRAME # can't unwind into user space anyway -syscall_fault: - ASM_CLAC - GET_THREAD_INFO(%ebp) - movl $-EFAULT,PT_EAX(%esp) - jmp resume_userspace -END(syscall_fault) - -syscall_badsys: - movl $-ENOSYS,%eax - jmp syscall_after_call -END(syscall_badsys) - -sysenter_badsys: - movl $-ENOSYS,%eax - jmp sysenter_after_call -END(sysenter_badsys) - CFI_ENDPROC - -.macro FIXUP_ESPFIX_STACK -/* - * Switch back for ESPFIX stack to the normal zerobased stack - * - * We can't call C functions using the ESPFIX stack. This code reads - * the high word of the segment base from the GDT and swiches to the - * normal stack and adjusts ESP with the matching offset. 
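The ldt_ss arithmetic above can be checked in isolation: keep the user ESP's high word together with the kernel ESP's low word, and program the difference into the ESPFIX segment base so that base plus the new ESP lands back on the real kernel stack. A small self-contained check (the sample addresses are arbitrary):

#include <assert.h>
#include <stdint.h>

struct espfix_setup {
        uint32_t new_esp;   /* %eax, loaded via lss                     */
        uint32_t seg_base;  /* programmed into the ESPFIX_SS GDT entry  */
};

static struct espfix_setup espfix_compute(uint32_t kernel_esp, uint32_t user_esp)
{
        struct espfix_setup s;

        /* mov %dx,%ax: user high word, kernel low word           */
        s.new_esp  = (user_esp & 0xffff0000u) | (kernel_esp & 0xffffu);
        /* sub %eax,%edx (the low words cancel), stored as base   */
        s.seg_base = kernel_esp - s.new_esp;
        return s;
}

int main(void)
{
        struct espfix_setup s = espfix_compute(0xc1234a80u, 0xbf80fff0u);

        assert((s.seg_base & 0xffffu) == 0);            /* offset has a zero low word */
        assert(s.seg_base + s.new_esp == 0xc1234a80u);  /* base + ESP == kernel ESP   */
        return 0;
}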
- */ -#ifdef CONFIG_X86_ESPFIX32 - /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ - shl $16, %eax - addl %esp, %eax /* the adjusted stack pointer */ - pushl_cfi $__KERNEL_DS - pushl_cfi %eax - lss (%esp), %esp /* switch to the normal stack segment */ - CFI_ADJUST_CFA_OFFSET -8 -#endif -.endm -.macro UNWIND_ESPFIX_STACK -#ifdef CONFIG_X86_ESPFIX32 - movl %ss, %eax - /* see if on espfix stack */ - cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es - /* switch to normal stack */ - FIXUP_ESPFIX_STACK -27: -#endif -.endm - -/* - * Build the entry stubs with some assembler magic. - * We pack 1 stub into every 8-byte block. - */ - .align 8 -ENTRY(irq_entries_start) - RING0_INT_FRAME - vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) - pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 - jmp common_interrupt - CFI_ADJUST_CFA_OFFSET -4 - .align 8 - .endr -END(irq_entries_start) - -/* - * the CPU automatically disables interrupts when executing an IRQ vector, - * so IRQ-flags tracing has to follow that: - */ - .p2align CONFIG_X86_L1_CACHE_SHIFT -common_interrupt: - ASM_CLAC - addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ - SAVE_ALL - TRACE_IRQS_OFF - movl %esp,%eax - call do_IRQ - jmp ret_from_intr -ENDPROC(common_interrupt) - CFI_ENDPROC - -#define BUILD_INTERRUPT3(name, nr, fn) \ -ENTRY(name) \ - RING0_INT_FRAME; \ - ASM_CLAC; \ - pushl_cfi $~(nr); \ - SAVE_ALL; \ - TRACE_IRQS_OFF \ - movl %esp,%eax; \ - call fn; \ - jmp ret_from_intr; \ - CFI_ENDPROC; \ -ENDPROC(name) - - -#ifdef CONFIG_TRACING -#define TRACE_BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) -#else -#define TRACE_BUILD_INTERRUPT(name, nr) -#endif - -#define BUILD_INTERRUPT(name, nr) \ - BUILD_INTERRUPT3(name, nr, smp_##name); \ - TRACE_BUILD_INTERRUPT(name, nr) - -/* The include is where all of the SMP etc. 
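The stub/common_interrupt pair above encodes each vector so that a stub needs only a sign-extended byte push (keeping it inside its 8-byte slot) and the adjusted value ends up in [-256,-1]; a later complement recovers the vector (that consumer is not part of this hunk). A checkable C model of the round trip:

#include <assert.h>
#include <stdint.h>

static int32_t stub_push(uint8_t vector)      /* pushl $(~vector+0x80) */
{
        return (int32_t)~(uint32_t)vector + 0x80;
}

static int32_t adjust(int32_t pushed)         /* addl $-0x80,(%esp)    */
{
        return pushed - 0x80;
}

int main(void)
{
        for (unsigned v = 0x20; v < 0x100; v++) {     /* external vectors */
                int32_t p = stub_push((uint8_t)v);
                int32_t o = adjust(p);

                assert(p >= -128 && p <= 127);           /* fits a byte immediate   */
                assert(o >= -256 && o <= -1);            /* the advertised range    */
                assert((uint8_t)~(uint32_t)o == v);      /* a complement undoes it  */
        }
        return 0;
}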
interrupts come from */ -#include <asm/entry_arch.h> - -ENTRY(coprocessor_error) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_error - jmp error_code - CFI_ENDPROC -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 -#ifdef CONFIG_X86_INVD_BUG - /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ - ALTERNATIVE "pushl_cfi $do_general_protection", \ - "pushl $do_simd_coprocessor_error", \ - X86_FEATURE_XMM -#else - pushl_cfi $do_simd_coprocessor_error -#endif - jmp error_code - CFI_ENDPROC -END(simd_coprocessor_error) - -ENTRY(device_not_available) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $-1 # mark this as an int - pushl_cfi $do_device_not_available - jmp error_code - CFI_ENDPROC -END(device_not_available) - -#ifdef CONFIG_PARAVIRT -ENTRY(native_iret) - iret - _ASM_EXTABLE(native_iret, iret_exc) -END(native_iret) - -ENTRY(native_irq_enable_sysexit) - sti - sysexit -END(native_irq_enable_sysexit) -#endif - -ENTRY(overflow) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_overflow - jmp error_code - CFI_ENDPROC -END(overflow) - -ENTRY(bounds) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_bounds - jmp error_code - CFI_ENDPROC -END(bounds) - -ENTRY(invalid_op) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_invalid_op - jmp error_code - CFI_ENDPROC -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_coprocessor_segment_overrun - jmp error_code - CFI_ENDPROC -END(coprocessor_segment_overrun) - -ENTRY(invalid_TSS) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_invalid_TSS - jmp error_code - CFI_ENDPROC -END(invalid_TSS) - -ENTRY(segment_not_present) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_segment_not_present - jmp error_code - CFI_ENDPROC -END(segment_not_present) - -ENTRY(stack_segment) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_stack_segment - jmp error_code - CFI_ENDPROC -END(stack_segment) - -ENTRY(alignment_check) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_alignment_check - jmp error_code - CFI_ENDPROC -END(alignment_check) - -ENTRY(divide_error) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 # no error code - pushl_cfi $do_divide_error - jmp error_code - CFI_ENDPROC -END(divide_error) - -#ifdef CONFIG_X86_MCE -ENTRY(machine_check) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi machine_check_vector - jmp error_code - CFI_ENDPROC -END(machine_check) -#endif - -ENTRY(spurious_interrupt_bug) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $0 - pushl_cfi $do_spurious_interrupt_bug - jmp error_code - CFI_ENDPROC -END(spurious_interrupt_bug) - -#ifdef CONFIG_XEN -/* Xen doesn't set %esp to be precisely what the normal sysenter - entrypoint expects, so fix it up before using the normal path. */ -ENTRY(xen_sysenter_target) - RING0_INT_FRAME - addl $5*4, %esp /* remove xen-provided frame */ - CFI_ADJUST_CFA_OFFSET -5*4 - jmp sysenter_past_esp - CFI_ENDPROC - -ENTRY(xen_hypervisor_callback) - CFI_STARTPROC - pushl_cfi $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - TRACE_IRQS_OFF - - /* Check to see if we got the event in the critical - region in xen_iret_direct, after we've reenabled - events and checked for pending events. This simulates - iret instruction's behaviour where it delivers a - pending interrupt when enabling interrupts. 
*/ - movl PT_EIP(%esp),%eax - cmpl $xen_iret_start_crit,%eax - jb 1f - cmpl $xen_iret_end_crit,%eax - jae 1f - - jmp xen_iret_crit_fixup - -ENTRY(xen_do_upcall) -1: mov %esp, %eax - call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPT - call xen_maybe_preempt_hcall -#endif - jmp ret_from_intr - CFI_ENDPROC -ENDPROC(xen_hypervisor_callback) - -# Hypervisor uses this for application faults while it executes. -# We get here for two reasons: -# 1. Fault while reloading DS, ES, FS or GS -# 2. Fault while executing IRET -# Category 1 we fix up by reattempting the load, and zeroing the segment -# register if the load fails. -# Category 2 we fix up by jumping to do_iret_error. We cannot use the -# normal Linux return path in this case because if we use the IRET hypercall -# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -# We distinguish between categories by maintaining a status value in EAX. -ENTRY(xen_failsafe_callback) - CFI_STARTPROC - pushl_cfi %eax - movl $1,%eax -1: mov 4(%esp),%ds -2: mov 8(%esp),%es -3: mov 12(%esp),%fs -4: mov 16(%esp),%gs - /* EAX == 0 => Category 1 (Bad segment) - EAX != 0 => Category 2 (Bad IRET) */ - testl %eax,%eax - popl_cfi %eax - lea 16(%esp),%esp - CFI_ADJUST_CFA_OFFSET -16 - jz 5f - jmp iret_exc -5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - jmp ret_from_exception - CFI_ENDPROC - -.section .fixup,"ax" -6: xorl %eax,%eax - movl %eax,4(%esp) - jmp 1b -7: xorl %eax,%eax - movl %eax,8(%esp) - jmp 2b -8: xorl %eax,%eax - movl %eax,12(%esp) - jmp 3b -9: xorl %eax,%eax - movl %eax,16(%esp) - jmp 4b -.previous - _ASM_EXTABLE(1b,6b) - _ASM_EXTABLE(2b,7b) - _ASM_EXTABLE(3b,8b) - _ASM_EXTABLE(4b,9b) -ENDPROC(xen_failsafe_callback) - -BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) - -#endif /* CONFIG_XEN */ - -#if IS_ENABLED(CONFIG_HYPERV) - -BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) - -#endif /* CONFIG_HYPERV */ - -#ifdef CONFIG_FUNCTION_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE - -ENTRY(mcount) - ret -END(mcount) - -ENTRY(ftrace_caller) - pushl %eax - pushl %ecx - pushl %edx - pushl $0 /* Pass NULL as regs pointer */ - movl 4*4(%esp), %eax - movl 0x4(%ebp), %edx - movl function_trace_op, %ecx - subl $MCOUNT_INSN_SIZE, %eax - -.globl ftrace_call -ftrace_call: - call ftrace_stub - - addl $4,%esp /* skip NULL pointer */ - popl %edx - popl %ecx - popl %eax -ftrace_ret: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - jmp ftrace_stub -#endif - -.globl ftrace_stub -ftrace_stub: - ret -END(ftrace_caller) - -ENTRY(ftrace_regs_caller) - pushf /* push flags before compare (in cs location) */ - - /* - * i386 does not save SS and ESP when coming from kernel. - * Instead, to get sp, &regs->sp is used (see ptrace.h). - * Unfortunately, that means eflags must be at the same location - * as the current return ip is. We move the return ip into the - * ip location, and move flags into the return ip location.
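The slot shuffling that the comment above describes (and that the next few movl's perform) is easier to see on an indexed array: the return address has already been duplicated into the ip slot, so the word sitting in the flags slot can be overwritten with the saved flags and the cs slot reused for __KERNEL_CS. A rough standalone model, using the 12/13/14 word offsets from the asm (the 0x60 selector is only a typical __KERNEL_CS value):

#include <assert.h>
#include <stdint.h>

enum { SLOT_IP = 12, SLOT_CS = 13, SLOT_FLAGS = 14 };   /* word offsets from %esp */

/*
 * On entry: slot 13 holds the pushf'd flags (parked in the cs position),
 * slot 14 still holds the original return address, and slot 12 already has
 * a copy of that return address from the earlier "pushl 4(%esp)".
 */
static void fixup_frame(uint32_t *slots, uint32_t kernel_cs)
{
        uint32_t flags = slots[SLOT_CS];    /* movl 13*4(%esp),%eax          */

        slots[SLOT_FLAGS] = flags;          /* movl %eax,14*4(%esp)          */
        slots[SLOT_CS]    = kernel_cs;      /* movl $__KERNEL_CS,13*4(%esp)  */
}

int main(void)
{
        uint32_t slots[16] = {0};

        slots[SLOT_IP]    = 0xc1000010u;    /* duplicated return address */
        slots[SLOT_CS]    = 0x00000202u;    /* saved EFLAGS parked here   */
        slots[SLOT_FLAGS] = 0xc1000010u;    /* original return address    */

        fixup_frame(slots, 0x60);
        assert(slots[SLOT_FLAGS] == 0x202u && slots[SLOT_CS] == 0x60u &&
               slots[SLOT_IP] == 0xc1000010u);
        return 0;
}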
- */ - pushl 4(%esp) /* save return ip into ip slot */ - - pushl $0 /* Load 0 into orig_ax */ - pushl %gs - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - - movl 13*4(%esp), %eax /* Get the saved flags */ - movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ - /* clobbering return ip */ - movl $__KERNEL_CS,13*4(%esp) - - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ - -GLOBAL(ftrace_regs_call) - call ftrace_stub - - addl $4, %esp /* Skip pt_regs */ - movl 14*4(%esp), %eax /* Move flags back into cs */ - movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ - movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - movl %eax, 14*4(%esp) /* Put return ip back for ret */ - - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - addl $8, %esp /* Skip orig_ax and ip */ - popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret - - popf - jmp ftrace_stub -#else /* ! CONFIG_DYNAMIC_FTRACE */ - -ENTRY(mcount) - cmpl $__PAGE_OFFSET, %esp - jb ftrace_stub /* Paging not enabled yet? */ - - cmpl $ftrace_stub, ftrace_trace_function - jnz trace -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpl $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif -.globl ftrace_stub -ftrace_stub: - ret - - /* taken from glibc */ -trace: - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - movl 0x4(%ebp), %edx - subl $MCOUNT_INSN_SIZE, %eax - - call *ftrace_trace_function - - popl %edx - popl %ecx - popl %eax - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - lea 0x4(%ebp), %edx - movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %eax - call prepare_ftrace_return - popl %edx - popl %ecx - popl %eax - ret -END(ftrace_graph_caller) - -.globl return_to_handler -return_to_handler: - pushl %eax - pushl %edx - movl %ebp, %eax - call ftrace_return_to_handler - movl %eax, %ecx - popl %edx - popl %eax - jmp *%ecx -#endif - -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $trace_do_page_fault - jmp error_code - CFI_ENDPROC -END(trace_page_fault) -#endif - -ENTRY(page_fault) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_page_fault - ALIGN -error_code: - /* the function address is in %gs's slot on the stack */ - pushl_cfi %fs - /*CFI_REL_OFFSET fs, 0*/ - pushl_cfi %es - /*CFI_REL_OFFSET es, 0*/ - pushl_cfi %ds - /*CFI_REL_OFFSET ds, 0*/ - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx - cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - GS_TO_REG %ecx - movl PT_GS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - REG_TO_PTGS %ecx - SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp 
ret_from_exception - CFI_ENDPROC -END(page_fault) - -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -.macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok -\label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp - CFI_DEF_CFA esp, 0 - CFI_UNDEFINED eip - pushfl_cfi - pushl_cfi $__KERNEL_CS - pushl_cfi $sysenter_past_esp - CFI_REL_OFFSET eip, 0 -.endm - -ENTRY(debug) - RING0_INT_FRAME - ASM_CLAC - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn -debug_stack_correct: - pushl_cfi $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception - CFI_ENDPROC -END(debug) - -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. - */ -ENTRY(nmi) - RING0_INT_FRAME - ASM_CLAC -#ifdef CONFIG_X86_ESPFIX32 - pushl_cfi %eax - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl_cfi %eax - je nmi_espfix_stack -#endif - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl_cfi %eax - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl_cfi %eax - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl_cfi %eax - SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_all_notrace - CFI_ENDPROC - -nmi_stack_fixup: - RING0_INT_FRAME - FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct - -nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct - -#ifdef CONFIG_X86_ESPFIX32 -nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. 
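The NMI code's "do not access memory above the end of our stack page" test above reduces to one masked comparison. As standalone C (the THREAD_SIZE value is illustrative; the real one is configuration dependent):

#include <stdbool.h>
#include <stdint.h>

#define THREAD_SIZE 8192u   /* illustrative only */

/* andl $(THREAD_SIZE-1),%eax ; cmpl $(THREAD_SIZE-20),%eax ; jae nmi_stack_correct */
static bool too_close_to_stack_top(uint32_t esp)
{
        /* true when 12(%esp) may already lie beyond the thread stack area */
        return (esp & (THREAD_SIZE - 1)) >= THREAD_SIZE - 20;
}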
- * - * create the pointer to lss back - */ - pushl_cfi %ss - pushl_cfi %esp - addl $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl_cfi 16(%esp) - .endr - pushl_cfi %eax - SAVE_ALL - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 - jmp irq_return -#endif - CFI_ENDPROC -END(nmi) - -ENTRY(int3) - RING0_INT_FRAME - ASM_CLAC - pushl_cfi $-1 # mark this as an int - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception - CFI_ENDPROC -END(int3) - -ENTRY(general_protection) - RING0_EC_FRAME - pushl_cfi $do_general_protection - jmp error_code - CFI_ENDPROC -END(general_protection) - -#ifdef CONFIG_KVM_GUEST -ENTRY(async_page_fault) - RING0_EC_FRAME - ASM_CLAC - pushl_cfi $do_async_page_fault - jmp error_code - CFI_ENDPROC -END(async_page_fault) -#endif - diff --git a/kernel/arch/x86/kernel/espfix_64.c b/kernel/arch/x86/kernel/espfix_64.c index f5d0730e7..4d38416e2 100644 --- a/kernel/arch/x86/kernel/espfix_64.c +++ b/kernel/arch/x86/kernel/espfix_64.c @@ -110,7 +110,7 @@ static void init_espfix_random(void) */ if (!arch_get_random_long(&rand)) { /* The constant is an arbitrary large prime */ - rdtscll(rand); + rand = rdtsc(); rand *= 0xc345c6b72fd16123UL; } @@ -131,25 +131,24 @@ void __init init_espfix_bsp(void) init_espfix_random(); /* The rest is the same as for any other processor */ - init_espfix_ap(); + init_espfix_ap(0); } -void init_espfix_ap(void) +void init_espfix_ap(int cpu) { - unsigned int cpu, page; + unsigned int page; unsigned long addr; pud_t pud, *pud_p; pmd_t pmd, *pmd_p; pte_t pte, *pte_p; - int n; + int n, node; void *stack_page; pteval_t ptemask; /* We only have to do this once... 
*/ - if (likely(this_cpu_read(espfix_stack))) + if (likely(per_cpu(espfix_stack, cpu))) return; /* Already initialized */ - cpu = smp_processor_id(); addr = espfix_base_addr(cpu); page = cpu/ESPFIX_STACKS_PER_PAGE; @@ -165,12 +164,15 @@ void init_espfix_ap(void) if (stack_page) goto unlock_done; + node = cpu_to_node(cpu); ptemask = __supported_pte_mask; pud_p = &espfix_pud_page[pud_index(addr)]; pud = *pud_p; if (!pud_present(pud)) { - pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pmd_p = (pmd_t *)page_address(page); pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PUD_CLONES; n++) @@ -180,7 +182,9 @@ void init_espfix_ap(void) pmd_p = pmd_offset(&pud, addr); pmd = *pmd_p; if (!pmd_present(pmd)) { - pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pte_p = (pte_t *)page_address(page); pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PMD_CLONES; n++) @@ -188,7 +192,7 @@ void init_espfix_ap(void) } pte_p = pte_offset_kernel(&pmd, addr); - stack_page = (void *)__get_free_page(GFP_KERNEL); + stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); @@ -199,7 +203,7 @@ void init_espfix_ap(void) unlock_done: mutex_unlock(&espfix_init_mutex); done: - this_cpu_write(espfix_stack, addr); - this_cpu_write(espfix_waddr, (unsigned long)stack_page - + (addr & ~PAGE_MASK)); + per_cpu(espfix_stack, cpu) = addr; + per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page + + (addr & ~PAGE_MASK); } diff --git a/kernel/arch/x86/kernel/fpu/Makefile b/kernel/arch/x86/kernel/fpu/Makefile new file mode 100644 index 000000000..68279efb8 --- /dev/null +++ b/kernel/arch/x86/kernel/fpu/Makefile @@ -0,0 +1,5 @@ +# +# Build rules for the FPU support code: +# + +obj-y += init.o bugs.o core.o regset.o signal.o xstate.o diff --git a/kernel/arch/x86/kernel/fpu/bugs.c b/kernel/arch/x86/kernel/fpu/bugs.c new file mode 100644 index 000000000..dd9ca9b60 --- /dev/null +++ b/kernel/arch/x86/kernel/fpu/bugs.c @@ -0,0 +1,71 @@ +/* + * x86 FPU bug checks: + */ +#include <asm/fpu/internal.h> + +/* + * Boot time CPU/FPU FDIV bug detection code: + */ + +static double __initdata x = 4195835.0; +static double __initdata y = 3145727.0; + +/* + * This used to check for exceptions.. + * However, it turns out that to support that, + * the XMM trap handlers basically had to + * be buggy. So let's have a correct XMM trap + * handler, and forget about printing out + * some status at boot. + * + * We should really only care about bugs here + * anyway. Not features. + */ +static void __init check_fpu(void) +{ + u32 cr0_saved; + s32 fdiv_bug; + + /* We might have CR0::TS set already, clear it: */ + cr0_saved = read_cr0(); + write_cr0(cr0_saved & ~X86_CR0_TS); + + kernel_fpu_begin(); + + /* + * trap_init() enabled FXSR and company _before_ testing for FP + * problems here. 
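The conversion above from this_cpu_read()/this_cpu_write() to per_cpu(espfix_stack, cpu), plus the new cpu argument and the cpu_to_node()-based allocations, means the caller no longer has to be the CPU being set up. A toy illustration of that difference, with a plain array standing in for the real per-CPU variable (all names here are made up):

#define TOY_NR_CPUS 4

static unsigned long toy_espfix_stack[TOY_NR_CPUS];

/* Old shape: the running CPU filled in its own slot only. */
static void toy_init_self(unsigned long addr)
{
        int this_cpu = 0;                    /* smp_processor_id() stand-in        */
        toy_espfix_stack[this_cpu] = addr;   /* this_cpu_write(espfix_stack, addr) */
}

/* New shape: any caller (e.g. the boot CPU) prepares slot 'cpu', which also
 * lets the backing pages come from that CPU's memory node.               */
static void toy_init_for(int cpu, unsigned long addr)
{
        if (toy_espfix_stack[cpu])           /* already initialized               */
                return;
        toy_espfix_stack[cpu] = addr;        /* per_cpu(espfix_stack, cpu) = addr */
}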
+ * + * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug + */ + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&fdiv_bug) + : "m" (*&x), "m" (*&y)); + + kernel_fpu_end(); + + write_cr0(cr0_saved); + + if (fdiv_bug) { + set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); + pr_warn("Hmm, FPU with FDIV bug\n"); + } +} + +void __init fpu__init_check_bugs(void) +{ + /* + * kernel_fpu_begin/end() in check_fpu() relies on the patched + * alternative instructions. + */ + if (cpu_has_fpu) + check_fpu(); +} diff --git a/kernel/arch/x86/kernel/fpu/core.c b/kernel/arch/x86/kernel/fpu/core.c new file mode 100644 index 000000000..d25097c3f --- /dev/null +++ b/kernel/arch/x86/kernel/fpu/core.c @@ -0,0 +1,523 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ +#include <asm/fpu/internal.h> +#include <asm/fpu/regset.h> +#include <asm/fpu/signal.h> +#include <asm/traps.h> + +#include <linux/hardirq.h> + +/* + * Represents the initial FPU state. It's mostly (but not completely) zeroes, + * depending on the FPU hardware format: + */ +union fpregs_state init_fpstate __read_mostly; + +/* + * Track whether the kernel is using the FPU state + * currently. + * + * This flag is used: + * + * - by IRQ context code to potentially use the FPU + * if it's unused. + * + * - to debug kernel_fpu_begin()/end() correctness + */ +static DEFINE_PER_CPU(bool, in_kernel_fpu); + +/* + * Track which context is using the FPU on the CPU: + */ +DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +static void kernel_fpu_disable(void) +{ + WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); + this_cpu_write(in_kernel_fpu, true); +} + +static void kernel_fpu_enable(void) +{ + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); + this_cpu_write(in_kernel_fpu, false); +} + +static bool kernel_fpu_disabled(void) +{ + return this_cpu_read(in_kernel_fpu); +} + +/* + * Were we in an interrupt that interrupted kernel mode? + * + * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: the thread must not have fpu (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + * + * Except for the eagerfpu case when we return true; in the likely case + * the thread has FPU but we are not going to set/clear TS. + */ +static bool interrupted_kernel_fpu_idle(void) +{ + if (kernel_fpu_disabled()) + return false; + + if (use_eager_fpu()) + return true; + + return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. + */ +static bool interrupted_user_mode(void) +{ + struct pt_regs *regs = get_irq_regs(); + return regs && user_mode(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. 
+ */ +bool irq_fpu_usable(void) +{ + return !in_interrupt() || + interrupted_user_mode() || + interrupted_kernel_fpu_idle(); +} +EXPORT_SYMBOL(irq_fpu_usable); + +void __kernel_fpu_begin(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + WARN_ON_FPU(!irq_fpu_usable()); + + kernel_fpu_disable(); + + if (fpu->fpregs_active) { + copy_fpregs_to_fpstate(fpu); + } else { + this_cpu_write(fpu_fpregs_owner_ctx, NULL); + __fpregs_activate_hw(); + } +} +EXPORT_SYMBOL(__kernel_fpu_begin); + +void __kernel_fpu_end(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + if (fpu->fpregs_active) + copy_kernel_to_fpregs(&fpu->state); + else + __fpregs_deactivate_hw(); + + kernel_fpu_enable(); +} +EXPORT_SYMBOL(__kernel_fpu_end); + +void kernel_fpu_begin(void) +{ + preempt_disable(); + __kernel_fpu_begin(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin); + +void kernel_fpu_end(void) +{ + __kernel_fpu_end(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_end); + +/* + * CR0::TS save/restore functions: + */ +int irq_ts_save(void) +{ + /* + * If in process context and not atomic, we can take a spurious DNA fault. + * Otherwise, doing clts() in process context requires disabling preemption + * or some heavy lifting like kernel_fpu_begin() + */ + if (!in_atomic()) + return 0; + + if (read_cr0() & X86_CR0_TS) { + clts(); + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(irq_ts_save); + +void irq_ts_restore(int TS_state) +{ + if (TS_state) + stts(); +} +EXPORT_SYMBOL_GPL(irq_ts_restore); + +/* + * Save the FPU state (mark it for reload if necessary): + * + * This only ever gets called for the current task. + */ +void fpu__save(struct fpu *fpu) +{ + WARN_ON_FPU(fpu != ¤t->thread.fpu); + + preempt_disable(); + if (fpu->fpregs_active) { + if (!copy_fpregs_to_fpstate(fpu)) + fpregs_deactivate(fpu); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(fpu__save); + +/* + * Legacy x87 fpstate state init: + */ +static inline void fpstate_init_fstate(struct fregs_state *fp) +{ + fp->cwd = 0xffff037fu; + fp->swd = 0xffff0000u; + fp->twd = 0xffffffffu; + fp->fos = 0xffff0000u; +} + +void fpstate_init(union fpregs_state *state) +{ + if (!cpu_has_fpu) { + fpstate_init_soft(&state->soft); + return; + } + + memset(state, 0, xstate_size); + + if (cpu_has_fxsr) + fpstate_init_fxstate(&state->fxsave); + else + fpstate_init_fstate(&state->fsave); +} +EXPORT_SYMBOL_GPL(fpstate_init); + +/* + * Copy the current task's FPU state to a new task's FPU context. + * + * In both the 'eager' and the 'lazy' case we save hardware registers + * directly to the destination buffer. + */ +static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) +{ + WARN_ON_FPU(src_fpu != ¤t->thread.fpu); + + /* + * Don't let 'init optimized' areas of the XSAVE area + * leak into the child task: + */ + if (use_eager_fpu()) + memset(&dst_fpu->state.xsave, 0, xstate_size); + + /* + * Save current FPU registers directly into the child + * FPU context, without any memory-to-memory copying. + * + * If the FPU context got destroyed in the process (FNSAVE + * done on old CPUs) then copy it back into the source + * context and mark the current task for lazy restore. + * + * We have to do all this with preemption disabled, + * mostly because of the FNSAVE case, because in that + * case we must not allow preemption in the window + * between the FNSAVE and us marking the context lazy. + * + * It shouldn't be an issue as even FNSAVE is plenty + * fast in terms of critical section length. 
+ */ + preempt_disable(); + if (!copy_fpregs_to_fpstate(dst_fpu)) { + memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); + fpregs_deactivate(src_fpu); + } + preempt_enable(); +} + +int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) +{ + dst_fpu->counter = 0; + dst_fpu->fpregs_active = 0; + dst_fpu->last_cpu = -1; + + if (src_fpu->fpstate_active && cpu_has_fpu) + fpu_copy(dst_fpu, src_fpu); + + return 0; +} + +/* + * Activate the current task's in-memory FPU context, + * if it has not been used before: + */ +void fpu__activate_curr(struct fpu *fpu) +{ + WARN_ON_FPU(fpu != ¤t->thread.fpu); + + if (!fpu->fpstate_active) { + fpstate_init(&fpu->state); + + /* Safe to do for the current task: */ + fpu->fpstate_active = 1; + } +} +EXPORT_SYMBOL_GPL(fpu__activate_curr); + +/* + * This function must be called before we read a task's fpstate. + * + * If the task has not used the FPU before then initialize its + * fpstate. + * + * If the task has used the FPU before then save it. + */ +void fpu__activate_fpstate_read(struct fpu *fpu) +{ + /* + * If fpregs are active (in the current CPU), then + * copy them to the fpstate: + */ + if (fpu->fpregs_active) { + fpu__save(fpu); + } else { + if (!fpu->fpstate_active) { + fpstate_init(&fpu->state); + + /* Safe to do for current and for stopped child tasks: */ + fpu->fpstate_active = 1; + } + } +} + +/* + * This function must be called before we write a task's fpstate. + * + * If the task has used the FPU before then unlazy it. + * If the task has not used the FPU before then initialize its fpstate. + * + * After this function call, after registers in the fpstate are + * modified and the child task has woken up, the child task will + * restore the modified FPU state from the modified context. If we + * didn't clear its lazy status here then the lazy in-registers + * state pending on its former CPU could be restored, corrupting + * the modifications. + */ +void fpu__activate_fpstate_write(struct fpu *fpu) +{ + /* + * Only stopped child tasks can be used to modify the FPU + * state in the fpstate buffer: + */ + WARN_ON_FPU(fpu == ¤t->thread.fpu); + + if (fpu->fpstate_active) { + /* Invalidate any lazy state: */ + fpu->last_cpu = -1; + } else { + fpstate_init(&fpu->state); + + /* Safe to do for stopped child tasks: */ + fpu->fpstate_active = 1; + } +} + +/* + * 'fpu__restore()' is called to copy FPU registers from + * the FPU fpstate to the live hw registers and to activate + * access to the hardware registers, so that FPU instructions + * can be used afterwards. + * + * Must be called with kernel preemption disabled (for example + * with local interrupts disabled, as it is in the case of + * do_device_not_available()). + */ +void fpu__restore(struct fpu *fpu) +{ + fpu__activate_curr(fpu); + + /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ + kernel_fpu_disable(); + fpregs_activate(fpu); + copy_kernel_to_fpregs(&fpu->state); + fpu->counter++; + kernel_fpu_enable(); +} +EXPORT_SYMBOL_GPL(fpu__restore); + +/* + * Drops current FPU state: deactivates the fpregs and + * the fpstate. NOTE: it still leaves previous contents + * in the fpregs in the eager-FPU case. + * + * This function can be used in cases where we know that + * a state-restore is coming: either an explicit one, + * or a reschedule. 
+ */ +void fpu__drop(struct fpu *fpu) +{ + preempt_disable(); + fpu->counter = 0; + + if (fpu->fpregs_active) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + fpregs_deactivate(fpu); + } + + fpu->fpstate_active = 0; + + preempt_enable(); +} + +/* + * Clear FPU registers by setting them up from + * the init fpstate: + */ +static inline void copy_init_fpstate_to_fpregs(void) +{ + if (use_xsave()) + copy_kernel_to_xregs(&init_fpstate.xsave, -1); + else + copy_kernel_to_fxregs(&init_fpstate.fxsave); +} + +/* + * Clear the FPU state back to init state. + * + * Called by sys_execve(), by the signal handler code and by various + * error paths. + */ +void fpu__clear(struct fpu *fpu) +{ + WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ + + if (!use_eager_fpu()) { + /* FPU state will be reallocated lazily at the first use. */ + fpu__drop(fpu); + } else { + if (!fpu->fpstate_active) { + fpu__activate_curr(fpu); + user_fpu_begin(); + } + copy_init_fpstate_to_fpregs(); + } +} + +/* + * x87 math exception handling: + */ + +static inline unsigned short get_fpu_cwd(struct fpu *fpu) +{ + if (cpu_has_fxsr) { + return fpu->state.fxsave.cwd; + } else { + return (unsigned short)fpu->state.fsave.cwd; + } +} + +static inline unsigned short get_fpu_swd(struct fpu *fpu) +{ + if (cpu_has_fxsr) { + return fpu->state.fxsave.swd; + } else { + return (unsigned short)fpu->state.fsave.swd; + } +} + +static inline unsigned short get_fpu_mxcsr(struct fpu *fpu) +{ + if (cpu_has_xmm) { + return fpu->state.fxsave.mxcsr; + } else { + return MXCSR_DEFAULT; + } +} + +int fpu__exception_code(struct fpu *fpu, int trap_nr) +{ + int err; + + if (trap_nr == X86_TRAP_MF) { + unsigned short cwd, swd; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(fpu); + swd = get_fpu_swd(fpu); + + err = swd & ~cwd; + } else { + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + unsigned short mxcsr = get_fpu_mxcsr(fpu); + err = ~(mxcsr >> 7) & mxcsr; + } + + if (err & 0x001) { /* Invalid op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + return FPE_FLTINV; + } else if (err & 0x004) { /* Divide by Zero */ + return FPE_FLTDIV; + } else if (err & 0x008) { /* Overflow */ + return FPE_FLTOVF; + } else if (err & 0x012) { /* Denormal, Underflow */ + return FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ + return FPE_FLTRES; + } + + /* + * If we're using IRQ 13, or supposedly even some trap + * X86_TRAP_MF implementations, it's possible + * we get a spurious trap, which is not an error. 
+ */ + return 0; +} diff --git a/kernel/arch/x86/kernel/fpu/init.c b/kernel/arch/x86/kernel/fpu/init.c new file mode 100644 index 000000000..be39b5fde --- /dev/null +++ b/kernel/arch/x86/kernel/fpu/init.c @@ -0,0 +1,396 @@ +/* + * x86 FPU boot time init code: + */ +#include <asm/fpu/internal.h> +#include <asm/tlbflush.h> + +#include <linux/sched.h> + +/* + * Initialize the TS bit in CR0 according to the style of context-switches + * we are using: + */ +static void fpu__init_cpu_ctx_switch(void) +{ + if (!cpu_has_eager_fpu) + stts(); + else + clts(); +} + +/* + * Initialize the registers found in all CPUs, CR0 and CR4: + */ +static void fpu__init_cpu_generic(void) +{ + unsigned long cr0; + unsigned long cr4_mask = 0; + + if (cpu_has_fxsr) + cr4_mask |= X86_CR4_OSFXSR; + if (cpu_has_xmm) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + cr4_set_bits(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!cpu_has_fpu) + cr0 |= X86_CR0_EM; + write_cr0(cr0); + + /* Flush out any pending x87 state: */ +#ifdef CONFIG_MATH_EMULATION + if (!cpu_has_fpu) + fpstate_init_soft(¤t->thread.fpu.state.soft); + else +#endif + asm volatile ("fninit"); +} + +/* + * Enable all supported FPU features. Called when a CPU is brought online: + */ +void fpu__init_cpu(void) +{ + fpu__init_cpu_generic(); + fpu__init_cpu_xstate(); + fpu__init_cpu_ctx_switch(); +} + +/* + * The earliest FPU detection code. + * + * Set the X86_FEATURE_FPU CPU-capability bit based on + * trying to execute an actual sequence of FPU instructions: + */ +static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) +{ + unsigned long cr0; + u16 fsw, fcw; + + fsw = fcw = 0xffff; + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS | X86_CR0_EM); + write_cr0(cr0); + + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); + + if (fsw == 0 && (fcw & 0x103f) == 0x003f) + set_cpu_cap(c, X86_FEATURE_FPU); + else + clear_cpu_cap(c, X86_FEATURE_FPU); + +#ifndef CONFIG_MATH_EMULATION + if (!cpu_has_fpu) { + pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n"); + for (;;) + asm volatile("hlt"); + } +#endif +} + +/* + * Boot time FPU feature detection code: + */ +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; + +static void __init fpu__init_system_mxcsr(void) +{ + unsigned int mask = 0; + + if (cpu_has_fxsr) { + /* Static because GCC does not get 16-byte stack alignment right: */ + static struct fxregs_state fxregs __initdata; + + asm volatile("fxsave %0" : "+m" (fxregs)); + + mask = fxregs.mxcsr_mask; + + /* + * If zero then use the default features mask, + * which has all features set, except the + * denormals-are-zero feature bit: + */ + if (mask == 0) + mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; +} + +/* + * Once per bootup FPU initialization sequences that will run on most x86 CPUs: + */ +static void __init fpu__init_system_generic(void) +{ + /* + * Set up the legacy init FPU context. (xstate init might overwrite this + * with a more modern format, if the CPU supports it.) + */ + fpstate_init_fxstate(&init_fpstate.fxsave); + + fpu__init_system_mxcsr(); +} + +/* + * Size of the FPU context state. All tasks in the system use the + * same context size, regardless of what portion they use. 
+ * This is inherent to the XSAVE architecture which puts all state + * components into a single, continuous memory block: + */ +unsigned int xstate_size; +EXPORT_SYMBOL_GPL(xstate_size); + +/* Enforce that 'MEMBER' is the last field of 'TYPE': */ +#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \ + BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER)) + +/* + * We append the 'struct fpu' to the task_struct: + */ +static void __init fpu__init_task_struct_size(void) +{ + int task_size = sizeof(struct task_struct); + + /* + * Subtract off the static size of the register state. + * It potentially has a bunch of padding. + */ + task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); + + /* + * Add back the dynamically-calculated register state + * size. + */ + task_size += xstate_size; + + /* + * We dynamically size 'struct fpu', so we require that + * it be at the end of 'thread_struct' and that + * 'thread_struct' be at the end of 'task_struct'. If + * you hit a compile error here, check the structure to + * see if something got added to the end. + */ + CHECK_MEMBER_AT_END_OF(struct fpu, state); + CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); + CHECK_MEMBER_AT_END_OF(struct task_struct, thread); + + arch_task_struct_size = task_size; +} + +/* + * Set up the xstate_size based on the legacy FPU context size. + * + * We set this up first, and later it will be overwritten by + * fpu__init_system_xstate() if the CPU knows about xstates. + */ +static void __init fpu__init_system_xstate_size_legacy(void) +{ + static int on_boot_cpu = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + /* + * Note that xstate_size might be overwriten later during + * fpu__init_system_xstate(). + */ + + if (!cpu_has_fpu) { + /* + * Disable xsave as we do not support it if i387 + * emulation is enabled. + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + xstate_size = sizeof(struct swregs_state); + } else { + if (cpu_has_fxsr) + xstate_size = sizeof(struct fxregs_state); + else + xstate_size = sizeof(struct fregs_state); + } + /* + * Quirk: we don't yet handle the XSAVES* instructions + * correctly, as we don't correctly convert between + * standard and compacted format when interfacing + * with user-space - so disable it for now. + * + * The difference is small: with recent CPUs the + * compacted format is only marginally smaller than + * the standard FPU state format. + * + * ( This is easy to backport while we are fixing + * XSAVES* support. ) + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVES); +} + +/* + * FPU context switching strategies: + * + * Against popular belief, we don't do lazy FPU saves, due to the + * task migration complications it brings on SMP - we only do + * lazy FPU restores. + * + * 'lazy' is the traditional strategy, which is based on setting + * CR0::TS to 1 during context-switch (instead of doing a full + * restore of the FPU state), which causes the first FPU instruction + * after the context switch (whenever it is executed) to fault - at + * which point we lazily restore the FPU state into FPU registers. + * + * Tasks are of course under no obligation to execute FPU instructions, + * so it can easily happen that another context-switch occurs without + * a single FPU instruction being executed. If we eventually switch + * back to the original task (that still owns the FPU) then we have + * not only saved the restores along the way, but we also have the + * FPU ready to be used for the original task. 
+ * + * 'eager' switching is used on modern CPUs, there we switch the FPU + * state during every context switch, regardless of whether the task + * has used FPU instructions in that time slice or not. This is done + * because modern FPU context saving instructions are able to optimize + * state saving and restoration in hardware: they can detect both + * unused and untouched FPU state and optimize accordingly. + * + * [ Note that even in 'lazy' mode we might optimize context switches + * to use 'eager' restores, if we detect that a task is using the FPU + * frequently. See the fpu->counter logic in fpu/internal.h for that. ] + */ +static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; + +static int __init eager_fpu_setup(char *s) +{ + if (!strcmp(s, "on")) + eagerfpu = ENABLE; + else if (!strcmp(s, "off")) + eagerfpu = DISABLE; + else if (!strcmp(s, "auto")) + eagerfpu = AUTO; + return 1; +} +__setup("eagerfpu=", eager_fpu_setup); + +/* + * Pick the FPU context switching strategy: + */ +static void __init fpu__init_system_ctx_switch(void) +{ + static bool on_boot_cpu = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + WARN_ON_FPU(current->thread.fpu.fpstate_active); + current_thread_info()->status = 0; + + /* Auto enable eagerfpu for xsaveopt */ + if (cpu_has_xsaveopt && eagerfpu != DISABLE) + eagerfpu = ENABLE; + + if (xfeatures_mask & XFEATURE_MASK_EAGER) { + if (eagerfpu == DISABLE) { + pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", + xfeatures_mask & XFEATURE_MASK_EAGER); + xfeatures_mask &= ~XFEATURE_MASK_EAGER; + } else { + eagerfpu = ENABLE; + } + } + + if (eagerfpu == ENABLE) + setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + + printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); +} + +/* + * Called on the boot CPU once per system bootup, to set up the initial + * FPU state that is later cloned into all processes: + */ +void __init fpu__init_system(struct cpuinfo_x86 *c) +{ + fpu__init_system_early_generic(c); + + /* + * The FPU has to be operational for some of the + * later FPU init activities: + */ + fpu__init_cpu(); + + /* + * But don't leave CR0::TS set yet, as some of the FPU setup + * methods depend on being able to execute FPU instructions + * that will fault on a set TS, such as the FXSAVE in + * fpu__init_system_mxcsr(). 
+ */ + clts(); + + fpu__init_system_generic(); + fpu__init_system_xstate_size_legacy(); + fpu__init_system_xstate(); + fpu__init_task_struct_size(); + + fpu__init_system_ctx_switch(); +} + +/* + * Boot parameter to turn off FPU support and fall back to math-emu: + */ +static int __init no_387(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FPU); + return 1; +} +__setup("no387", no_387); + +/* + * Disable all xstate CPU features: + */ +static int __init x86_noxsave_setup(char *s) +{ + if (strlen(s)) + return 0; + + fpu__xstate_clear_all_cpu_caps(); + + return 1; +} +__setup("noxsave", x86_noxsave_setup); + +/* + * Disable the XSAVEOPT instruction specifically: + */ +static int __init x86_noxsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + return 1; +} +__setup("noxsaveopt", x86_noxsaveopt_setup); + +/* + * Disable the XSAVES instruction: + */ +static int __init x86_noxsaves_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + return 1; +} +__setup("noxsaves", x86_noxsaves_setup); + +/* + * Disable FX save/restore and SSE support: + */ +static int __init x86_nofxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT); + setup_clear_cpu_cap(X86_FEATURE_XMM); + + return 1; +} +__setup("nofxsr", x86_nofxsr_setup); diff --git a/kernel/arch/x86/kernel/fpu/regset.c b/kernel/arch/x86/kernel/fpu/regset.c new file mode 100644 index 000000000..0bc349042 --- /dev/null +++ b/kernel/arch/x86/kernel/fpu/regset.c @@ -0,0 +1,356 @@ +/* + * FPU register's regset abstraction, for ptrace, core dumps, etc. + */ +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/regset.h> + +/* + * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilites supported by the xsave. + */ +int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + struct fpu *target_fpu = &target->thread.fpu; + + return target_fpu->fpstate_active ? regset->n : 0; +} + +int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + struct fpu *target_fpu = &target->thread.fpu; + + return (cpu_has_fxsr && target_fpu->fpstate_active) ? regset->n : 0; +} + +int xfpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + + if (!cpu_has_fxsr) + return -ENODEV; + + fpu__activate_fpstate_read(fpu); + fpstate_sanitize_xstate(fpu); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &fpu->state.fxsave, 0, -1); +} + +int xfpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + int ret; + + if (!cpu_has_fxsr) + return -ENODEV; + + fpu__activate_fpstate_write(fpu); + fpstate_sanitize_xstate(fpu); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fpu->state.fxsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + + /* + * update the header bits in the xsave header, indicating the + * presence of FP and SSE state. 
+ */ + if (cpu_has_xsave) + fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + + return ret; +} + +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *xsave; + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + fpu__activate_fpstate_read(fpu); + + xsave = &fpu->state.xsave; + + /* + * Copy the 48bytes defined by the software first into the xstate + * memory layout in the thread struct, so that we can copy the entire + * xstateregs to the user using one user_regset_copyout(). + */ + memcpy(&xsave->i387.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + /* + * Copy the xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + return ret; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct xregs_state *xsave; + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + fpu__activate_fpstate_write(fpu); + + xsave = &fpu->state.xsave; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + xsave->header.xfeatures &= xfeatures_mask; + /* + * These bits must be zero. + */ + memset(&xsave->header.reserved, 0, 48); + + return ret; +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + + return tmp; +} + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 + +static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave) +{ + struct _fpxreg *st; + u32 tos = (fxsave->swd >> 11) & 7; + u32 twd = (unsigned long) fxsave->twd; + u32 tag; + u32 ret = 0xffff0000u; + int i; + + for (i = 0; i < 8; i++, twd >>= 1) { + if (twd & 0x1) { + st = FPREG_ADDR(fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = FP_EXP_TAG_SPECIAL; + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) + tag = FP_EXP_TAG_ZERO; + else + tag = FP_EXP_TAG_SPECIAL; + break; + default: + if (st->significand[3] & 0x8000) + tag = FP_EXP_TAG_VALID; + else + tag = FP_EXP_TAG_SPECIAL; + break; + } + } else { + tag = FP_EXP_TAG_EMPTY; + } + ret |= tag << (2 * i); + } + return ret; +} + +/* + * FXSR floating point environment conversions. 
+ */ + +void +convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +{ + struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; + struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + env->cwd = fxsave->cwd | 0xffff0000u; + env->swd = fxsave->swd | 0xffff0000u; + env->twd = twd_fxsr_to_i387(fxsave); + +#ifdef CONFIG_X86_64 + env->fip = fxsave->rip; + env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + env->fcs = task_pt_regs(tsk)->cs; + if (tsk == current) { + savesegment(ds, env->fos); + } else { + env->fos = tsk->thread.ds; + } + env->fos |= 0xffff0000; +#else + env->fip = fxsave->fip; + env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); + env->foo = fxsave->foo; + env->fos = fxsave->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(to[0])); +} + +void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env) + +{ + struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; + struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + fxsave->cwd = env->cwd; + fxsave->swd = env->swd; + fxsave->twd = twd_i387_to_fxsr(env->twd); + fxsave->fop = (u16) ((u32) env->fcs >> 16); +#ifdef CONFIG_X86_64 + fxsave->rip = env->fip; + fxsave->rdp = env->foo; + /* cs and ds ignored */ +#else + fxsave->fip = env->fip; + fxsave->fcs = (env->fcs & 0xffff); + fxsave->foo = env->foo; + fxsave->fos = env->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(from[0])); +} + +int fpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct user_i387_ia32_struct env; + + fpu__activate_fpstate_read(fpu); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + + if (!cpu_has_fxsr) + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &fpu->state.fsave, 0, + -1); + + fpstate_sanitize_xstate(fpu); + + if (kbuf && pos == 0 && count == sizeof(env)) { + convert_from_fxsr(kbuf, target); + return 0; + } + + convert_from_fxsr(&env, target); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); +} + +int fpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct fpu *fpu = &target->thread.fpu; + struct user_i387_ia32_struct env; + int ret; + + fpu__activate_fpstate_write(fpu); + fpstate_sanitize_xstate(fpu); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + + if (!cpu_has_fxsr) + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fpu->state.fsave, 0, + -1); + + if (pos > 0 || count < sizeof(env)) + convert_from_fxsr(&env, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (!ret) + convert_to_fxsr(target, &env); + + /* + * update the header bit in the xsave header, indicating the + * presence of FP. + */ + if (cpu_has_xsave) + fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; + return ret; +} + +/* + * FPU state for core dumps. + * This is only used for a.out dumps now. 
+ * It is declared generically using elf_fpregset_t (which is + * struct user_i387_struct) but is in fact only used for 32-bit + * dumps, so on 64-bit it is really struct user_i387_ia32_struct. + */ +int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) +{ + struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; + int fpvalid; + + fpvalid = fpu->fpstate_active; + if (fpvalid) + fpvalid = !fpregs_get(tsk, NULL, + 0, sizeof(struct user_i387_ia32_struct), + ufpu, NULL); + + return fpvalid; +} +EXPORT_SYMBOL(dump_fpu); + +#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ diff --git a/kernel/arch/x86/kernel/fpu/signal.c b/kernel/arch/x86/kernel/fpu/signal.c new file mode 100644 index 000000000..31c6a6050 --- /dev/null +++ b/kernel/arch/x86/kernel/fpu/signal.c @@ -0,0 +1,403 @@ +/* + * FPU signal frame handling routines. + */ + +#include <linux/compat.h> +#include <linux/cpu.h> + +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/regset.h> + +#include <asm/sigframe.h> + +static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; + +/* + * Check for the presence of extended state information in the + * user fpstate pointer in the sigcontext. + */ +static inline int check_for_xstate(struct fxregs_state __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw) +{ + int min_xstate_size = sizeof(struct fxregs_state) + + sizeof(struct xstate_header); + unsigned int magic2; + + if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) + return -1; + + /* Check for the first magic field and other error scenarios. */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || + fx_sw->xstate_size < min_xstate_size || + fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fx_sw->extended_size) + return -1; + + /* + * Check for the presence of second magic word at the end of memory + * layout. This detects the case where the user just copied the legacy + * fpstate layout with out copying the extended state information + * in the memory layout. + */ + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) + || magic2 != FP_XSTATE_MAGIC2) + return -1; + + return 0; +} + +/* + * Signal frame handlers. + */ +static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +{ + if (use_fxsr()) { + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct user_i387_ia32_struct env; + struct _fpstate_32 __user *fp = buf; + + convert_from_fxsr(&env, tsk); + + if (__copy_to_user(buf, &env, sizeof(env)) || + __put_user(xsave->i387.swd, &fp->status) || + __put_user(X86_FXSR_MAGIC, &fp->magic)) + return -1; + } else { + struct fregs_state __user *fp = buf; + u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) + return -1; + } + + return 0; +} + +static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +{ + struct xregs_state __user *x = buf; + struct _fpx_sw_bytes *sw_bytes; + u32 xfeatures; + int err; + + /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ + sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; + err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + + if (!use_xsave()) + return err; + + err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); + + /* + * Read the xfeatures which we copied (directly from the cpu or + * from the state in task struct) to the user buffers. 
+ */ + err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures); + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xfeatures in the xsave header. + * + * xsave aware apps can change the xfeatures in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xfeatures |= XFEATURE_MASK_FPSSE; + + err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); + + return err; +} + +static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) +{ + int err; + + if (use_xsave()) + err = copy_xregs_to_user(buf); + else if (use_fxsr()) + err = copy_fxregs_to_user((struct fxregs_state __user *) buf); + else + err = copy_fregs_to_user((struct fregs_state __user *) buf); + + if (unlikely(err) && __clear_user(buf, xstate_size)) + err = -EFAULT; + return err; +} + +/* + * Save the fpu, extended register state to the user signal frame. + * + * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save + * state is copied. + * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. + * + * buf == buf_fx for 64-bit frames and 32-bit fsave frame. + * buf != buf_fx for 32-bit frames with fxstate. + * + * If the fpu, extended register state is live, save the state directly + * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, + * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * + * If this is a 32-bit frame with fxstate, put a fsave header before + * the aligned state at 'buf_fx'. + * + * For [f]xsave state, update the SW reserved fields in the [f]xsave frame + * indicating the absence/presence of the extended state to the user. + */ +int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +{ + struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; + struct task_struct *tsk = current; + int ia32_fxstate = (buf != buf_fx); + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!access_ok(VERIFY_WRITE, buf, size)) + return -EACCES; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), NULL, + (struct _fpstate_32 __user *) buf) ? -1 : 1; + + if (fpregs_active()) { + /* Save the live register state to the user directly. */ + if (copy_fpregs_to_sigframe(buf_fx)) + return -1; + /* Update the thread's fxstate to save the fsave header. */ + if (ia32_fxstate) + copy_fxregs_to_kernel(&tsk->thread.fpu); + } else { + fpstate_sanitize_xstate(&tsk->thread.fpu); + if (__copy_to_user(buf_fx, xsave, xstate_size)) + return -1; + } + + /* Save the fsave header for the 32-bit frames. */ + if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) + return -1; + + if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) + return -1; + + return 0; +} + +static inline void +sanitize_restored_xstate(struct task_struct *tsk, + struct user_i387_ia32_struct *ia32_env, + u64 xfeatures, int fx_only) +{ + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xstate_header *header = &xsave->header; + + if (use_xsave()) { + /* These bits must be zero. */ + memset(header->reserved, 0, 48); + + /* + * Init the state that is not present in the memory + * layout and not enabled by the OS. 
+ */ + if (fx_only) + header->xfeatures = XFEATURE_MASK_FPSSE; + else + header->xfeatures &= (xfeatures_mask & xfeatures); + } + + if (use_fxsr()) { + /* + * mscsr reserved bits must be masked to zero for security + * reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + + convert_to_fxsr(tsk, ia32_env); + } +} + +/* + * Restore the extended state if present. Otherwise, restore the FP/SSE state. + */ +static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +{ + if (use_xsave()) { + if ((unsigned long)buf % 64 || fx_only) { + u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return copy_user_to_fxregs(buf); + } else { + u64 init_bv = xfeatures_mask & ~xbv; + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return copy_user_to_xregs(buf, xbv); + } + } else if (use_fxsr()) { + return copy_user_to_fxregs(buf); + } else + return copy_user_to_fregs(buf); +} + +static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) +{ + int ia32_fxstate = (buf != buf_fx); + struct task_struct *tsk = current; + struct fpu *fpu = &tsk->thread.fpu; + int state_size = xstate_size; + u64 xfeatures = 0; + int fx_only = 0; + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!buf) { + fpu__clear(fpu); + return 0; + } + + if (!access_ok(VERIFY_READ, buf, size)) + return -EACCES; + + fpu__activate_curr(fpu); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; + + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; + if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { + /* + * Couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. + */ + state_size = sizeof(struct fxregs_state); + fx_only = 1; + } else { + state_size = fx_sw_user.xstate_size; + xfeatures = fx_sw_user.xfeatures; + } + } + + if (ia32_fxstate) { + /* + * For 32-bit frames with fxstate, copy the user state to the + * thread's fpu state, reconstruct fxstate from the fsave + * header. Sanitize the copied state etc. + */ + struct fpu *fpu = &tsk->thread.fpu; + struct user_i387_ia32_struct env; + int err = 0; + + /* + * Drop the current fpu which clears fpu->fpstate_active. This ensures + * that any context-switch during the copy of the new state, + * avoids the intermediate state from getting restored/saved. + * Thus avoiding the new restored state from getting corrupted. + * We will be ready to restore/save the state only after + * fpu->fpstate_active is again set. + */ + fpu__drop(fpu); + + if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || + __copy_from_user(&env, buf, sizeof(env))) { + fpstate_init(&fpu->state); + err = -1; + } else { + sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); + } + + fpu->fpstate_active = 1; + if (use_eager_fpu()) { + preempt_disable(); + fpu__restore(fpu); + preempt_enable(); + } + + return err; + } else { + /* + * For 64-bit frames and 32-bit fsave frames, restore the user + * state to the registers directly (with exceptions handled). + */ + user_fpu_begin(); + if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { + fpu__clear(fpu); + return -1; + } + } + + return 0; +} + +static inline int xstate_sigframe_size(void) +{ + return use_xsave() ? 
xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; +} + +/* + * Restore FPU state from a sigframe: + */ +int fpu__restore_sig(void __user *buf, int ia32_frame) +{ + void __user *buf_fx = buf; + int size = xstate_sigframe_size(); + + if (ia32_frame && use_fxsr()) { + buf_fx = buf + sizeof(struct fregs_state); + size += sizeof(struct fregs_state); + } + + return __fpu__restore_sig(buf, buf_fx, size); +} + +unsigned long +fpu__alloc_mathframe(unsigned long sp, int ia32_frame, + unsigned long *buf_fx, unsigned long *size) +{ + unsigned long frame_size = xstate_sigframe_size(); + + *buf_fx = sp = round_down(sp - frame_size, 64); + if (ia32_frame && use_fxsr()) { + frame_size += sizeof(struct fregs_state); + sp -= sizeof(struct fregs_state); + } + + *size = frame_size; + + return sp; +} +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed by the fpstate pointer in the sigcontext. + * This will be saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +void fpu__init_prepare_fx_sw_frame(void) +{ + int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + + fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; + fx_sw_reserved.extended_size = size; + fx_sw_reserved.xfeatures = xfeatures_mask; + fx_sw_reserved.xstate_size = xstate_size; + + if (config_enabled(CONFIG_IA32_EMULATION) || + config_enabled(CONFIG_X86_32)) { + int fsave_header_size = sizeof(struct fregs_state); + + fx_sw_reserved_ia32 = fx_sw_reserved; + fx_sw_reserved_ia32.extended_size = size + fsave_header_size; + } +} + diff --git a/kernel/arch/x86/kernel/fpu/xstate.c b/kernel/arch/x86/kernel/fpu/xstate.c new file mode 100644 index 000000000..70fc31222 --- /dev/null +++ b/kernel/arch/x86/kernel/fpu/xstate.c @@ -0,0 +1,752 @@ +/* + * xsave/xrstor support. + * + * Author: Suresh Siddha <suresh.b.siddha@intel.com> + */ +#include <linux/compat.h> +#include <linux/cpu.h> + +#include <asm/fpu/api.h> +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/regset.h> + +#include <asm/tlbflush.h> + +static const char *xfeature_names[] = +{ + "x87 floating point registers" , + "SSE registers" , + "AVX registers" , + "MPX bounds registers" , + "MPX CSR" , + "AVX-512 opmask" , + "AVX-512 Hi256" , + "AVX-512 ZMM_Hi256" , + "unknown xstate feature" , +}; + +/* + * Mask of xstate features supported by the CPU and the kernel: + */ +u64 xfeatures_mask __read_mostly; + +static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; + +/* + * Clear all of the X86_FEATURE_* bits that are unavailable + * when the CPU has no XSAVE support. + */ +void fpu__xstate_clear_all_cpu_caps(void) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_XSAVEC); + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); + setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512PF); + setup_clear_cpu_cap(X86_FEATURE_AVX512ER); + setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_MPX); +} + +/* + * Return whether the system supports a given xfeature. 
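
/*
 * Illustrative arithmetic sketch (not kernel code): fpu__alloc_mathframe()
 * above carves the FPU area of a signal frame out of the user stack
 * pointer - a 64-byte aligned [f]xsave image (plus the trailing MAGIC2
 * word), optionally preceded by a legacy fsave header for 32-bit frames.
 * All numbers below are hypothetical example values, not CPUID output.
 */
#include <stdio.h>

static unsigned long round_down_64(unsigned long x)
{
        return x & ~63UL;       /* round_down(x, 64) */
}

int main(void)
{
        unsigned long sp          = 0x7ffd1234abcdUL; /* hypothetical user stack pointer  */
        unsigned long xstate_size = 832;              /* hypothetical, CPUID-derived size */
        unsigned long magic2_size = 4;                /* trailing FP_XSTATE_MAGIC2 word   */
        unsigned long fsave_hdr   = 112;              /* legacy fsave header, approx.     */
        int ia32_frame_with_fxsr  = 1;

        unsigned long frame_size = xstate_size + magic2_size;
        unsigned long buf_fx = round_down_64(sp - frame_size);
        unsigned long buf = buf_fx;

        if (ia32_frame_with_fxsr) {
                frame_size += fsave_hdr;  /* fsave header goes in front of buf_fx */
                buf = buf_fx - fsave_hdr;
        }

        printf("buf    = %#lx (start of the FPU area)\n", buf);
        printf("buf_fx = %#lx (64-byte aligned [f]xsave image)\n", buf_fx);
        printf("total  = %lu bytes\n", frame_size);
        return 0;
}
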
+ * + * Also return the name of the (most advanced) feature that the caller requested: + */ +int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) +{ + u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask; + + if (unlikely(feature_name)) { + long xfeature_idx, max_idx; + u64 xfeatures_print; + /* + * So we use FLS here to be able to print the most advanced + * feature that was requested but is missing. So if a driver + * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the + * missing AVX feature - this is the most informative message + * to users: + */ + if (xfeatures_missing) + xfeatures_print = xfeatures_missing; + else + xfeatures_print = xfeatures_needed; + + xfeature_idx = fls64(xfeatures_print)-1; + max_idx = ARRAY_SIZE(xfeature_names)-1; + xfeature_idx = min(xfeature_idx, max_idx); + + *feature_name = xfeature_names[xfeature_idx]; + } + + if (xfeatures_missing) + return 0; + + return 1; +} +EXPORT_SYMBOL_GPL(cpu_has_xfeatures); + +/* + * When executing XSAVEOPT (or other optimized XSAVE instructions), if + * a processor implementation detects that an FPU state component is still + * (or is again) in its initialized state, it may clear the corresponding + * bit in the header.xfeatures field, and can skip the writeout of registers + * to the corresponding memory layout. + * + * This means that when the bit is zero, the state component might still contain + * some previous - non-initialized register state. + * + * Before writing xstate information to user-space we sanitize those components, + * to always ensure that the memory layout of a feature will be in the init state + * if the corresponding header bit is zero. This is to ensure that user-space doesn't + * see some stale state in the memory layout during signal handling, debugging etc. + */ +void fpstate_sanitize_xstate(struct fpu *fpu) +{ + struct fxregs_state *fx = &fpu->state.fxsave; + int feature_bit; + u64 xfeatures; + + if (!use_xsaveopt()) + return; + + xfeatures = fpu->state.xsave.header.xfeatures; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is up to date. + */ + if ((xfeatures & xfeatures_mask) == xfeatures_mask) + return; + + /* + * FP is in init state + */ + if (!(xfeatures & XFEATURE_MASK_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xfeatures & XFEATURE_MASK_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + /* + * First two features are FPU and SSE, which above we handled + * in a special way already: + */ + feature_bit = 0x2; + xfeatures = (xfeatures_mask & ~xfeatures) >> 2; + + /* + * Update all the remaining memory layouts according to their + * standard xstate layout, if their header bit is in the init + * state: + */ + while (xfeatures) { + if (xfeatures & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy((void *)fx + offset, + (void *)&init_fpstate.xsave + offset, + size); + } + + xfeatures >>= 1; + feature_bit++; + } +} + +/* + * Enable the extended processor state save/restore feature. + * Called once per CPU onlining. 
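
/*
 * Illustrative user-space counterpart (not kernel code), assuming a
 * reasonably recent GCC/Clang and binutils: fpu__init_cpu_xstate() just
 * below sets CR4.OSXSAVE and programs XCR0 via XSETBV; once that has
 * happened, user space may read XCR0 back with the unprivileged XGETBV
 * instruction, after checking the OSXSAVE bit (CPUID.01H:ECX[27]).
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx, lo, hi;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx) || !(ecx & (1u << 27))) {
                fprintf(stderr, "OSXSAVE not enabled by the OS\n");
                return 1;
        }

        /* XGETBV with ECX=0 returns XCR0 in EDX:EAX */
        asm volatile("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));

        printf("XCR0 = 0x%llx\n",
               (unsigned long long)lo | ((unsigned long long)hi << 32));
        return 0;
}
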
+ */ +void fpu__init_cpu_xstate(void) +{ + if (!cpu_has_xsave || !xfeatures_mask) + return; + + cr4_set_bits(X86_CR4_OSXSAVE); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); +} + +/* + * Note that in the future we will likely need a pair of + * functions here: one for user xstates and the other for + * system xstates. For now, they are the same. + */ +static int xfeature_enabled(enum xfeature xfeature) +{ + return !!(xfeatures_mask & (1UL << xfeature)); +} + +/* + * Record the offsets and sizes of various xstates contained + * in the XSAVE state memory layout. + */ +static void __init setup_xstate_features(void) +{ + u32 eax, ebx, ecx, edx, i; + /* start at the beginnning of the "extended state" */ + unsigned int last_good_offset = offsetof(struct xregs_state, + extended_state_area); + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_offsets[i] = ebx; + xstate_sizes[i] = eax; + /* + * In our xstate size checks, we assume that the + * highest-numbered xstate feature has the + * highest offset in the buffer. Ensure it does. + */ + WARN_ONCE(last_good_offset > xstate_offsets[i], + "x86/fpu: misordered xstate at %d\n", last_good_offset); + last_good_offset = xstate_offsets[i]; + + printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax); + } +} + +static void __init print_xstate_feature(u64 xstate_mask) +{ + const char *feature_name; + + if (cpu_has_xfeatures(xstate_mask, &feature_name)) + pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name); +} + +/* + * Print out all the supported xstate features: + */ +static void __init print_xstate_features(void) +{ + print_xstate_feature(XFEATURE_MASK_FP); + print_xstate_feature(XFEATURE_MASK_SSE); + print_xstate_feature(XFEATURE_MASK_YMM); + print_xstate_feature(XFEATURE_MASK_BNDREGS); + print_xstate_feature(XFEATURE_MASK_BNDCSR); + print_xstate_feature(XFEATURE_MASK_OPMASK); + print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); + print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); +} + +/* + * This function sets up offsets and sizes of all extended states in + * xsave area. This supports both standard format and compacted format + * of the xsave aread. + */ +static void __init setup_xstate_comp(void) +{ + unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; + int i; + + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. 
+ */ + xstate_comp_offsets[0] = 0; + xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); + + if (!cpu_has_xsaves) { + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (xfeature_enabled(i)) { + xstate_comp_offsets[i] = xstate_offsets[i]; + xstate_comp_sizes[i] = xstate_sizes[i]; + } + } + return; + } + + xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = + FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (xfeature_enabled(i)) + xstate_comp_sizes[i] = xstate_sizes[i]; + else + xstate_comp_sizes[i] = 0; + + if (i > FIRST_EXTENDED_XFEATURE) + xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + + xstate_comp_sizes[i-1]; + + } +} + +/* + * setup the xstate image representing the init state + */ +static void __init setup_init_fpu_buf(void) +{ + static int on_boot_cpu = 1; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + if (!cpu_has_xsave) + return; + + setup_xstate_features(); + print_xstate_features(); + + if (cpu_has_xsaves) { + init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; + init_fpstate.xsave.header.xfeatures = xfeatures_mask; + } + + /* + * Init all the features state with header_bv being 0x0 + */ + copy_kernel_to_xregs_booting(&init_fpstate.xsave); + + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + copy_xregs_to_kernel_booting(&init_fpstate.xsave); +} + +static int xfeature_is_supervisor(int xfeature_nr) +{ + /* + * We currently do not support supervisor states, but if + * we did, we could find out like this. + * + * SDM says: If state component i is a user state component, + * ECX[0] return 0; if state component i is a supervisor + * state component, ECX[0] returns 1. + u32 eax, ebx, ecx, edx; + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx; + return !!(ecx & 1); + */ + return 0; +} +/* +static int xfeature_is_user(int xfeature_nr) +{ + return !xfeature_is_supervisor(xfeature_nr); +} +*/ + +/* + * This check is important because it is easy to get XSTATE_* + * confused with XSTATE_BIT_*. + */ +#define CHECK_XFEATURE(nr) do { \ + WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ + WARN_ON(nr >= XFEATURE_MAX); \ +} while (0) + +/* + * We could cache this like xstate_size[], but we only use + * it here, so it would be a waste of space. + */ +static int xfeature_is_aligned(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + /* + * The value returned by ECX[1] indicates the alignment + * of state component i when the compacted format + * of the extended region of an XSAVE area is used + */ + return !!(ecx & 2); +} + +static int xfeature_uncompacted_offset(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return ebx; +} + +static int xfeature_size(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return eax; +} + +/* + * 'XSAVES' implies two different things: + * 1. saving of supervisor/system state + * 2. using the compacted format + * + * Use this function when dealing with the compacted format so + * that it is obvious which aspect of 'XSAVES' is being handled + * by the calling code. 
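
/*
 * Illustrative sketch (not kernel code) of the compacted-format layout
 * described above: each enabled extended component follows the previous
 * one, starting right after the 512-byte legacy area plus the 64-byte
 * XSAVE header, and components that advertise the alignment bit (cf.
 * xfeature_is_aligned() above) are rounded up to 64 bytes first. The
 * component sizes here are made-up example values, not CPUID output.
 */
#include <stdio.h>

#define DEMO_LEGACY_AND_HDR     (512 + 64)
#define DEMO_NR_COMPONENTS      5

int main(void)
{
        /* sizes for extended components 2..6; 0 means "not enabled" */
        unsigned int size[DEMO_NR_COMPONENTS]  = { 256, 0, 64, 64, 512 };
        unsigned int align[DEMO_NR_COMPONENTS] = { 0, 0, 0, 0, 1 };
        unsigned int offset = DEMO_LEGACY_AND_HDR;
        int i;

        for (i = 0; i < DEMO_NR_COMPONENTS; i++) {
                if (!size[i])
                        continue;
                if (align[i])
                        offset = (offset + 63) & ~63u;  /* ALIGN(offset, 64) */
                printf("component %d: offset %4u, size %4u\n",
                       i + 2, offset, size[i]);
                offset += size[i];
        }
        printf("total compacted size: %u bytes\n", offset);
        return 0;
}
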
+ */ +static int using_compacted_format(void) +{ + return cpu_has_xsaves; +} + +static void __xstate_dump_leaves(void) +{ + int i; + u32 eax, ebx, ecx, edx; + static int should_dump = 1; + + if (!should_dump) + return; + should_dump = 0; + /* + * Dump out a few leaves past the ones that we support + * just in case there are some goodies up there + */ + for (i = 0; i < XFEATURE_MAX + 10; i++) { + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", + XSTATE_CPUID, i, eax, ebx, ecx, edx); + } +} + +#define XSTATE_WARN_ON(x) do { \ + if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \ + __xstate_dump_leaves(); \ + } \ +} while (0) + +#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \ + if ((nr == nr_macro) && \ + WARN_ONCE(sz != sizeof(__struct), \ + "%s: struct is %zu bytes, cpu state %d bytes\n", \ + __stringify(nr_macro), sizeof(__struct), sz)) { \ + __xstate_dump_leaves(); \ + } \ +} while (0) + +/* + * We have a C struct for each 'xstate'. We need to ensure + * that our software representation matches what the CPU + * tells us about the state's size. + */ +static void check_xstate_against_struct(int nr) +{ + /* + * Ask the CPU for the size of the state. + */ + int sz = xfeature_size(nr); + /* + * Match each CPU state with the corresponding software + * structure. + */ + XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct); + XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state); + XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state); + XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state); + XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); + XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); + + /* + * Make *SURE* to add any feature numbers in below if + * there are "holes" in the xsave state component + * numbers. + */ + if ((nr < XFEATURE_YMM) || + (nr >= XFEATURE_MAX)) { + WARN_ONCE(1, "no structure for xstate: %d\n", nr); + XSTATE_WARN_ON(1); + } +} + +/* + * This essentially double-checks what the cpu told us about + * how large the XSAVE buffer needs to be. We are recalculating + * it to be safe. + */ +static void do_extra_xstate_size_checks(void) +{ + int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + int i; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + + check_xstate_against_struct(i); + /* + * Supervisor state components can be managed only by + * XSAVES, which is compacted-format only. + */ + if (!using_compacted_format()) + XSTATE_WARN_ON(xfeature_is_supervisor(i)); + + /* Align from the end of the previous feature */ + if (xfeature_is_aligned(i)) + paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64); + /* + * The offset of a given state in the non-compacted + * format is given to us in a CPUID leaf. We check + * them for being ordered (increasing offsets) in + * setup_xstate_features(). + */ + if (!using_compacted_format()) + paranoid_xstate_size = xfeature_uncompacted_offset(i); + /* + * The compacted-format offset always depends on where + * the previous state ended. + */ + paranoid_xstate_size += xfeature_size(i); + } + XSTATE_WARN_ON(paranoid_xstate_size != xstate_size); +} + +/* + * Calculate total size of enabled xstates in XCR0/xfeatures_mask. + * + * Note the SDM's wording here. "sub-function 0" only enumerates + * the size of the *user* states. 
If we use it to size a buffer + * that we use 'XSAVES' on, we could potentially overflow the + * buffer because 'XSAVES' saves system states too. + * + * Note that we do not currently set any bits on IA32_XSS so + * 'XCR0 | IA32_XSS == XCR0' for now. + */ +static unsigned int __init calculate_xstate_size(void) +{ + unsigned int eax, ebx, ecx, edx; + unsigned int calculated_xstate_size; + + if (!cpu_has_xsaves) { + /* + * - CPUID function 0DH, sub-function 0: + * EBX enumerates the size (in bytes) required by + * the XSAVE instruction for an XSAVE area + * containing all the *user* state components + * corresponding to bits currently set in XCR0. + */ + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + calculated_xstate_size = ebx; + } else { + /* + * - CPUID function 0DH, sub-function 1: + * EBX enumerates the size (in bytes) required by + * the XSAVES instruction for an XSAVE area + * containing all the state components + * corresponding to bits currently set in + * XCR0 | IA32_XSS. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + calculated_xstate_size = ebx; + } + return calculated_xstate_size; +} + +/* + * Will the runtime-enumerated 'xstate_size' fit in the init + * task's statically-allocated buffer? + */ +static bool is_supported_xstate_size(unsigned int test_xstate_size) +{ + if (test_xstate_size <= sizeof(union fpregs_state)) + return true; + + pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", + sizeof(union fpregs_state), test_xstate_size); + return false; +} + +static int init_xstate_size(void) +{ + /* Recompute the context size for enabled features: */ + unsigned int possible_xstate_size = calculate_xstate_size(); + + /* Ensure we have the space to store all enabled: */ + if (!is_supported_xstate_size(possible_xstate_size)) + return -EINVAL; + + /* + * The size is OK, we are definitely going to use xsave, + * make it known to the world that we need more space. + */ + xstate_size = possible_xstate_size; + do_extra_xstate_size_checks(); + return 0; +} + +/* + * We enabled the XSAVE hardware, but something went wrong and + * we can not use it. Disable it. + */ +static void fpu__init_disable_system_xstate(void) +{ + xfeatures_mask = 0; + cr4_clear_bits(X86_CR4_OSXSAVE); + fpu__xstate_clear_all_cpu_caps(); +} + +/* + * Enable and initialize the xsave feature. + * Called once per system bootup. 
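
/*
 * Illustrative user-space sketch (not kernel code), assuming a toolchain
 * that provides <cpuid.h>: the CPUID leaf 0xD, sub-leaf 0 queries used by
 * calculate_xstate_size() above and fpu__init_system_xstate() below can
 * also be issued directly to inspect the supported feature mask and the
 * standard-format buffer sizes.
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (__get_cpuid_max(0, 0) < 0x0d) {
                fprintf(stderr, "CPUID leaf 0xD not available\n");
                return 1;
        }

        __cpuid_count(0x0d, 0, eax, ebx, ecx, edx);

        printf("XCR0-supportable features    : 0x%llx\n",
               (unsigned long long)eax | ((unsigned long long)edx << 32));
        printf("size for enabled user states : %u bytes (EBX)\n", ebx);
        printf("max size for all user states : %u bytes (ECX)\n", ecx);
        return 0;
}
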
+ */ +void __init fpu__init_system_xstate(void) +{ + unsigned int eax, ebx, ecx, edx; + static int on_boot_cpu = 1; + int err; + + WARN_ON_FPU(!on_boot_cpu); + on_boot_cpu = 0; + + if (!cpu_has_xsave) { + pr_info("x86/fpu: Legacy x87 FPU detected.\n"); + return; + } + + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN_ON_FPU(1); + return; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + xfeatures_mask = eax + ((u64)edx << 32); + + if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); + BUG(); + } + + /* Support only the state known to the OS: */ + xfeatures_mask = xfeatures_mask & XCNTXT_MASK; + + /* Enable xstate instructions to be able to continue with initialization: */ + fpu__init_cpu_xstate(); + err = init_xstate_size(); + if (err) { + /* something went wrong, boot without any XSAVE support */ + fpu__init_disable_system_xstate(); + return; + } + + update_regset_xstate_info(xstate_size, xfeatures_mask); + fpu__init_prepare_fx_sw_frame(); + setup_init_fpu_buf(); + setup_xstate_comp(); + + pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", + xfeatures_mask, + xstate_size, + cpu_has_xsaves ? "compacted" : "standard"); +} + +/* + * Restore minimal FPU state after suspend: + */ +void fpu__resume_cpu(void) +{ + /* + * Restore XCR0 on xsave capable CPUs: + */ + if (cpu_has_xsave) + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); +} + +/* + * Given the xsave area and a state inside, this function returns the + * address of the state. + * + * This is the API that is called to get xstate address in either + * standard format or compacted format of xsave area. + * + * Note that if there is no data for the field in the xsave buffer + * this will return NULL. + * + * Inputs: + * xstate: the thread's storage area for all FPU data + * xstate_feature: state which is defined in xsave.h (e.g. + * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...) + * Output: + * address of the state in the xsave area, or NULL if the + * field is not present in the xsave buffer. + */ +void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) +{ + int feature_nr = fls64(xstate_feature) - 1; + /* + * Do we even *have* xsave state? + */ + if (!boot_cpu_has(X86_FEATURE_XSAVE)) + return NULL; + + /* + * We should not ever be requesting features that we + * have not enabled. Remember that pcntxt_mask is + * what we write to the XCR0 register. + */ + WARN_ONCE(!(xfeatures_mask & xstate_feature), + "get of unsupported state"); + /* + * This assumes the last 'xsave*' instruction to + * have requested that 'xstate_feature' be saved. + * If it did not, we might be seeing and old value + * of the field in the buffer. + * + * This can happen because the last 'xsave' did not + * request that this feature be saved (unlikely) + * or because the "init optimization" caused it + * to not be saved. + */ + if (!(xsave->header.xfeatures & xstate_feature)) + return NULL; + + return (void *)xsave + xstate_comp_offsets[feature_nr]; +} +EXPORT_SYMBOL_GPL(get_xsave_addr); + +/* + * This wraps up the common operations that need to occur when retrieving + * data from xsave state. It first ensures that the current task was + * using the FPU and retrieves the data in to a buffer. It then calculates + * the offset of the requested field in the buffer. + * + * This function is safe to call whether the FPU is in use or not. 
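
/*
 * Minimal user-space analogue (an editor's sketch, not the kernel API):
 * the lookup in get_xsave_addr() above boils down to "is the feature's
 * bit set in header.xfeatures, and if so, what is its cached offset".
 * The offsets below are example numbers only.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_NR_FEATURES 8

static const int demo_comp_offset[DEMO_NR_FEATURES] = {
        0, 160, 576, -1, -1, -1, -1, -1
};

static void *demo_get_xsave_addr(void *xsave, uint64_t header_xfeatures,
                                 int feature_nr)
{
        if (!(header_xfeatures & (1ULL << feature_nr)))
                return NULL;    /* component is in its init state: no data in the buffer */
        return (char *)xsave + demo_comp_offset[feature_nr];
}

int main(void)
{
        char buf[4096] = { 0 };
        /* FP (bit 0) and YMM (bit 2) present, SSE (bit 1) in init state */
        uint64_t xfeatures = (1ULL << 0) | (1ULL << 2);

        printf("FP  at %p\n", demo_get_xsave_addr(buf, xfeatures, 0));
        printf("SSE at %p\n", demo_get_xsave_addr(buf, xfeatures, 1)); /* NULL */
        printf("YMM at %p\n", demo_get_xsave_addr(buf, xfeatures, 2));
        return 0;
}
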
+ * + * Note that this only works on the current task. + * + * Inputs: + * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, + * XFEATURE_MASK_SSE, etc...) + * Output: + * address of the state in the xsave area or NULL if the state + * is not present or is in its 'init state'. + */ +const void *get_xsave_field_ptr(int xsave_state) +{ + struct fpu *fpu = ¤t->thread.fpu; + + if (!fpu->fpstate_active) + return NULL; + /* + * fpu__save() takes the CPU's xstate registers + * and saves them off to the 'fpu memory buffer. + */ + fpu__save(fpu); + + return get_xsave_addr(&fpu->state.xsave, xsave_state); +} diff --git a/kernel/arch/x86/kernel/ftrace.c b/kernel/arch/x86/kernel/ftrace.c index 8b7b0a51e..311bcf338 100644 --- a/kernel/arch/x86/kernel/ftrace.c +++ b/kernel/arch/x86/kernel/ftrace.c @@ -556,6 +556,7 @@ void ftrace_replace_code(int enable) run_sync(); report = "updating code"; + count = 0; for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); @@ -563,11 +564,13 @@ void ftrace_replace_code(int enable) ret = add_update(rec, enable); if (ret) goto remove_breakpoints; + count++; } run_sync(); report = "removing breakpoints"; + count = 0; for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); @@ -575,6 +578,7 @@ void ftrace_replace_code(int enable) ret = finish_update(rec, enable); if (ret) goto remove_breakpoints; + count++; } run_sync(); diff --git a/kernel/arch/x86/kernel/head_32.S b/kernel/arch/x86/kernel/head_32.S index 7e429c99c..6bc9ae24b 100644 --- a/kernel/arch/x86/kernel/head_32.S +++ b/kernel/arch/x86/kernel/head_32.S @@ -152,7 +152,7 @@ ENTRY(startup_32) movl %eax, pa(olpc_ofw_pgd) #endif -#ifdef CONFIG_MICROCODE_EARLY +#ifdef CONFIG_MICROCODE /* Early load ucode on BSP. */ call load_ucode_bsp #endif @@ -311,12 +311,11 @@ ENTRY(startup_32_smp) movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp -#ifdef CONFIG_MICROCODE_EARLY +#ifdef CONFIG_MICROCODE /* Early load ucode on AP. */ call load_ucode_ap #endif - default_entry: #define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ @@ -557,7 +556,7 @@ early_idt_handler_common: cld cmpl $2,(%esp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,%ss:early_recursion_flag je hlt_loop @@ -610,7 +609,7 @@ ex_entry: pop %ecx pop %eax decl %ss:early_recursion_flag -is_nmi: +.Lis_nmi: addl $8,%esp /* drop vector number and error code */ iret ENDPROC(early_idt_handler_common) diff --git a/kernel/arch/x86/kernel/head_64.S b/kernel/arch/x86/kernel/head_64.S index 7e5da2cbe..ffdc0e860 100644 --- a/kernel/arch/x86/kernel/head_64.S +++ b/kernel/arch/x86/kernel/head_64.S @@ -65,6 +65,9 @@ startup_64: * tables and then reload them. */ + /* Sanitize CPU configuration */ + call verify_cpu + /* * Compute the delta between the address I am compiled to run at and the * address I am actually running at. @@ -174,6 +177,9 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + /* Sanitize CPU configuration */ + call verify_cpu + movq $(init_level4_pgt - __START_KERNEL_map), %rax 1: @@ -288,6 +294,8 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq +#include "verify_cpu.S" + #ifdef CONFIG_HOTPLUG_CPU /* * Boot CPU0 entry point. It's called from play_dead(). 
Everything has been set @@ -346,7 +354,7 @@ early_idt_handler_common: cld cmpl $2,(%rsp) # X86_TRAP_NMI - je is_nmi # Ignore NMI + je .Lis_nmi # Ignore NMI cmpl $2,early_recursion_flag(%rip) jz 1f @@ -411,7 +419,7 @@ early_idt_handler_common: popq %rcx popq %rax decl early_recursion_flag(%rip) -is_nmi: +.Lis_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN ENDPROC(early_idt_handler_common) diff --git a/kernel/arch/x86/kernel/hpet.c b/kernel/arch/x86/kernel/hpet.c index 3acbff471..b8e6ff5cd 100644 --- a/kernel/arch/x86/kernel/hpet.c +++ b/kernel/arch/x86/kernel/hpet.c @@ -12,6 +12,7 @@ #include <linux/pm.h> #include <linux/io.h> +#include <asm/irqdomain.h> #include <asm/fixmap.h> #include <asm/hpet.h> #include <asm/time.h> @@ -36,10 +37,10 @@ */ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ -u8 hpet_msi_disable; +bool hpet_msi_disable; #ifdef CONFIG_PCI_MSI -static unsigned long hpet_num_timers; +static unsigned int hpet_num_timers; #endif static void __iomem *hpet_virt_address; @@ -85,9 +86,9 @@ static inline void hpet_clear_mapping(void) /* * HPET command line enable / disable */ -int boot_hpet_disable; -int hpet_force_user; -static int hpet_verbose; +bool boot_hpet_disable; +bool hpet_force_user; +static bool hpet_verbose; static int __init hpet_setup(char *str) { @@ -97,11 +98,11 @@ static int __init hpet_setup(char *str) if (next) *next++ = 0; if (!strncmp("disable", str, 7)) - boot_hpet_disable = 1; + boot_hpet_disable = true; if (!strncmp("force", str, 5)) - hpet_force_user = 1; + hpet_force_user = true; if (!strncmp("verbose", str, 7)) - hpet_verbose = 1; + hpet_verbose = true; str = next; } return 1; @@ -110,7 +111,7 @@ __setup("hpet=", hpet_setup); static int __init disable_hpet(char *str) { - boot_hpet_disable = 1; + boot_hpet_disable = true; return 1; } __setup("nohpet", disable_hpet); @@ -123,7 +124,7 @@ static inline int is_hpet_capable(void) /* * HPET timer interrupt enable / disable */ -static int hpet_legacy_int_enabled; +static bool hpet_legacy_int_enabled; /** * is_hpet_enabled - check whether the hpet timer interrupt is enabled @@ -225,26 +226,11 @@ static void hpet_reserve_platform_timers(unsigned int id) { } */ static unsigned long hpet_freq; -static void hpet_legacy_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt); -static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt); - -/* - * The hpet clock event device - */ -static struct clock_event_device hpet_clockevent = { - .name = "hpet", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = hpet_legacy_set_mode, - .set_next_event = hpet_legacy_next_event, - .irq = 0, - .rating = 50, -}; +static struct clock_event_device hpet_clockevent; static void hpet_stop_counter(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + u32 cfg = hpet_readl(HPET_CFG); cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -286,7 +272,7 @@ static void hpet_enable_legacy_int(void) cfg |= HPET_CFG_LEGACY; hpet_writel(cfg, HPET_CFG); - hpet_legacy_int_enabled = 1; + hpet_legacy_int_enabled = true; } static void hpet_legacy_clockevent_register(void) @@ -305,66 +291,74 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static int hpet_setup_msi_irq(unsigned int irq); - -static void hpet_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt, int timer) +static int hpet_set_periodic(struct clock_event_device *evt, int timer) { unsigned int cfg, cmp, 
now; uint64_t delta; - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - hpet_stop_counter(); - delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; - delta >>= evt->shift; - now = hpet_readl(HPET_COUNTER); - cmp = now + (unsigned int) delta; - cfg = hpet_readl(HPET_Tn_CFG(timer)); - cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | - HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_Tn_CFG(timer)); - hpet_writel(cmp, HPET_Tn_CMP(timer)); - udelay(1); - /* - * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL - * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL - * bit is automatically cleared after the first write. - * (See AMD-8111 HyperTransport I/O Hub Data Sheet, - * Publication # 24674) - */ - hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer)); - hpet_start_counter(); - hpet_print_config(); - break; + hpet_stop_counter(); + delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult; + delta >>= evt->shift; + now = hpet_readl(HPET_COUNTER); + cmp = now + (unsigned int)delta; + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + hpet_writel(cmp, HPET_Tn_CMP(timer)); + udelay(1); + /* + * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL + * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL + * bit is automatically cleared after the first write. + * (See AMD-8111 HyperTransport I/O Hub Data Sheet, + * Publication # 24674) + */ + hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer)); + hpet_start_counter(); + hpet_print_config(); - case CLOCK_EVT_MODE_ONESHOT: - cfg = hpet_readl(HPET_Tn_CFG(timer)); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_Tn_CFG(timer)); - break; + return 0; +} - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - cfg = hpet_readl(HPET_Tn_CFG(timer)); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_Tn_CFG(timer)); - break; +static int hpet_set_oneshot(struct clock_event_device *evt, int timer) +{ + unsigned int cfg; - case CLOCK_EVT_MODE_RESUME: - if (timer == 0) { - hpet_enable_legacy_int(); - } else { - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - hpet_setup_msi_irq(hdev->irq); - disable_irq(hdev->irq); - irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); - enable_irq(hdev->irq); - } - hpet_print_config(); - break; + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + + return 0; +} + +static int hpet_shutdown(struct clock_event_device *evt, int timer) +{ + unsigned int cfg; + + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + + return 0; +} + +static int hpet_resume(struct clock_event_device *evt, int timer) +{ + if (!timer) { + hpet_enable_legacy_int(); + } else { + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + + irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); + disable_irq(hdev->irq); + irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); + enable_irq(hdev->irq); } + hpet_print_config(); + + return 0; } static int hpet_next_event(unsigned long delta, @@ -404,10 +398,24 @@ static int hpet_next_event(unsigned long delta, return res < HPET_MIN_CYCLES ? 
-ETIME : 0; } -static void hpet_legacy_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int hpet_legacy_shutdown(struct clock_event_device *evt) +{ + return hpet_shutdown(evt, 0); +} + +static int hpet_legacy_set_oneshot(struct clock_event_device *evt) { - hpet_set_mode(mode, evt, 0); + return hpet_set_oneshot(evt, 0); +} + +static int hpet_legacy_set_periodic(struct clock_event_device *evt) +{ + return hpet_set_periodic(evt, 0); +} + +static int hpet_legacy_resume(struct clock_event_device *evt) +{ + return hpet_resume(evt, 0); } static int hpet_legacy_next_event(unsigned long delta, @@ -417,16 +425,33 @@ static int hpet_legacy_next_event(unsigned long delta, } /* + * The hpet clock event device + */ +static struct clock_event_device hpet_clockevent = { + .name = "hpet", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT, + .set_state_periodic = hpet_legacy_set_periodic, + .set_state_oneshot = hpet_legacy_set_oneshot, + .set_state_shutdown = hpet_legacy_shutdown, + .tick_resume = hpet_legacy_resume, + .set_next_event = hpet_legacy_next_event, + .irq = 0, + .rating = 50, +}; + +/* * HPET MSI Support */ #ifdef CONFIG_PCI_MSI static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); static struct hpet_dev *hpet_devs; +static struct irq_domain *hpet_domain; void hpet_msi_unmask(struct irq_data *data) { - struct hpet_dev *hdev = data->handler_data; + struct hpet_dev *hdev = irq_data_get_irq_handler_data(data); unsigned int cfg; /* unmask it */ @@ -437,7 +462,7 @@ void hpet_msi_unmask(struct irq_data *data) void hpet_msi_mask(struct irq_data *data) { - struct hpet_dev *hdev = data->handler_data; + struct hpet_dev *hdev = irq_data_get_irq_handler_data(data); unsigned int cfg; /* mask it */ @@ -459,43 +484,39 @@ void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg) msg->address_hi = 0; } -static void hpet_msi_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int hpet_msi_shutdown(struct clock_event_device *evt) { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - hpet_set_mode(mode, evt, hdev->num); + + return hpet_shutdown(evt, hdev->num); } -static int hpet_msi_next_event(unsigned long delta, - struct clock_event_device *evt) +static int hpet_msi_set_oneshot(struct clock_event_device *evt) { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - return hpet_next_event(delta, evt, hdev->num); -} -static int hpet_setup_msi_irq(unsigned int irq) -{ - if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { - irq_free_hwirq(irq); - return -EINVAL; - } - return 0; + return hpet_set_oneshot(evt, hdev->num); } -static int hpet_assign_irq(struct hpet_dev *dev) +static int hpet_msi_set_periodic(struct clock_event_device *evt) { - unsigned int irq = irq_alloc_hwirq(-1); + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - if (!irq) - return -EINVAL; + return hpet_set_periodic(evt, hdev->num); +} - irq_set_handler_data(irq, dev); +static int hpet_msi_resume(struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - if (hpet_setup_msi_irq(irq)) - return -EINVAL; + return hpet_resume(evt, hdev->num); +} - dev->irq = irq; - return 0; +static int hpet_msi_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + return hpet_next_event(delta, evt, hdev->num); } static irqreturn_t hpet_interrupt_handler(int irq, void *data) @@ -540,9 +561,6 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) if (!(hdev->flags & HPET_DEV_VALID)) return; - if 
(hpet_setup_msi_irq(hdev->irq)) - return; - hdev->cpu = cpu; per_cpu(cpu_hpet_dev, cpu) = hdev; evt->name = hdev->name; @@ -551,10 +569,14 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) evt->rating = 110; evt->features = CLOCK_EVT_FEAT_ONESHOT; - if (hdev->flags & HPET_DEV_PERI_CAP) + if (hdev->flags & HPET_DEV_PERI_CAP) { evt->features |= CLOCK_EVT_FEAT_PERIODIC; + evt->set_state_periodic = hpet_msi_set_periodic; + } - evt->set_mode = hpet_msi_set_mode; + evt->set_state_shutdown = hpet_msi_shutdown; + evt->set_state_oneshot = hpet_msi_set_oneshot; + evt->tick_resume = hpet_msi_resume; evt->set_next_event = hpet_msi_next_event; evt->cpumask = cpumask_of(hdev->cpu); @@ -574,7 +596,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int id; unsigned int num_timers; unsigned int num_timers_used = 0; - int i; + int i, irq; if (hpet_msi_disable) return; @@ -587,6 +609,10 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) num_timers++; /* Value read out starts from 0 */ hpet_print_config(); + hpet_domain = hpet_create_irq_domain(hpet_blockid); + if (!hpet_domain) + return; + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); if (!hpet_devs) return; @@ -604,12 +630,14 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) hdev->flags = 0; if (cfg & HPET_TN_PERIODIC_CAP) hdev->flags |= HPET_DEV_PERI_CAP; + sprintf(hdev->name, "hpet%d", i); hdev->num = i; - sprintf(hdev->name, "hpet%d", i); - if (hpet_assign_irq(hdev)) + irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); + if (irq <= 0) continue; + hdev->irq = irq; hdev->flags |= HPET_DEV_FSB_CAP; hdev->flags |= HPET_DEV_VALID; num_timers_used++; @@ -709,10 +737,6 @@ static int hpet_cpuhp_notify(struct notifier_block *n, } #else -static int hpet_setup_msi_irq(unsigned int irq) -{ - return 0; -} static void hpet_msi_capability_lookup(unsigned int start_timer) { return; @@ -761,7 +785,7 @@ static int hpet_clocksource_register(void) /* Verify whether hpet counter works */ t1 = hpet_readl(HPET_COUNTER); - rdtscll(start); + start = rdtsc(); /* * We don't know the TSC frequency yet, but waiting for @@ -771,7 +795,7 @@ static int hpet_clocksource_register(void) */ do { rep_nop(); - rdtscll(now); + now = rdtsc(); } while ((now - start) < 200000UL); if (t1 == hpet_readl(HPET_COUNTER)) { @@ -959,7 +983,7 @@ void hpet_disable(void) cfg = *hpet_boot_cfg; else if (hpet_legacy_int_enabled) { cfg &= ~HPET_CFG_LEGACY; - hpet_legacy_int_enabled = 0; + hpet_legacy_int_enabled = false; } cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); @@ -1097,8 +1121,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); static void hpet_disable_rtc_channel(void) { - unsigned long cfg; - cfg = hpet_readl(HPET_T1_CFG); + u32 cfg = hpet_readl(HPET_T1_CFG); cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T1_CFG); } diff --git a/kernel/arch/x86/kernel/hw_breakpoint.c b/kernel/arch/x86/kernel/hw_breakpoint.c index 7114ba220..50a3fad5b 100644 --- a/kernel/arch/x86/kernel/hw_breakpoint.c +++ b/kernel/arch/x86/kernel/hw_breakpoint.c @@ -32,6 +32,7 @@ #include <linux/irqflags.h> #include <linux/notifier.h> #include <linux/kallsyms.h> +#include <linux/kprobes.h> #include <linux/percpu.h> #include <linux/kdebug.h> #include <linux/kernel.h> @@ -179,7 +180,11 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) va = info->address; len = bp->attr.bp_len; - return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); + /* + * We don't need to worry about va + len - 1 overflowing: + * we already 
require that va is aligned to a multiple of len. + */ + return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); } int arch_bp_generic_fields(int x86_len, int x86_type, @@ -243,6 +248,20 @@ static int arch_build_bp_info(struct perf_event *bp) info->type = X86_BREAKPOINT_RW; break; case HW_BREAKPOINT_X: + /* + * We don't allow kernel breakpoints in places that are not + * acceptable for kprobes. On non-kprobes kernels, we don't + * allow kernel breakpoints at all. + */ + if (bp->attr.bp_addr >= TASK_SIZE_MAX) { +#ifdef CONFIG_KPROBES + if (within_kprobe_blacklist(bp->attr.bp_addr)) + return -EINVAL; +#else + return -EINVAL; +#endif + } + info->type = X86_BREAKPOINT_EXECUTE; /* * x86 inst breakpoints need to have a specific undefined len. @@ -276,8 +295,18 @@ static int arch_build_bp_info(struct perf_event *bp) break; #endif default: + /* AMD range breakpoint */ if (!is_power_of_2(bp->attr.bp_len)) return -EINVAL; + if (bp->attr.bp_addr & (bp->attr.bp_len - 1)) + return -EINVAL; + /* + * It's impossible to use a range breakpoint to fake out + * user vs kernel detection because bp_len - 1 can't + * have the high bit set. If we ever allow range instruction + * breakpoints, then we'll have to check for kprobe-blacklisted + * addresses anywhere in the range. + */ if (!cpu_has_bpext) return -EOPNOTSUPP; info->mask = bp->attr.bp_len - 1; diff --git a/kernel/arch/x86/kernel/i386_ksyms_32.c b/kernel/arch/x86/kernel/i386_ksyms_32.c index 05fd74f53..64341aa48 100644 --- a/kernel/arch/x86/kernel/i386_ksyms_32.c +++ b/kernel/arch/x86/kernel/i386_ksyms_32.c @@ -40,7 +40,5 @@ EXPORT_SYMBOL(empty_zero_page); #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(___preempt_schedule); -#ifdef CONFIG_CONTEXT_TRACKING -EXPORT_SYMBOL(___preempt_schedule_context); -#endif +EXPORT_SYMBOL(___preempt_schedule_notrace); #endif diff --git a/kernel/arch/x86/kernel/i387.c b/kernel/arch/x86/kernel/i387.c deleted file mode 100644 index 6185d3141..000000000 --- a/kernel/arch/x86/kernel/i387.c +++ /dev/null @@ -1,671 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - */ -#include <linux/module.h> -#include <linux/regset.h> -#include <linux/sched.h> -#include <linux/slab.h> - -#include <asm/sigcontext.h> -#include <asm/processor.h> -#include <asm/math_emu.h> -#include <asm/tlbflush.h> -#include <asm/uaccess.h> -#include <asm/ptrace.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> -#include <asm/user.h> - -static DEFINE_PER_CPU(bool, in_kernel_fpu); - -void kernel_fpu_disable(void) -{ - WARN_ON(this_cpu_read(in_kernel_fpu)); - this_cpu_write(in_kernel_fpu, true); -} - -void kernel_fpu_enable(void) -{ - this_cpu_write(in_kernel_fpu, false); -} - -/* - * Were we in an interrupt that interrupted kernel mode? - * - * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: the thread must not have fpu (so - * that we don't try to save the FPU state), and TS must - * be set (so that the clts/stts pair does nothing that is - * visible in the interrupted kernel thread). - * - * Except for the eagerfpu case when we return true; in the likely case - * the thread has FPU but we are not going to set/clear TS. 
- */ -static inline bool interrupted_kernel_fpu_idle(void) -{ - if (this_cpu_read(in_kernel_fpu)) - return false; - - if (use_eager_fpu()) - return true; - - return !__thread_has_fpu(current) && - (read_cr0() & X86_CR0_TS); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static inline bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); -} - -/* - * Can we use the FPU in kernel mode with the - * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. - */ -bool irq_fpu_usable(void) -{ - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); -} -EXPORT_SYMBOL(irq_fpu_usable); - -void __kernel_fpu_begin(void) -{ - struct task_struct *me = current; - - this_cpu_write(in_kernel_fpu, true); - - if (__thread_has_fpu(me)) { - __save_init_fpu(me); - } else { - this_cpu_write(fpu_owner_task, NULL); - if (!use_eager_fpu()) - clts(); - } -} -EXPORT_SYMBOL(__kernel_fpu_begin); - -void __kernel_fpu_end(void) -{ - struct task_struct *me = current; - - if (__thread_has_fpu(me)) { - if (WARN_ON(restore_fpu_checking(me))) - fpu_reset_state(me); - } else if (!use_eager_fpu()) { - stts(); - } - - this_cpu_write(in_kernel_fpu, false); -} -EXPORT_SYMBOL(__kernel_fpu_end); - -void unlazy_fpu(struct task_struct *tsk) -{ - preempt_disable(); - if (__thread_has_fpu(tsk)) { - if (use_eager_fpu()) { - __save_fpu(tsk); - } else { - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - } - } - preempt_enable(); -} -EXPORT_SYMBOL(unlazy_fpu); - -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; -unsigned int xstate_size; -EXPORT_SYMBOL_GPL(xstate_size); -static struct i387_fxsave_struct fx_scratch; - -static void mxcsr_feature_mask_init(void) -{ - unsigned long mask = 0; - - if (cpu_has_fxsr) { - memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : "+m" (fx_scratch)); - mask = fx_scratch.mxcsr_mask; - if (mask == 0) - mask = 0x0000ffbf; - } - mxcsr_feature_mask &= mask; -} - -static void init_thread_xstate(void) -{ - /* - * Note that xstate_size might be overwriten later during - * xsave_init(). - */ - - if (!cpu_has_fpu) { - /* - * Disable xsave as we do not support it if i387 - * emulation is enabled. - */ - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - xstate_size = sizeof(struct i387_soft_struct); - return; - } - - if (cpu_has_fxsr) - xstate_size = sizeof(struct i387_fxsave_struct); - else - xstate_size = sizeof(struct i387_fsave_struct); - - /* - * Quirk: we don't yet handle the XSAVES* instructions - * correctly, as we don't correctly convert between - * standard and compacted format when interfacing - * with user-space - so disable it for now. - * - * The difference is small: with recent CPUs the - * compacted format is only marginally smaller than - * the standard FPU state format. - * - * ( This is easy to backport while we are fixing - * XSAVES* support. ) - */ - setup_clear_cpu_cap(X86_FEATURE_XSAVES); -} - -/* - * Called at bootup to set up the initial FPU state that is later cloned - * into all processes. 
- */ - -void fpu_init(void) -{ - unsigned long cr0; - unsigned long cr4_mask = 0; - -#ifndef CONFIG_MATH_EMULATION - if (!cpu_has_fpu) { - pr_emerg("No FPU found and no math emulation present\n"); - pr_emerg("Giving up\n"); - for (;;) - asm volatile("hlt"); - } -#endif - if (cpu_has_fxsr) - cr4_mask |= X86_CR4_OSFXSR; - if (cpu_has_xmm) - cr4_mask |= X86_CR4_OSXMMEXCPT; - if (cr4_mask) - cr4_set_bits(cr4_mask); - - cr0 = read_cr0(); - cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ - if (!cpu_has_fpu) - cr0 |= X86_CR0_EM; - write_cr0(cr0); - - /* - * init_thread_xstate is only called once to avoid overriding - * xstate_size during boot time or during CPU hotplug. - */ - if (xstate_size == 0) - init_thread_xstate(); - - mxcsr_feature_mask_init(); - xsave_init(); - eager_fpu_init(); -} - -void fpu_finit(struct fpu *fpu) -{ - if (!cpu_has_fpu) { - finit_soft_fpu(&fpu->state->soft); - return; - } - - memset(fpu->state, 0, xstate_size); - - if (cpu_has_fxsr) { - fx_finit(&fpu->state->fxsave); - } else { - struct i387_fsave_struct *fp = &fpu->state->fsave; - fp->cwd = 0xffff037fu; - fp->swd = 0xffff0000u; - fp->twd = 0xffffffffu; - fp->fos = 0xffff0000u; - } -} -EXPORT_SYMBOL_GPL(fpu_finit); - -/* - * The _current_ task is using the FPU for the first time - * so initialize it and set the mxcsr to its default - * value at reset if we support XMM instructions and then - * remember the current task has used the FPU. - */ -int init_fpu(struct task_struct *tsk) -{ - int ret; - - if (tsk_used_math(tsk)) { - if (cpu_has_fpu && tsk == current) - unlazy_fpu(tsk); - task_disable_lazy_fpu_restore(tsk); - return 0; - } - - /* - * Memory allocation at the first usage of the FPU and other state. - */ - ret = fpu_alloc(&tsk->thread.fpu); - if (ret) - return ret; - - fpu_finit(&tsk->thread.fpu); - - set_stopped_child_used_math(tsk); - return 0; -} -EXPORT_SYMBOL_GPL(init_fpu); - -/* - * The xstateregs_active() routine is the same as the fpregs_active() routine, - * as the "regset->n" for the xstate regset will be updated based on the feature - * capabilites supported by the xsave. - */ -int fpregs_active(struct task_struct *target, const struct user_regset *regset) -{ - return tsk_used_math(target) ? regset->n : 0; -} - -int xfpregs_active(struct task_struct *target, const struct user_regset *regset) -{ - return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0; -} - -int xfpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - int ret; - - if (!cpu_has_fxsr) - return -ENODEV; - - ret = init_fpu(target); - if (ret) - return ret; - - sanitize_i387_state(target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fxsave, 0, -1); -} - -int xfpregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret; - - if (!cpu_has_fxsr) - return -ENODEV; - - ret = init_fpu(target); - if (ret) - return ret; - - sanitize_i387_state(target); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fxsave, 0, -1); - - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; - - /* - * update the header bits in the xsave header, indicating the - * presence of FP and SSE state. 
- */ - if (cpu_has_xsave) - target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; - - return ret; -} - -int xstateregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - struct xsave_struct *xsave; - int ret; - - if (!cpu_has_xsave) - return -ENODEV; - - ret = init_fpu(target); - if (ret) - return ret; - - xsave = &target->thread.fpu.state->xsave; - - /* - * Copy the 48bytes defined by the software first into the xstate - * memory layout in the thread struct, so that we can copy the entire - * xstateregs to the user using one user_regset_copyout(). - */ - memcpy(&xsave->i387.sw_reserved, - xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - /* - * Copy the xstate memory layout. - */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - return ret; -} - -int xstateregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - struct xsave_struct *xsave; - int ret; - - if (!cpu_has_xsave) - return -ENODEV; - - ret = init_fpu(target); - if (ret) - return ret; - - xsave = &target->thread.fpu.state->xsave; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->xsave_hdr.xstate_bv &= pcntxt_mask; - /* - * These bits must be zero. - */ - memset(&xsave->xsave_hdr.reserved, 0, 48); - return ret; -} - -#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION - -/* - * FPU tag word conversions. - */ - -static inline unsigned short twd_i387_to_fxsr(unsigned short twd) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. */ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - - return tmp; -} - -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) -#define FP_EXP_TAG_VALID 0 -#define FP_EXP_TAG_ZERO 1 -#define FP_EXP_TAG_SPECIAL 2 -#define FP_EXP_TAG_EMPTY 3 - -static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) -{ - struct _fpxreg *st; - u32 tos = (fxsave->swd >> 11) & 7; - u32 twd = (unsigned long) fxsave->twd; - u32 tag; - u32 ret = 0xffff0000u; - int i; - - for (i = 0; i < 8; i++, twd >>= 1) { - if (twd & 0x1) { - st = FPREG_ADDR(fxsave, (i - tos) & 7); - - switch (st->exponent & 0x7fff) { - case 0x7fff: - tag = FP_EXP_TAG_SPECIAL; - break; - case 0x0000: - if (!st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3]) - tag = FP_EXP_TAG_ZERO; - else - tag = FP_EXP_TAG_SPECIAL; - break; - default: - if (st->significand[3] & 0x8000) - tag = FP_EXP_TAG_VALID; - else - tag = FP_EXP_TAG_SPECIAL; - break; - } - } else { - tag = FP_EXP_TAG_EMPTY; - } - ret |= tag << (2 * i); - } - return ret; -} - -/* - * FXSR floating point environment conversions. 
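The twd_i387_to_fxsr() helper in the hunk above folds the 16-bit i387 tag word (two bits per ST register, 11 meaning empty) into the 8-bit abridged FXSR tag (one bit per register). A self-contained worked example of the same bit cascade, using a made-up tag value:

#include <stdio.h>

/* Same cascade as twd_i387_to_fxsr() above: compress 2 bits per register
 * (11 = empty) down to 1 bit per register (1 = not empty). */
static unsigned short fold_tag_word(unsigned short twd)
{
	unsigned int tmp = ~twd;

	tmp = (tmp | (tmp >> 1)) & 0x5555;	/* 0V0V0V0V0V0V0V0V */
	tmp = (tmp | (tmp >> 1)) & 0x3333;	/* 00VV00VV00VV00VV */
	tmp = (tmp | (tmp >> 2)) & 0x0f0f;	/* 0000VVVV0000VVVV */
	tmp = (tmp | (tmp >> 4)) & 0x00ff;	/* 00000000VVVVVVVV */
	return tmp;
}

int main(void)
{
	/* ST0 and ST1 in use (tags 00), the remaining six registers empty (11). */
	unsigned short twd = 0xfff0;

	printf("0x%04x -> 0x%02x\n", twd, fold_tag_word(twd));	/* prints 0xfff0 -> 0x03 */
	return 0;
}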
- */ - -void -convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) -{ - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; - struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; - struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; - int i; - - env->cwd = fxsave->cwd | 0xffff0000u; - env->swd = fxsave->swd | 0xffff0000u; - env->twd = twd_fxsr_to_i387(fxsave); - -#ifdef CONFIG_X86_64 - env->fip = fxsave->rip; - env->foo = fxsave->rdp; - /* - * should be actually ds/cs at fpu exception time, but - * that information is not available in 64bit mode. - */ - env->fcs = task_pt_regs(tsk)->cs; - if (tsk == current) { - savesegment(ds, env->fos); - } else { - env->fos = tsk->thread.ds; - } - env->fos |= 0xffff0000; -#else - env->fip = fxsave->fip; - env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); - env->foo = fxsave->foo; - env->fos = fxsave->fos; -#endif - - for (i = 0; i < 8; ++i) - memcpy(&to[i], &from[i], sizeof(to[0])); -} - -void convert_to_fxsr(struct task_struct *tsk, - const struct user_i387_ia32_struct *env) - -{ - struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; - struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; - struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; - int i; - - fxsave->cwd = env->cwd; - fxsave->swd = env->swd; - fxsave->twd = twd_i387_to_fxsr(env->twd); - fxsave->fop = (u16) ((u32) env->fcs >> 16); -#ifdef CONFIG_X86_64 - fxsave->rip = env->fip; - fxsave->rdp = env->foo; - /* cs and ds ignored */ -#else - fxsave->fip = env->fip; - fxsave->fcs = (env->fcs & 0xffff); - fxsave->foo = env->foo; - fxsave->fos = env->fos; -#endif - - for (i = 0; i < 8; ++i) - memcpy(&to[i], &from[i], sizeof(from[0])); -} - -int fpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) -{ - struct user_i387_ia32_struct env; - int ret; - - ret = init_fpu(target); - if (ret) - return ret; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - - if (!cpu_has_fxsr) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fsave, 0, - -1); - - sanitize_i387_state(target); - - if (kbuf && pos == 0 && count == sizeof(env)) { - convert_from_fxsr(kbuf, target); - return 0; - } - - convert_from_fxsr(&env, target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); -} - -int fpregs_set(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - struct user_i387_ia32_struct env; - int ret; - - ret = init_fpu(target); - if (ret) - return ret; - - sanitize_i387_state(target); - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - - if (!cpu_has_fxsr) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->fsave, 0, - -1); - - if (pos > 0 || count < sizeof(env)) - convert_from_fxsr(&env, target); - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); - if (!ret) - convert_to_fxsr(target, &env); - - /* - * update the header bit in the xsave header, indicating the - * presence of FP. - */ - if (cpu_has_xsave) - target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; - return ret; -} - -/* - * FPU state for core dumps. - * This is only used for a.out dumps now. 
- * It is declared generically using elf_fpregset_t (which is - * struct user_i387_struct) but is in fact only used for 32-bit - * dumps, so on 64-bit it is really struct user_i387_ia32_struct. - */ -int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) -{ - struct task_struct *tsk = current; - int fpvalid; - - fpvalid = !!used_math(); - if (fpvalid) - fpvalid = !fpregs_get(tsk, NULL, - 0, sizeof(struct user_i387_ia32_struct), - fpu, NULL); - - return fpvalid; -} -EXPORT_SYMBOL(dump_fpu); - -#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ - -static int __init no_387(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FPU); - return 1; -} - -__setup("no387", no_387); - -void fpu_detect(struct cpuinfo_x86 *c) -{ - unsigned long cr0; - u16 fsw, fcw; - - fsw = fcw = 0xffff; - - cr0 = read_cr0(); - cr0 &= ~(X86_CR0_TS | X86_CR0_EM); - write_cr0(cr0); - - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); - - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); - else - clear_cpu_cap(c, X86_FEATURE_FPU); - - /* The final cr0 value is set in fpu_init() */ -} diff --git a/kernel/arch/x86/kernel/i8253.c b/kernel/arch/x86/kernel/i8253.c index f2b96de3c..efb82f07b 100644 --- a/kernel/arch/x86/kernel/i8253.c +++ b/kernel/arch/x86/kernel/i8253.c @@ -34,7 +34,7 @@ static int __init init_pit_clocksource(void) * - when local APIC timer is active (PIT is switched off) */ if (num_possible_cpus() > 1 || is_hpet_enabled() || - i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) + !clockevent_state_periodic(&i8253_clockevent)) return 0; return clocksource_i8253_init(); diff --git a/kernel/arch/x86/kernel/i8259.c b/kernel/arch/x86/kernel/i8259.c index e7cc5370c..be22f5a21 100644 --- a/kernel/arch/x86/kernel/i8259.c +++ b/kernel/arch/x86/kernel/i8259.c @@ -295,16 +295,11 @@ static void unmask_8259A(void) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -static void init_8259A(int auto_eoi) +static int probe_8259A(void) { unsigned long flags; unsigned char probe_val = ~(1 << PIC_CASCADE_IR); unsigned char new_val; - - i8259A_auto_eoi = auto_eoi; - - raw_spin_lock_irqsave(&i8259A_lock, flags); - /* * Check to see if we have a PIC. * Mask all except the cascade and read @@ -312,16 +307,28 @@ static void init_8259A(int auto_eoi) * have a PIC, we will read 0xff as opposed to the * value we wrote. 
*/ + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ outb(probe_val, PIC_MASTER_IMR); new_val = inb(PIC_MASTER_IMR); if (new_val != probe_val) { printk(KERN_INFO "Using NULL legacy PIC\n"); legacy_pic = &null_legacy_pic; - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - return; } + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return nr_legacy_irqs(); +} + +static void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ /* @@ -329,8 +336,8 @@ static void init_8259A(int auto_eoi) */ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */ + outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); @@ -342,8 +349,8 @@ static void init_8259A(int auto_eoi) outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */ - outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */ + outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ @@ -379,6 +386,10 @@ static int legacy_pic_irq_pending_noop(unsigned int irq) { return 0; } +static int legacy_pic_probe(void) +{ + return 0; +} struct legacy_pic null_legacy_pic = { .nr_legacy_irqs = 0, @@ -388,6 +399,7 @@ struct legacy_pic null_legacy_pic = { .mask_all = legacy_pic_noop, .restore_mask = legacy_pic_noop, .init = legacy_pic_int_noop, + .probe = legacy_pic_probe, .irq_pending = legacy_pic_irq_pending_noop, .make_irq = legacy_pic_uint_noop, }; @@ -400,6 +412,7 @@ struct legacy_pic default_legacy_pic = { .mask_all = mask_8259A, .restore_mask = unmask_8259A, .init = init_8259A, + .probe = probe_8259A, .irq_pending = i8259A_irq_pending, .make_irq = make_8259A_irq, }; diff --git a/kernel/arch/x86/kernel/irq.c b/kernel/arch/x86/kernel/irq.c index e5952c225..61521dc19 100644 --- a/kernel/arch/x86/kernel/irq.c +++ b/kernel/arch/x86/kernel/irq.c @@ -22,6 +22,12 @@ #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ @@ -116,6 +122,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_puts(p, " Threshold APIC interrupts\n"); #endif +#ifdef CONFIG_X86_MCE_AMD + seq_printf(p, "%*s: ", prec, "DFR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count); + seq_puts(p, " Deferred Error APIC interrupts\n"); +#endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) @@ -127,15 +139,30 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Machine check polls\n"); #endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) - seq_printf(p, "%*s: ", prec, "HYP"); - for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); - seq_puts(p, " Hypervisor callback interrupts\n"); + if 
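The i8259 hunk above splits the PIC-detection logic out of init_8259A() into a new probe_8259A() and exposes it through a new .probe method on struct legacy_pic (returning nr_legacy_irqs(), or 0 for the null PIC). The call site is not part of this hunk; a hedged, hypothetical sketch of how a platform-setup path could consult it:

/* Hypothetical caller -- the function name and surrounding logic are
 * assumptions; only legacy_pic->probe() comes from the hunk above. */
static int hypothetical_isa_irq_setup(void)
{
	int nr = legacy_pic->probe();	/* 0 when no 8259A was detected */

	if (!nr)
		return 0;		/* null PIC: nothing to wire up */

	/* ... reserve/initialize the first `nr` legacy ISA interrupts ... */
	return nr;
}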
(test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) { + seq_printf(p, "%*s: ", prec, "HYP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->irq_hv_callback_count); + seq_puts(p, " Hypervisor callback interrupts\n"); + } #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); #endif +#ifdef CONFIG_HAVE_KVM + seq_printf(p, "%*s: ", prec, "PIN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); + seq_puts(p, " Posted-interrupt notification event\n"); + + seq_printf(p, "%*s: ", prec, "PIW"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_wakeup_ipis); + seq_puts(p, " Posted-interrupt wakeup event\n"); +#endif return 0; } @@ -187,29 +214,42 @@ u64 arch_irq_stat(void) __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - + struct irq_desc * desc; /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; - unsigned irq; - irq_enter(); - exit_idle(); + /* + * NB: Unlike exception entries, IRQ entries do not reliably + * handle context tracking in the low-level entry code. This is + * because syscall entries execute briefly with IRQs on before + * updating context tracking state, so we can take an IRQ from + * kernel mode with CONTEXT_USER. The low-level entry code only + * updates the context if we came from user mode, so we won't + * switch to CONTEXT_KERNEL. We'll fix that once the syscall + * code is cleaned up enough that we can cleanly defer enabling + * IRQs. + */ - irq = __this_cpu_read(vector_irq[vector]); + entering_irq(); - if (!handle_irq(irq, regs)) { + /* entering_irq() tells RCU that we're not quiescent. Check it. */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + + desc = __this_cpu_read(vector_irq[vector]); + + if (!handle_irq(desc, regs)) { ack_APIC_irq(); - if (irq != VECTOR_RETRIGGERED) { - pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n", + if (desc != VECTOR_RETRIGGERED) { + pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n", __func__, smp_processor_id(), - vector, irq); + vector); } else { - __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); } } - irq_exit(); + exiting_irq(); set_irq_regs(old_regs); return 1; @@ -237,6 +277,18 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs) } #ifdef CONFIG_HAVE_KVM +static void dummy_handler(void) {} +static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; + +void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) +{ + if (handler) + kvm_posted_intr_wakeup_handler = handler; + else + kvm_posted_intr_wakeup_handler = dummy_handler; +} +EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); + /* * Handler for POSTED_INTERRUPT_VECTOR. */ @@ -244,16 +296,23 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); - - irq_enter(); - - exit_idle(); - + entering_ack_irq(); inc_irq_stat(kvm_posted_intr_ipis); + exiting_irq(); + set_irq_regs(old_regs); +} - irq_exit(); +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 
+ */ +__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + entering_ack_irq(); + inc_irq_stat(kvm_posted_intr_wakeup_ipis); + kvm_posted_intr_wakeup_handler(); + exiting_irq(); set_irq_regs(old_regs); } #endif @@ -288,10 +347,10 @@ static struct cpumask affinity_new, online_new; */ int check_irq_vectors_for_cpu_disable(void) { - int irq, cpu; unsigned int this_cpu, vector, this_count, count; struct irq_desc *desc; struct irq_data *data; + int cpu; this_cpu = smp_processor_id(); cpumask_copy(&online_new, cpu_online_mask); @@ -299,39 +358,43 @@ int check_irq_vectors_for_cpu_disable(void) this_count = 0; for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - irq = __this_cpu_read(vector_irq[vector]); - if (irq >= 0) { - desc = irq_to_desc(irq); - if (!desc) - continue; + desc = __this_cpu_read(vector_irq[vector]); + if (IS_ERR_OR_NULL(desc)) + continue; + /* + * Protect against concurrent action removal, affinity + * changes etc. + */ + raw_spin_lock(&desc->lock); + data = irq_desc_get_irq_data(desc); + cpumask_copy(&affinity_new, + irq_data_get_affinity_mask(data)); + cpumask_clear_cpu(this_cpu, &affinity_new); - data = irq_desc_get_irq_data(desc); - cpumask_copy(&affinity_new, data->affinity); - cpumask_clear_cpu(this_cpu, &affinity_new); - - /* Do not count inactive or per-cpu irqs. */ - if (!irq_has_action(irq) || irqd_is_per_cpu(data)) - continue; - - /* - * A single irq may be mapped to multiple - * cpu's vector_irq[] (for example IOAPIC cluster - * mode). In this case we have two - * possibilities: - * - * 1) the resulting affinity mask is empty; that is - * this the down'd cpu is the last cpu in the irq's - * affinity mask, or - * - * 2) the resulting affinity mask is no longer - * a subset of the online cpus but the affinity - * mask is not zero; that is the down'd cpu is the - * last online cpu in a user set affinity mask. - */ - if (cpumask_empty(&affinity_new) || - !cpumask_subset(&affinity_new, &online_new)) - this_count++; + /* Do not count inactive or per-cpu irqs. */ + if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) { + raw_spin_unlock(&desc->lock); + continue; } + + raw_spin_unlock(&desc->lock); + /* + * A single irq may be mapped to multiple cpu's + * vector_irq[] (for example IOAPIC cluster mode). In + * this case we have two possibilities: + * + * 1) the resulting affinity mask is empty; that is + * this the down'd cpu is the last cpu in the irq's + * affinity mask, or + * + * 2) the resulting affinity mask is no longer a + * subset of the online cpus but the affinity mask is + * not zero; that is the down'd cpu is the last online + * cpu in a user set affinity mask. + */ + if (cpumask_empty(&affinity_new) || + !cpumask_subset(&affinity_new, &online_new)) + this_count++; } count = 0; @@ -343,12 +406,15 @@ int check_irq_vectors_for_cpu_disable(void) * vector. If the vector is marked in the used vectors * bitmap or an irq is assigned to it, we don't count * it as available. + * + * As this is an inaccurate snapshot anyway, we can do + * this w/o holding vector_lock. 
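In the do_IRQ() and CPU-unplug hunks above, vector_irq[] now holds struct irq_desc pointers, with the old integer sentinels re-encoded as NULL/error-style pointer values (VECTOR_UNUSED, VECTOR_RETRIGGERED) so a single IS_ERR_OR_NULL() test filters them out. A small user-space imitation of that encoding; the sentinel values and the helper below are illustrative, not the kernel's exact definitions:

#include <stdio.h>

/* Imitation of the ERR_PTR trick: real pointers and a few sentinel values
 * share one array slot, and one predicate filters the sentinels out. */
#define MAX_ERRNO		4095
#define VECTOR_UNUSED		((void *)0)
#define VECTOR_RETRIGGERED	((void *)(-2L))	/* value assumed for the demo */

static int is_err_or_null(const void *p)
{
	return !p || (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	int real_desc = 42;	/* stands in for a struct irq_desc */
	void *slots[3] = { VECTOR_UNUSED, VECTOR_RETRIGGERED, &real_desc };

	for (int i = 0; i < 3; i++)
		printf("slot %d: %s\n", i,
		       is_err_or_null(slots[i]) ? "sentinel" : "descriptor");
	return 0;
}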
*/ for (vector = FIRST_EXTERNAL_VECTOR; vector < first_system_vector; vector++) { if (!test_bit(vector, used_vectors) && - per_cpu(vector_irq, cpu)[vector] < 0) - count++; + IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) + count++; } } @@ -384,7 +450,7 @@ void fixup_irqs(void) raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); - affinity = data->affinity; + affinity = irq_data_get_affinity_mask(data); if (!irq_has_action(irq) || irqd_is_per_cpu(data) || cpumask_subset(affinity, cpu_online_mask)) { raw_spin_unlock(&desc->lock); @@ -396,7 +462,7 @@ void fixup_irqs(void) * non intr-remapping case, we can't wait till this interrupt * arrives at this cpu before completing the irq move. */ - irq_force_complete_move(irq); + irq_force_complete_move(desc); if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; @@ -404,6 +470,15 @@ void fixup_irqs(void) } chip = irq_data_get_irq_chip(data); + /* + * The interrupt descriptor might have been cleaned up + * already, but it is not yet removed from the radix tree + */ + if (!chip) { + raw_spin_unlock(&desc->lock); + continue; + } + if (!irqd_can_move_in_process_context(data) && chip->irq_mask) chip->irq_mask(data); @@ -444,20 +519,24 @@ void fixup_irqs(void) */ mdelay(1); + /* + * We can walk the vector array of this cpu without holding + * vector_lock because the cpu is already marked !online, so + * nothing else will touch it. + */ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irr; - if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED) + if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); if (irr & (1 << (vector % 32))) { - irq = __this_cpu_read(vector_irq[vector]); + desc = __this_cpu_read(vector_irq[vector]); - desc = irq_to_desc(irq); + raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); chip = irq_data_get_irq_chip(data); - raw_spin_lock(&desc->lock); if (chip->irq_retrigger) { chip->irq_retrigger(data); __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); @@ -465,7 +544,7 @@ void fixup_irqs(void) raw_spin_unlock(&desc->lock); } if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) - __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); } } #endif diff --git a/kernel/arch/x86/kernel/irq_32.c b/kernel/arch/x86/kernel/irq_32.c index 521ef3cc8..ce71f7098 100644 --- a/kernel/arch/x86/kernel/irq_32.c +++ b/kernel/arch/x86/kernel/irq_32.c @@ -21,12 +21,6 @@ #include <asm/apic.h> -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - #ifdef CONFIG_DEBUG_STACKOVERFLOW int sysctl_panic_on_stackoverflow __read_mostly; @@ -74,11 +68,10 @@ static inline void *current_stack(void) return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); } -static inline int -execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) +static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) { struct irq_stack *curstk, *irqstk; - u32 *isp, *prev_esp, arg1, arg2; + u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); irqstk = __this_cpu_read(hardirq_stack); @@ -104,8 +97,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) asm volatile("xchgl %%ebx,%%esp \n" "call *%%edi \n" "movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (isp) - : "0" (irq), "1" 
(desc), "2" (isp), + : "=a" (arg1), "=b" (isp) + : "0" (desc), "1" (isp), "D" (desc->handle_irq) : "memory", "cc", "ecx"); return 1; @@ -156,21 +149,17 @@ void do_softirq_own_stack(void) } #endif -bool handle_irq(unsigned irq, struct pt_regs *regs) +bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { - struct irq_desc *desc; - int overflow; - - overflow = check_stack_overflow(); + int overflow = check_stack_overflow(); - desc = irq_to_desc(irq); - if (unlikely(!desc)) + if (IS_ERR_OR_NULL(desc)) return false; - if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) { + if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) { if (unlikely(overflow)) print_stack_overflow(); - desc->handle_irq(irq, desc); + generic_handle_irq_desc(desc); } return true; diff --git a/kernel/arch/x86/kernel/irq_64.c b/kernel/arch/x86/kernel/irq_64.c index 394e643d7..206d0b90a 100644 --- a/kernel/arch/x86/kernel/irq_64.c +++ b/kernel/arch/x86/kernel/irq_64.c @@ -20,12 +20,6 @@ #include <asm/idle.h> #include <asm/apic.h> -DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); -EXPORT_PER_CPU_SYMBOL(irq_stat); - -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - int sysctl_panic_on_stackoverflow; /* @@ -74,16 +68,13 @@ static inline void stack_overflow_check(struct pt_regs *regs) #endif } -bool handle_irq(unsigned irq, struct pt_regs *regs) +bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { - struct irq_desc *desc; - stack_overflow_check(regs); - desc = irq_to_desc(irq); - if (unlikely(!desc)) + if (IS_ERR_OR_NULL(desc)) return false; - generic_handle_irq_desc(irq, desc); + generic_handle_irq_desc(desc); return true; } diff --git a/kernel/arch/x86/kernel/irq_work.c b/kernel/arch/x86/kernel/irq_work.c index 15d741ddf..3512ba607 100644 --- a/kernel/arch/x86/kernel/irq_work.c +++ b/kernel/arch/x86/kernel/irq_work.c @@ -1,7 +1,7 @@ /* * x86 specific code for irq_work * - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> @@ -10,12 +10,6 @@ #include <asm/apic.h> #include <asm/trace/irq_vectors.h> -static inline void irq_work_entering_irq(void) -{ - irq_enter(); - ack_APIC_irq(); -} - static inline void __smp_irq_work_interrupt(void) { inc_irq_stat(apic_irq_work_irqs); @@ -24,14 +18,14 @@ static inline void __smp_irq_work_interrupt(void) __visible void smp_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); __smp_irq_work_interrupt(); exiting_irq(); } __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) { - irq_work_entering_irq(); + ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); __smp_irq_work_interrupt(); trace_irq_work_exit(IRQ_WORK_VECTOR); diff --git a/kernel/arch/x86/kernel/irqinit.c b/kernel/arch/x86/kernel/irqinit.c index cd10a6437..1423ab1b0 100644 --- a/kernel/arch/x86/kernel/irqinit.c +++ b/kernel/arch/x86/kernel/irqinit.c @@ -52,7 +52,7 @@ static struct irqaction irq2 = { }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED, + [0 ... 
NR_VECTORS - 1] = VECTOR_UNUSED, }; int vector_used_by_percpu_irq(unsigned int vector) @@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector) int cpu; for_each_online_cpu(cpu) { - if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED) + if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) return 1; } @@ -86,7 +86,7 @@ void __init init_IRQ(void) int i; /* - * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. + * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If * these IRQ's are handled by more mordern controllers like IO-APIC, @@ -94,7 +94,7 @@ void __init init_IRQ(void) * irq's migrate etc. */ for (i = 0; i < nr_legacy_irqs(); i++) - per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; + per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); x86_init.irqs.intr_init(); } @@ -135,6 +135,10 @@ static void __init apic_intr_init(void) alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif +#ifdef CONFIG_X86_MCE_AMD + alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); +#endif + #ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -144,6 +148,8 @@ static void __init apic_intr_init(void) #ifdef CONFIG_HAVE_KVM /* IPI for KVM to deliver posted interrupt */ alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); + /* IPI for KVM to deliver interrupt to wake up tasks */ + alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); #endif /* IPI vectors for APIC spurious and error interrupts */ diff --git a/kernel/arch/x86/kernel/jump_label.c b/kernel/arch/x86/kernel/jump_label.c index 26d5a55a2..e565e0e4d 100644 --- a/kernel/arch/x86/kernel/jump_label.c +++ b/kernel/arch/x86/kernel/jump_label.c @@ -45,7 +45,7 @@ static void __jump_label_transform(struct jump_entry *entry, const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - if (type == JUMP_LABEL_ENABLE) { + if (type == JUMP_LABEL_JMP) { if (init) { /* * Jump label is enabled for the first time. 
diff --git a/kernel/arch/x86/kernel/kexec-bzimage64.c b/kernel/arch/x86/kernel/kexec-bzimage64.c index ca05f8648..0f8a6bbaa 100644 --- a/kernel/arch/x86/kernel/kexec-bzimage64.c +++ b/kernel/arch/x86/kernel/kexec-bzimage64.c @@ -72,15 +72,16 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params, unsigned long cmdline_len) { char *cmdline_ptr = ((char *)params) + cmdline_offset; - unsigned long cmdline_ptr_phys, len; + unsigned long cmdline_ptr_phys, len = 0; uint32_t cmdline_low_32, cmdline_ext_32; - memcpy(cmdline_ptr, cmdline, cmdline_len); if (image->type == KEXEC_TYPE_CRASH) { - len = sprintf(cmdline_ptr + cmdline_len - 1, - " elfcorehdr=0x%lx", image->arch.elf_load_addr); - cmdline_len += len; + len = sprintf(cmdline_ptr, + "elfcorehdr=0x%lx ", image->arch.elf_load_addr); } + memcpy(cmdline_ptr + len, cmdline, cmdline_len); + cmdline_len += len; + cmdline_ptr[cmdline_len - 1] = '\0'; pr_debug("Final command line is: %s\n", cmdline_ptr); @@ -222,9 +223,6 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); - /* Default sysdesc table */ - params->sys_desc_table.length = 0; - if (image->type == KEXEC_TYPE_CRASH) { ret = crash_setup_memmap_entries(image, params); if (ret) @@ -535,7 +533,9 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) int ret; ret = verify_pefile_signature(kernel, kernel_len, - system_trusted_keyring, &trusted); + system_trusted_keyring, + VERIFYING_KEXEC_PE_SIGNATURE, + &trusted); if (ret < 0) return ret; if (!trusted) diff --git a/kernel/arch/x86/kernel/kgdb.c b/kernel/arch/x86/kernel/kgdb.c index d6178d979..44256a627 100644 --- a/kernel/arch/x86/kernel/kgdb.c +++ b/kernel/arch/x86/kernel/kgdb.c @@ -511,26 +511,31 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) return NOTIFY_STOP; } -static int was_in_debug_nmi[NR_CPUS]; +static DECLARE_BITMAP(was_in_debug_nmi, NR_CPUS); static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) { + int cpu; + switch (cmd) { case NMI_LOCAL: if (atomic_read(&kgdb_active) != -1) { /* KGDB CPU roundup */ - kgdb_nmicallback(raw_smp_processor_id(), regs); - was_in_debug_nmi[raw_smp_processor_id()] = 1; + cpu = raw_smp_processor_id(); + kgdb_nmicallback(cpu, regs); + set_bit(cpu, was_in_debug_nmi); touch_nmi_watchdog(); + return NMI_HANDLED; } break; case NMI_UNKNOWN: - if (was_in_debug_nmi[raw_smp_processor_id()]) { - was_in_debug_nmi[raw_smp_processor_id()] = 0; + cpu = raw_smp_processor_id(); + + if (__test_and_clear_bit(cpu, was_in_debug_nmi)) return NMI_HANDLED; - } + break; default: /* do nothing */ diff --git a/kernel/arch/x86/kernel/kvm.c b/kernel/arch/x86/kernel/kvm.c index 943562006..807950860 100644 --- a/kernel/arch/x86/kernel/kvm.c +++ b/kernel/arch/x86/kernel/kvm.c @@ -36,6 +36,7 @@ #include <linux/kprobes.h> #include <linux/debugfs.h> #include <linux/nmi.h> +#include <linux/swait.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -91,14 +92,14 @@ static void kvm_io_delay(void) struct kvm_task_sleep_node { struct hlist_node link; - wait_queue_head_t wq; + struct swait_queue_head wq; u32 token; int cpu; bool halted; }; static struct kvm_task_sleep_head { - spinlock_t lock; + raw_spinlock_t lock; struct hlist_head list; } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; @@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token) u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b 
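The kexec-bzimage64 hunk below reorders setup_cmdline() so the crash kernel's elfcorehdr= argument is written first, the user command line is copied in after it, and the buffer is explicitly NUL-terminated. A self-contained worked example of the resulting string assembly, with a made-up load address and command line:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[256];
	const char *user_cmdline = "console=ttyS0 root=/dev/sda1";
	unsigned long cmdline_len = strlen(user_cmdline) + 1;	/* incl. NUL */
	unsigned long elf_load_addr = 0x13000000;
	unsigned long len;

	/* elfcorehdr= goes first, exactly as in the reworked setup_cmdline() */
	len = sprintf(buf, "elfcorehdr=0x%lx ", elf_load_addr);
	memcpy(buf + len, user_cmdline, cmdline_len);
	cmdline_len += len;
	buf[cmdline_len - 1] = '\0';

	printf("Final command line is: %s\n", buf);
	return 0;
}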
= &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; - DEFINE_WAIT(wait); + DECLARE_SWAITQUEUE(wait); rcu_irq_enter(); - spin_lock(&b->lock); + raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); kfree(e); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); rcu_irq_exit(); return; @@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); n.halted = is_idle_task(current) || preempt_count() > 1; - init_waitqueue_head(&n.wq); + init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); for (;;) { if (!n.halted) - prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token) } } if (!n.halted) - finish_wait(&n.wq, &wait); + finish_swait(&n.wq, &wait); rcu_irq_exit(); return; @@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) hlist_del_init(&n->link); if (n->halted) smp_send_reschedule(n->cpu); - else if (waitqueue_active(&n->wq)) - wake_up(&n->wq); + else if (swait_active(&n->wq)) + swake_up(&n->wq); } static void apf_task_wake_all(void) @@ -189,14 +190,14 @@ static void apf_task_wake_all(void) for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; - spin_lock(&b->lock); + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { struct kvm_task_sleep_node *n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); } } @@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token) } again: - spin_lock(&b->lock); + raw_spin_lock(&b->lock); n = _find_apf_task(b, token); if (!n) { /* @@ -225,17 +226,17 @@ again: * Allocation failed! Busy wait while other cpu * handles async PF. */ - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); cpu_relax(); goto again; } n->token = token; n->cpu = smp_processor_id(); - init_waitqueue_head(&n->wq); + init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); } else apf_task_wake_one(n); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); return; } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); @@ -331,7 +332,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) apic_write(APIC_EOI, APIC_EOI_ACK); } -void kvm_guest_cpu_init(void) +static void kvm_guest_cpu_init(void) { if (!kvm_para_available()) return; @@ -486,7 +487,7 @@ void __init kvm_guest_init(void) paravirt_ops_setup(); register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) - spin_lock_init(&async_pf_sleepers[i].lock); + raw_spin_lock_init(&async_pf_sleepers[i].lock); if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) x86_init.irqs.trap_init = kvm_apf_trap_init; @@ -584,6 +585,39 @@ static void kvm_kick_cpu(int cpu) kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); } + +#ifdef CONFIG_QUEUED_SPINLOCKS + +#include <asm/qspinlock.h> + +static void kvm_wait(u8 *ptr, u8 val) +{ + unsigned long flags; + + if (in_nmi()) + return; + + local_irq_save(flags); + + if (READ_ONCE(*ptr) != val) + goto out; + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. 
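The kvm.c hunk above converts the async page-fault sleepers from wait_queue_head_t/spinlock_t to the simple wait queues and raw spinlocks that are safe on PREEMPT_RT. A minimal kernel-context sketch of the swait sleeper/waker idiom the code now follows, using only the calls that appear in the hunk; it assumes <linux/swait.h> and is not part of the patch:

/* Sketch only -- mirrors the prepare_to_swait()/swake_up() pairing above. */
static struct swait_queue_head demo_wq;
static bool demo_done;

static void demo_init(void)
{
	init_swait_queue_head(&demo_wq);
}

static void demo_sleeper(void)
{
	DECLARE_SWAITQUEUE(wait);

	for (;;) {
		prepare_to_swait(&demo_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(demo_done))
			break;
		schedule();
	}
	finish_swait(&demo_wq, &wait);
}

static void demo_waker(void)
{
	WRITE_ONCE(demo_done, true);
	if (swait_active(&demo_wq))
		swake_up(&demo_wq);
}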
+ */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + local_irq_restore(flags); +} + +#else /* !CONFIG_QUEUED_SPINLOCKS */ + enum kvm_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -655,7 +689,7 @@ static inline void spin_time_accum_blocked(u64 start) static struct dentry *d_spin_debug; static struct dentry *d_kvm_debug; -struct dentry *kvm_init_debugfs(void) +static struct dentry *kvm_init_debugfs(void) { d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); if (!d_kvm_debug) @@ -817,6 +851,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) } } +#endif /* !CONFIG_QUEUED_SPINLOCKS */ + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -828,8 +864,16 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; +#ifdef CONFIG_QUEUED_SPINLOCKS + __pv_init_lock_hash(); + pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_lock_ops.wait = kvm_wait; + pv_lock_ops.kick = kvm_kick_cpu; +#else /* !CONFIG_QUEUED_SPINLOCKS */ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); pv_lock_ops.unlock_kick = kvm_unlock_kick; +#endif } static __init int kvm_spinlock_init_jump(void) diff --git a/kernel/arch/x86/kernel/kvmclock.c b/kernel/arch/x86/kernel/kvmclock.c index 42caaef89..2bd81e302 100644 --- a/kernel/arch/x86/kernel/kvmclock.c +++ b/kernel/arch/x86/kernel/kvmclock.c @@ -24,6 +24,7 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/memblock.h> +#include <linux/sched.h> #include <asm/x86_init.h> #include <asm/reboot.h> @@ -31,6 +32,7 @@ static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; +static cycle_t kvm_sched_clock_offset; static int parse_no_kvmclock(char *arg) { @@ -91,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) return kvm_clock_read(); } +static cycle_t kvm_sched_clock_read(void) +{ + return kvm_clock_read() - kvm_sched_clock_offset; +} + +static inline void kvm_sched_clock_init(bool stable) +{ + if (!stable) { + pv_time_ops.sched_clock = kvm_clock_read; + return; + } + + kvm_sched_clock_offset = kvm_clock_read(); + pv_time_ops.sched_clock = kvm_sched_clock_read; + set_sched_clock_stable(); + + printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", + kvm_sched_clock_offset); + + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); +} + /* * If we don't do that, there is the possibility that the guest * will calibrate under heavy load - thus, getting a lower lpj - @@ -199,7 +224,7 @@ static void kvm_setup_secondary_clock(void) * kind of shutdown from our side, we unregister the clock by writting anything * that does not have the 'enable' bit set in the msr */ -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE static void kvm_crash_shutdown(struct pt_regs *regs) { native_write_msr(msr_kvm_system_time, 0, 0); @@ -217,8 +242,10 @@ static void kvm_shutdown(void) void __init kvmclock_init(void) { + struct pvclock_vcpu_time_info *vcpu_time; unsigned long mem; - int size; + int size, cpu; + u8 flags; size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); @@ -245,7 +272,17 @@ void __init kvmclock_init(void) memblock_free(mem, size); return; } - pv_time_ops.sched_clock = kvm_clock_read; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + 
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + cpu = get_cpu(); + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); + put_cpu(); + x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; @@ -256,15 +293,12 @@ void __init kvmclock_init(void) x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; machine_ops.shutdown = kvm_shutdown; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE machine_ops.crash_shutdown = kvm_crash_shutdown; #endif kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; - - if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); } int __init kvm_setup_vsyscall_timeinfo(void) diff --git a/kernel/arch/x86/kernel/ldt.c b/kernel/arch/x86/kernel/ldt.c index 2bcc0525f..6acc9dd91 100644 --- a/kernel/arch/x86/kernel/ldt.c +++ b/kernel/arch/x86/kernel/ldt.c @@ -58,7 +58,7 @@ static struct ldt_struct *alloc_ldt_struct(int size) if (alloc_size > PAGE_SIZE) new_ldt->entries = vzalloc(alloc_size); else - new_ldt->entries = kzalloc(PAGE_SIZE, GFP_KERNEL); + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); if (!new_ldt->entries) { kfree(new_ldt); @@ -95,7 +95,7 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(ldt->entries); else - kfree(ldt->entries); + free_page((unsigned long)ldt->entries); kfree(ldt); } diff --git a/kernel/arch/x86/kernel/livepatch.c b/kernel/arch/x86/kernel/livepatch.c index ff3c3101d..d1d35ccff 100644 --- a/kernel/arch/x86/kernel/livepatch.c +++ b/kernel/arch/x86/kernel/livepatch.c @@ -42,7 +42,6 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, bool readonly; unsigned long val; unsigned long core = (unsigned long)mod->module_core; - unsigned long core_ro_size = mod->core_ro_size; unsigned long core_size = mod->core_size; switch (type) { @@ -70,10 +69,12 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, /* loc does not point to any symbol inside the module */ return -EINVAL; - if (loc < core + core_ro_size) + readonly = false; + +#ifdef CONFIG_DEBUG_SET_MODULE_RONX + if (loc < core + mod->core_ro_size) readonly = true; - else - readonly = false; +#endif /* determine if the relocation spans a page boundary */ numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 
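kvm_sched_clock_init() in the kvmclock hunk above samples the paravirtual clock once at boot and subtracts that value from every later kvm_sched_clock_read(). A worked example with made-up numbers, presumably showing why the guest's sched_clock() then counts up from roughly zero and can be marked stable:

/* Made-up numbers, illustrating only the subtraction:
 *
 *   kvm_clock_read() at init     = 5000000000   -> kvm_sched_clock_offset
 *   kvm_clock_read() a bit later = 5001000000
 *   kvm_sched_clock_read()       = 5001000000 - 5000000000 = 1000000
 *
 * i.e. guest scheduler timestamps start near zero at boot instead of at
 * whatever value the host clock already reached. */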
1 : 2; diff --git a/kernel/arch/x86/kernel/machine_kexec_64.c b/kernel/arch/x86/kernel/machine_kexec_64.c index 415480d3e..819ab3f9c 100644 --- a/kernel/arch/x86/kernel/machine_kexec_64.c +++ b/kernel/arch/x86/kernel/machine_kexec_64.c @@ -17,6 +17,7 @@ #include <linux/ftrace.h> #include <linux/io.h> #include <linux/suspend.h> +#include <linux/vmalloc.h> #include <asm/init.h> #include <asm/pgtable.h> @@ -25,6 +26,7 @@ #include <asm/io_apic.h> #include <asm/debugreg.h> #include <asm/kexec-bzimage64.h> +#include <asm/setup.h> #ifdef CONFIG_KEXEC_FILE static struct kexec_file_ops *kexec_file_loaders[] = { @@ -334,7 +336,7 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", - (unsigned long)&_text - __START_KERNEL); + kaslr_offset()); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/kernel/arch/x86/kernel/mcount_64.S b/kernel/arch/x86/kernel/mcount_64.S index 94ea120fa..87e1762e2 100644 --- a/kernel/arch/x86/kernel/mcount_64.S +++ b/kernel/arch/x86/kernel/mcount_64.S @@ -278,6 +278,12 @@ trace: /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* + * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not + * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the + * ip and parent ip are used and the list function is called when + * function tracing is enabled. + */ call *ftrace_trace_function restore_mcount_regs diff --git a/kernel/arch/x86/kernel/mpparse.c b/kernel/arch/x86/kernel/mpparse.c index 2d2a237f2..30ca7607c 100644 --- a/kernel/arch/x86/kernel/mpparse.c +++ b/kernel/arch/x86/kernel/mpparse.c @@ -19,8 +19,8 @@ #include <linux/module.h> #include <linux/smp.h> #include <linux/pci.h> -#include <linux/irqdomain.h> +#include <asm/irqdomain.h> #include <asm/mtrr.h> #include <asm/mpspec.h> #include <asm/pgalloc.h> @@ -113,11 +113,6 @@ static void __init MP_bus_info(struct mpc_bus *m) pr_warn("Unknown bustype %s - ignoring\n", str); } -static struct irq_domain_ops mp_ioapic_irqdomain_ops = { - .map = mp_irqdomain_map, - .unmap = mp_irqdomain_unmap, -}; - static void __init MP_ioapic_info(struct mpc_ioapic *m) { struct ioapic_domain_cfg cfg = { diff --git a/kernel/arch/x86/kernel/nmi.c b/kernel/arch/x86/kernel/nmi.c index d05bd2e2e..697f90db0 100644 --- a/kernel/arch/x86/kernel/nmi.c +++ b/kernel/arch/x86/kernel/nmi.c @@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w) a->handler, whole_msecs, decimal_msecs); } -static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +static int nmi_handle(unsigned int type, struct pt_regs *regs) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *a; @@ -213,7 +213,7 @@ static void pci_serr_error(unsigned char reason, struct pt_regs *regs) { /* check to see if anyone registered against these types of errors */ - if (nmi_handle(NMI_SERR, regs, false)) + if (nmi_handle(NMI_SERR, regs)) return; pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", @@ -247,7 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) unsigned long i; /* check to see if anyone registered against these types of errors */ - if (nmi_handle(NMI_IO_CHECK, regs, false)) + if (nmi_handle(NMI_IO_CHECK, regs)) return; pr_emerg( @@ -284,7 +284,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) * as only the first one is ever run (unless it can actually determine * if it caused the NMI) */ - handled = nmi_handle(NMI_UNKNOWN, regs, false); + 
handled = nmi_handle(NMI_UNKNOWN, regs); if (handled) { __this_cpu_add(nmi_stats.unknown, handled); return; @@ -332,7 +332,7 @@ static void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); - handled = nmi_handle(NMI_LOCAL, regs, b2b); + handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); if (handled) { /* diff --git a/kernel/arch/x86/kernel/paravirt-spinlocks.c b/kernel/arch/x86/kernel/paravirt-spinlocks.c index bbb6c7316..33ee3e0ef 100644 --- a/kernel/arch/x86/kernel/paravirt-spinlocks.c +++ b/kernel/arch/x86/kernel/paravirt-spinlocks.c @@ -8,11 +8,33 @@ #include <asm/paravirt.h> +#ifdef CONFIG_QUEUED_SPINLOCKS +__visible void __native_queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} + +PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); + +bool pv_is_native_spin_unlock(void) +{ + return pv_lock_ops.queued_spin_unlock.func == + __raw_callee_save___native_queued_spin_unlock; +} +#endif + struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP +#ifdef CONFIG_QUEUED_SPINLOCKS + .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, + .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), + .wait = paravirt_nop, + .kick = paravirt_nop, +#else /* !CONFIG_QUEUED_SPINLOCKS */ .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), .unlock_kick = paravirt_nop, -#endif +#endif /* !CONFIG_QUEUED_SPINLOCKS */ +#endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/kernel/arch/x86/kernel/paravirt.c b/kernel/arch/x86/kernel/paravirt.c index c614dd492..c2130aef3 100644 --- a/kernel/arch/x86/kernel/paravirt.c +++ b/kernel/arch/x86/kernel/paravirt.c @@ -41,10 +41,18 @@ #include <asm/timer.h> #include <asm/special_insns.h> -/* nop stub */ -void _paravirt_nop(void) -{ -} +/* + * nop stub, which must not clobber anything *including the stack* to + * avoid confusing the entry prologues. + */ +extern void _paravirt_nop(void); +asm (".pushsection .entry.text, \"ax\"\n" + ".global _paravirt_nop\n" + "_paravirt_nop:\n\t" + "ret\n\t" + ".size _paravirt_nop, . 
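The paravirt-spinlocks.c hunk above gives pv_lock_ops the queued-spinlock entry points plus the wait/kick hooks that kvm_wait() and kvm_kick_cpu() (earlier in this diff) plug into. A heavily simplified, hypothetical sketch of the contract between a waiter and those two hooks; the real __pv_queued_spin_lock_slowpath() is far more involved, and the state byte and function names here are assumptions:

/* Hypothetical sketch only.  `state` is an assumed per-waiter byte:
 * 0 = halted in the hypervisor, 1 = runnable. */
static void sketch_wait_for_kick(u8 *state)
{
	while (READ_ONCE(*state) != 1) {
		/* block this vCPU until the lock holder kicks us or *state changes */
		pv_lock_ops.wait(state, 0);
	}
}

static void sketch_kick_waiter(int cpu, u8 *state)
{
	WRITE_ONCE(*state, 1);
	pv_lock_ops.kick(cpu);	/* on KVM this ends up as the KICK_CPU hypercall */
}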
- _paravirt_nop\n\t" + ".type _paravirt_nop, @function\n\t" + ".popsection"); /* identity function, which can be inlined */ u32 _paravirt_ident_32(u32 x) @@ -154,7 +162,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || +#ifdef CONFIG_X86_32 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || +#endif type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ @@ -349,9 +359,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, .write_msr = native_write_msr_safe, - .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .read_tscp = native_read_tscp, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, .load_gdt = native_load_gdt, @@ -371,7 +379,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +#if defined(CONFIG_X86_32) .irq_enable_sysexit = native_irq_enable_sysexit, #endif #ifdef CONFIG_X86_64 diff --git a/kernel/arch/x86/kernel/paravirt_patch_32.c b/kernel/arch/x86/kernel/paravirt_patch_32.c index d9f32e6d6..c89f50a76 100644 --- a/kernel/arch/x86/kernel/paravirt_patch_32.c +++ b/kernel/arch/x86/kernel/paravirt_patch_32.c @@ -10,7 +10,10 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); -DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); + +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); +#endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { @@ -24,6 +27,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) return 0; } +extern bool pv_is_native_spin_unlock(void); + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { @@ -46,15 +51,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); - PATCH_SITE(pv_cpu_ops, read_tsc); - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): + if (pv_is_native_spin_unlock()) { + start = start_pv_lock_ops_queued_spin_unlock; + end = end_pv_lock_ops_queued_spin_unlock; + goto patch_site; + } +#endif default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; + +patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; } #undef PATCH_SITE return ret; diff --git a/kernel/arch/x86/kernel/paravirt_patch_64.c b/kernel/arch/x86/kernel/paravirt_patch_64.c index a1da6737b..8aa05583b 100644 --- a/kernel/arch/x86/kernel/paravirt_patch_64.c +++ b/kernel/arch/x86/kernel/paravirt_patch_64.c @@ -21,6 +21,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -33,6 +37,8 @@ unsigned 
paravirt_patch_ident_64(void *insnbuf, unsigned len) start__mov64, end__mov64); } +extern bool pv_is_native_spin_unlock(void); + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { @@ -49,7 +55,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); PATCH_SITE(pv_cpu_ops, swapgs); @@ -59,14 +64,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): + if (pv_is_native_spin_unlock()) { + start = start_pv_lock_ops_queued_spin_unlock; + end = end_pv_lock_ops_queued_spin_unlock; + goto patch_site; + } +#endif default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; + +patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; } #undef PATCH_SITE return ret; diff --git a/kernel/arch/x86/kernel/pci-dma.c b/kernel/arch/x86/kernel/pci-dma.c index a25e202bb..6ba014c61 100644 --- a/kernel/arch/x86/kernel/pci-dma.c +++ b/kernel/arch/x86/kernel/pci-dma.c @@ -58,17 +58,6 @@ EXPORT_SYMBOL(x86_dma_fallback_dev); /* Number of entries preallocated for DMA-API debugging */ #define PREALLOC_DMA_DEBUG_ENTRIES 65536 -int dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - - return 0; -} -EXPORT_SYMBOL(dma_set_mask); - void __init pci_iommu_alloc(void) { struct iommu_table_entry *p; @@ -101,7 +90,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, again: page = NULL; /* CMA can be used only in the context which permits sleeping */ - if (flag & __GFP_WAIT) { + if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); @@ -140,6 +129,21 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp) +{ + if (!*dev) + *dev = &x86_dma_fallback_dev; + + *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp); + + if (!is_device_dma_capable(*dev)) + return false; + return true; + +} +EXPORT_SYMBOL(arch_dma_alloc_attrs); + /* * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel * parameter documentation. diff --git a/kernel/arch/x86/kernel/pci-swiotlb.c b/kernel/arch/x86/kernel/pci-swiotlb.c index 77dd0ad58..adf0392d5 100644 --- a/kernel/arch/x86/kernel/pci-swiotlb.c +++ b/kernel/arch/x86/kernel/pci-swiotlb.c @@ -20,6 +20,13 @@ void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, { void *vaddr; + /* + * Don't print a warning when the first allocation attempt fails. + * swiotlb_alloc_coherent() will print a warning when the DMA + * memory allocation ultimately failed. 
+ */ + flags |= __GFP_NOWARN; + vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags, attrs); if (vaddr) diff --git a/kernel/arch/x86/kernel/pmem.c b/kernel/arch/x86/kernel/pmem.c index 3420c874d..14415aff1 100644 --- a/kernel/arch/x86/kernel/pmem.c +++ b/kernel/arch/x86/kernel/pmem.c @@ -1,53 +1,31 @@ /* * Copyright (c) 2015, Christoph Hellwig. + * Copyright (c) 2015, Intel Corporation. */ -#include <linux/memblock.h> #include <linux/platform_device.h> -#include <linux/slab.h> -#include <asm/e820.h> -#include <asm/page_types.h> -#include <asm/setup.h> +#include <linux/module.h> +#include <linux/ioport.h> -static __init void register_pmem_device(struct resource *res) +static int found(u64 start, u64 end, void *data) { - struct platform_device *pdev; - int error; - - pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO); - if (!pdev) - return; - - error = platform_device_add_resources(pdev, res, 1); - if (error) - goto out_put_pdev; - - error = platform_device_add(pdev); - if (error) - goto out_put_pdev; - return; - -out_put_pdev: - dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n"); - platform_device_put(pdev); + return 1; } -static __init int register_pmem_devices(void) +static __init int register_e820_pmem(void) { - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - - if (ei->type == E820_PRAM) { - struct resource res = { - .flags = IORESOURCE_MEM, - .start = ei->addr, - .end = ei->addr + ei->size - 1, - }; - register_pmem_device(&res); - } - } - - return 0; + char *pmem = "Persistent Memory (legacy)"; + struct platform_device *pdev; + int rc; + + rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found); + if (rc <= 0) + return 0; + + /* + * See drivers/nvdimm/e820.c for the implementation, this is + * simply here to trigger the module to load on demand. + */ + pdev = platform_device_alloc("e820_pmem", -1); + return platform_device_add(pdev); } -device_initcall(register_pmem_devices); +device_initcall(register_e820_pmem); diff --git a/kernel/arch/x86/kernel/process.c b/kernel/arch/x86/kernel/process.c index 971743774..9f7c21c22 100644 --- a/kernel/arch/x86/kernel/process.c +++ b/kernel/arch/x86/kernel/process.c @@ -25,11 +25,12 @@ #include <asm/idle.h> #include <asm/uaccess.h> #include <asm/mwait.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/debugreg.h> #include <asm/nmi.h> #include <asm/tlbflush.h> +#include <asm/mce.h> +#include <asm/vm86.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -76,47 +77,18 @@ void idle_notifier_unregister(struct notifier_block *n) EXPORT_SYMBOL_GPL(idle_notifier_unregister); #endif -struct kmem_cache *task_xstate_cachep; -EXPORT_SYMBOL_GPL(task_xstate_cachep); - /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. 
*/ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - *dst = *src; - - dst->thread.fpu_counter = 0; - dst->thread.fpu.has_fpu = 0; - dst->thread.fpu.state = NULL; - task_disable_lazy_fpu_restore(dst); - if (tsk_used_math(src)) { - int err = fpu_alloc(&dst->thread.fpu); - if (err) - return err; - fpu_copy(dst, src); - } - return 0; -} - -void free_thread_xstate(struct task_struct *tsk) -{ - fpu_free(&tsk->thread.fpu); -} - -void arch_release_task_struct(struct task_struct *tsk) -{ - free_thread_xstate(tsk); -} + memcpy(dst, src, arch_task_struct_size); +#ifdef CONFIG_VM86 + dst->thread.vm86 = NULL; +#endif -void arch_task_cache_init(void) -{ - task_xstate_cachep = - kmem_cache_create("task_xstate", xstate_size, - __alignof__(union thread_xstate), - SLAB_PANIC | SLAB_NOTRACK, NULL); - setup_xstate_comp(); + return fpu__copy(&dst->thread.fpu, &src->thread.fpu); } /* @@ -127,6 +99,7 @@ void exit_thread(void) struct task_struct *me = current; struct thread_struct *t = &me->thread; unsigned long *bp = t->io_bitmap_ptr; + struct fpu *fpu = &t->fpu; if (bp) { struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); @@ -142,7 +115,9 @@ void exit_thread(void) kfree(bp); } - drop_fpu(me); + free_vm86(t); + + fpu__drop(fpu); } void flush_thread(void) @@ -152,19 +127,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - if (!use_eager_fpu()) { - /* FPU state will be reallocated lazily at the first use. */ - drop_fpu(tsk); - free_thread_xstate(tsk); - } else { - if (!tsk_used_math(tsk)) { - /* kthread execs. TODO: cleanup this horror. */ - if (WARN_ON(init_fpu(tsk))) - force_sig(SIGKILL, tsk); - user_fpu_begin(); - } - restore_init_xstate(); - } + fpu__clear(&tsk->thread.fpu); } static void hard_disable_TSC(void) @@ -363,6 +326,7 @@ void stop_this_cpu(void *dummy) */ set_cpu_online(smp_processor_id(), false); disable_local_APIC(); + mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); for (;;) halt(); @@ -445,11 +409,10 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) } /* - * MONITOR/MWAIT with no hints, used for default default C1 state. - * This invokes MWAIT with interrutps enabled and no flags, - * which is backwards compatible with the original MWAIT implementation. + * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT + * with interrupts enabled and no flags, which is backwards compatible with the + * original MWAIT implementation. */ - static void mwait_idle(void) { if (!current_set_polling_and_test()) { @@ -546,3 +509,58 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) return randomize_range(mm->brk, range_end, 0) ? : mm->brk; } +/* + * Called from fs/proc with a reference on @p to find the function + * which called into schedule(). This needs to be done carefully + * because the task might wake up and we might look at a stack + * changing under us. 
+ */ +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long start, bottom, top, sp, fp, ip; + int count = 0; + + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + + start = (unsigned long)task_stack_page(p); + if (!start) + return 0; + + /* + * Layout of the stack page: + * + * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) + * PADDING + * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING + * stack + * ----------- bottom = start + sizeof(thread_info) + * thread_info + * ----------- start + * + * The tasks stack pointer points at the location where the + * framepointer is stored. The data on the stack is: + * ... IP FP ... IP FP + * + * We need to read FP and IP, so we need to adjust the upper + * bound by another unsigned long. + */ + top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; + top -= 2 * sizeof(unsigned long); + bottom = start + sizeof(struct thread_info); + + sp = READ_ONCE(p->thread.sp); + if (sp < bottom || sp > top) + return 0; + + fp = READ_ONCE_NOCHECK(*(unsigned long *)sp); + do { + if (fp < bottom || fp > top) + return 0; + ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); + if (!in_sched_functions(ip)) + return ip; + fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); + } while (count++ < 16 && p->state != TASK_RUNNING); + return 0; +} diff --git a/kernel/arch/x86/kernel/process_32.c b/kernel/arch/x86/kernel/process_32.c index 3a7071307..4dd4beae9 100644 --- a/kernel/arch/x86/kernel/process_32.c +++ b/kernel/arch/x86/kernel/process_32.c @@ -40,8 +40,7 @@ #include <asm/pgtable.h> #include <asm/ldt.h> #include <asm/processor.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/desc.h> #ifdef CONFIG_MATH_EMULATION #include <asm/math_emu.h> @@ -55,6 +54,7 @@ #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> +#include <asm/vm86.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); @@ -130,8 +130,8 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); struct task_struct *tsk; @@ -186,7 +186,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->si, 0); + (struct user_desc __user *)tls, 0); if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); @@ -272,14 +272,16 @@ __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; + *next = &next_p->thread; + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - fpu_switch_t fpu; + fpu_switch_t fpu_switch; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu = switch_fpu_prepare(prev_p, next_p, cpu); + fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); /* * Save away %gs. 
No need to save %fs, as it was saved on the @@ -308,14 +310,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) set_iopl_mask(next->iopl); /* - * If it were not for PREEMPT_ACTIVE we could guarantee that the - * preempt_count of all tasks was equal here and this would not be - * needed. - */ - task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); - this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); - - /* * Now maybe handle debug registers and/or IO bitmaps */ if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || @@ -328,19 +322,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so * the GDT and LDT are properly updated, and must be - * done before math_state_restore, so the TS bit is up + * done before fpu__restore(), so the TS bit is up * to date. */ arch_end_context_switch(next_p); /* - * Reload esp0, kernel_stack, and current_top_of_stack. This changes + * Reload esp0 and cpu_current_top_of_stack. This changes * current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); @@ -351,37 +342,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switch_fpu_finish(next_p, fpu); + switch_fpu_finish(next_fpu, fpu_switch); this_cpu_write(current_task, next_p); return prev_p; } - -#define top_esp (THREAD_SIZE - sizeof(unsigned long)) -#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) - -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long bp, sp, ip; - unsigned long stack_page; - int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - stack_page = (unsigned long)task_stack_page(p); - sp = p->thread.sp; - if (!stack_page || sp < stack_page || sp > top_esp+stack_page) - return 0; - /* include/asm-i386/system.h:switch_to() pushes bp last. */ - bp = *(unsigned long *) sp; - do { - if (bp < stack_page || bp > top_ebp+stack_page) - return 0; - ip = *(unsigned long *) (bp+4); - if (!in_sched_functions(ip)) - return ip; - bp = *(unsigned long *) bp; - } while (count++ < 16); - return 0; -} - diff --git a/kernel/arch/x86/kernel/process_64.c b/kernel/arch/x86/kernel/process_64.c index 5e0bf57d9..e835d263a 100644 --- a/kernel/arch/x86/kernel/process_64.c +++ b/kernel/arch/x86/kernel/process_64.c @@ -38,8 +38,7 @@ #include <asm/pgtable.h> #include <asm/processor.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/mmu_context.h> #include <asm/prctl.h> #include <asm/desc.h> @@ -122,6 +121,7 @@ void __show_regs(struct pt_regs *regs, int all) void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL if (dead_task->mm->context.ldt) { pr_warn("WARNING: dead process %s still has LDT? 
<%p/%d>\n", dead_task->comm, @@ -129,6 +129,7 @@ void release_thread(struct task_struct *dead_task) dead_task->mm->context.ldt->size); BUG(); } +#endif } } @@ -151,8 +152,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) return get_desc_base(&t->thread.tls_array[tls]); } -int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p, unsigned long tls) { int err; struct pt_regs *childregs; @@ -208,10 +209,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, #ifdef CONFIG_IA32_EMULATION if (is_ia32_task()) err = do_set_thread_area(p, -1, - (struct user_desc __user *)childregs->si, 0); + (struct user_desc __user *)tls, 0); else #endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + err = do_arch_prctl(p, ARCH_SET_FS, tls); if (err) goto out; } @@ -249,8 +250,8 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) __USER_CS, __USER_DS, 0); } -#ifdef CONFIG_IA32_EMULATION -void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +#ifdef CONFIG_COMPAT +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) { start_thread_common(regs, new_ip, new_sp, test_thread_flag(TIF_X32) @@ -274,12 +275,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; + struct fpu *prev_fpu = &prev->fpu; + struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned fsindex, gsindex; - fpu_switch_t fpu; + fpu_switch_t fpu_switch; - fpu = switch_fpu_prepare(prev_p, next_p, cpu); + fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -299,7 +302,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Leave lazy mode, flushing any hypercalls made here. This * must be done after loading TLS entries in the GDT but before * loading segments that might reference them, and and it must - * be done before math_state_restore, so the TS bit is up to + * be done before fpu__restore(), so the TS bit is up to * date. */ arch_end_context_switch(next_p); @@ -329,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch FS and GS. * - * These are even more complicated than FS and GS: they have + * These are even more complicated than DS and ES: they have * 64-bit bases are that controlled by arch_prctl. Those bases * only differ from the values in the GDT or LDT if the selector * is 0. @@ -391,27 +394,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; - switch_fpu_finish(next_p, fpu); + switch_fpu_finish(next_fpu, fpu_switch); /* * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); - /* - * If it were not for PREEMPT_ACTIVE we could guarantee that the - * preempt_count of all tasks was equal here and this would not be - * needed. - */ - task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); - this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); - /* Reload esp0 and ss1. This changes current_thread_info(). 
*/ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE); - /* * Now maybe reload the debug registers and handle I/O bitmaps */ @@ -499,30 +491,6 @@ void set_personality_ia32(bool x32) } EXPORT_SYMBOL_GPL(set_personality_ia32); -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long stack; - u64 fp, ip; - int count = 0; - - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - stack = (unsigned long)task_stack_page(p); - if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) - return 0; - fp = *(u64 *)(p->thread.sp); - do { - if (fp < (unsigned long)stack || - fp >= (unsigned long)stack+THREAD_SIZE) - return 0; - ip = *(u64 *)(fp+8); - if (!in_sched_functions(ip)) - return ip; - fp = *(u64 *)fp; - } while (count++ < 16); - return 0; -} - long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) { int ret = 0; diff --git a/kernel/arch/x86/kernel/ptrace.c b/kernel/arch/x86/kernel/ptrace.c index a7bc79480..558f50ede 100644 --- a/kernel/arch/x86/kernel/ptrace.c +++ b/kernel/arch/x86/kernel/ptrace.c @@ -11,7 +11,6 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/ptrace.h> -#include <linux/regset.h> #include <linux/tracehook.h> #include <linux/user.h> #include <linux/elf.h> @@ -28,8 +27,9 @@ #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/processor.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/regset.h> #include <asm/debugreg.h> #include <asm/ldt.h> #include <asm/desc.h> @@ -37,12 +37,10 @@ #include <asm/proto.h> #include <asm/hw_breakpoint.h> #include <asm/traps.h> +#include <asm/syscall.h> #include "tls.h" -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - enum x86_regset { REGSET_GENERAL, REGSET_FP, @@ -1123,6 +1121,73 @@ static int genregs32_set(struct task_struct *target, return ret; } +static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + __u32 val; + + switch (request) { + case PTRACE_PEEKUSR: + ret = getreg32(child, addr, &val); + if (ret == 0) + ret = put_user(val, (__u32 __user *)datap); + break; + + case PTRACE_POKEUSR: + ret = putreg32(child, addr, data); + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct32), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_GENERAL, 0, + sizeof(struct user_regs_struct32), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_FP, 0, + sizeof(struct user_i387_ia32_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user( + child, &user_x86_32_view, REGSET_FP, + 0, sizeof(struct user_i387_ia32_struct), datap); + + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. 
*/ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_GET_THREAD_AREA: + case PTRACE_SET_THREAD_AREA: + return arch_ptrace(child, request, addr, data); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif /* CONFIG_IA32_EMULATION */ + #ifdef CONFIG_X86_X32_ABI static long x32_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, @@ -1211,78 +1276,21 @@ static long x32_arch_ptrace(struct task_struct *child, } #endif +#ifdef CONFIG_COMPAT long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { - unsigned long addr = caddr; - unsigned long data = cdata; - void __user *datap = compat_ptr(data); - int ret; - __u32 val; - #ifdef CONFIG_X86_X32_ABI if (!is_ia32_task()) return x32_arch_ptrace(child, request, caddr, cdata); #endif - - switch (request) { - case PTRACE_PEEKUSR: - ret = getreg32(child, addr, &val); - if (ret == 0) - ret = put_user(val, (__u32 __user *)datap); - break; - - case PTRACE_POKEUSR: - ret = putreg32(child, addr, data); - break; - - case PTRACE_GETREGS: /* Get all gp regs from the child. */ - return copy_regset_to_user(child, &user_x86_32_view, - REGSET_GENERAL, - 0, sizeof(struct user_regs_struct32), - datap); - - case PTRACE_SETREGS: /* Set all gp regs in the child. */ - return copy_regset_from_user(child, &user_x86_32_view, - REGSET_GENERAL, 0, - sizeof(struct user_regs_struct32), - datap); - - case PTRACE_GETFPREGS: /* Get the child FPU state. */ - return copy_regset_to_user(child, &user_x86_32_view, - REGSET_FP, 0, - sizeof(struct user_i387_ia32_struct), - datap); - - case PTRACE_SETFPREGS: /* Set the child FPU state. */ - return copy_regset_from_user( - child, &user_x86_32_view, REGSET_FP, - 0, sizeof(struct user_i387_ia32_struct), datap); - - case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ - return copy_regset_to_user(child, &user_x86_32_view, - REGSET_XFP, 0, - sizeof(struct user32_fxsr_struct), - datap); - - case PTRACE_SETFPXREGS: /* Set the child extended FPU state. 
*/ - return copy_regset_from_user(child, &user_x86_32_view, - REGSET_XFP, 0, - sizeof(struct user32_fxsr_struct), - datap); - - case PTRACE_GET_THREAD_AREA: - case PTRACE_SET_THREAD_AREA: - return arch_ptrace(child, request, addr, data); - - default: - return compat_ptrace_request(child, request, addr, data); - } - - return ret; +#ifdef CONFIG_IA32_EMULATION + return ia32_arch_ptrace(child, request, caddr, cdata); +#else + return 0; +#endif } - -#endif /* CONFIG_IA32_EMULATION */ +#endif /* CONFIG_COMPAT */ #ifdef CONFIG_X86_64 @@ -1297,7 +1305,7 @@ static struct user_regset x86_64_regsets[] __read_mostly = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, @@ -1338,13 +1346,13 @@ static struct user_regset x86_32_regsets[] __read_mostly = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = fpregs_active, .get = fpregs_get, .set = fpregs_set + .active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set }, [REGSET_XFP] = { .core_note_type = NT_PRXFPREG, .n = sizeof(struct user32_fxsr_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, @@ -1434,201 +1442,3 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, /* Send us the fake SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); } - -static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) -{ -#ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - audit_syscall_entry(regs->orig_ax, regs->di, - regs->si, regs->dx, regs->r10); - } else -#endif - { - audit_syscall_entry(regs->orig_ax, regs->bx, - regs->cx, regs->dx, regs->si); - } -} - -/* - * We can return 0 to resume the syscall or anything else to go to phase - * 2. If we resume the syscall, we need to put something appropriate in - * regs->orig_ax. - * - * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax - * are fully functional. - * - * For phase 2's benefit, our return value is: - * 0: resume the syscall - * 1: go to phase 2; no seccomp phase 2 needed - * anything else: go to phase 2; pass return value to seccomp - */ -unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) -{ - unsigned long ret = 0; - u32 work; - - BUG_ON(regs != task_pt_regs(current)); - - work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; - - /* - * If TIF_NOHZ is set, we are required to call user_exit() before - * doing anything that could touch RCU. - */ - if (work & _TIF_NOHZ) { - user_exit(); - work &= ~_TIF_NOHZ; - } - -#ifdef CONFIG_SECCOMP - /* - * Do seccomp first -- it should minimize exposure of other - * code, and keeping seccomp fast is probably more valuable - * than the rest of this. 
- */ - if (work & _TIF_SECCOMP) { - struct seccomp_data sd; - - sd.arch = arch; - sd.nr = regs->orig_ax; - sd.instruction_pointer = regs->ip; -#ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - sd.args[0] = regs->di; - sd.args[1] = regs->si; - sd.args[2] = regs->dx; - sd.args[3] = regs->r10; - sd.args[4] = regs->r8; - sd.args[5] = regs->r9; - } else -#endif - { - sd.args[0] = regs->bx; - sd.args[1] = regs->cx; - sd.args[2] = regs->dx; - sd.args[3] = regs->si; - sd.args[4] = regs->di; - sd.args[5] = regs->bp; - } - - BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); - BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); - - ret = seccomp_phase1(&sd); - if (ret == SECCOMP_PHASE1_SKIP) { - regs->orig_ax = -1; - ret = 0; - } else if (ret != SECCOMP_PHASE1_OK) { - return ret; /* Go directly to phase 2 */ - } - - work &= ~_TIF_SECCOMP; - } -#endif - - /* Do our best to finish without phase 2. */ - if (work == 0) - return ret; /* seccomp and/or nohz only (ret == 0 here) */ - -#ifdef CONFIG_AUDITSYSCALL - if (work == _TIF_SYSCALL_AUDIT) { - /* - * If there is no more work to be done except auditing, - * then audit in phase 1. Phase 2 always audits, so, if - * we audit here, then we can't go on to phase 2. - */ - do_audit_syscall_entry(regs, arch); - return 0; - } -#endif - - return 1; /* Something is enabled that we can't handle in phase 1 */ -} - -/* Returns the syscall nr to run (which should match regs->orig_ax). */ -long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, - unsigned long phase1_result) -{ - long ret = 0; - u32 work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; - - BUG_ON(regs != task_pt_regs(current)); - - /* - * If we stepped into a sysenter/syscall insn, it trapped in - * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. - * If user-mode had set TF itself, then it's still clear from - * do_debug() and we need to set it again to restore the user - * state. If we entered on the slow path, TF was already set. - */ - if (work & _TIF_SINGLESTEP) - regs->flags |= X86_EFLAGS_TF; - -#ifdef CONFIG_SECCOMP - /* - * Call seccomp_phase2 before running the other hooks so that - * they can see any changes made by a seccomp tracer. - */ - if (phase1_result > 1 && seccomp_phase2(phase1_result)) { - /* seccomp failures shouldn't expose any additional code. */ - return -1; - } -#endif - - if (unlikely(work & _TIF_SYSCALL_EMU)) - ret = -1L; - - if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && - tracehook_report_syscall_entry(regs)) - ret = -1L; - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_enter(regs, regs->orig_ax); - - do_audit_syscall_entry(regs, arch); - - return ret ?: regs->orig_ax; -} - -long syscall_trace_enter(struct pt_regs *regs) -{ - u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; - unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); - - if (phase1_result == 0) - return regs->orig_ax; - else - return syscall_trace_enter_phase2(regs, arch, phase1_result); -} - -void syscall_trace_leave(struct pt_regs *regs) -{ - bool step; - - /* - * We may come here right after calling schedule_user() - * or do_notify_resume(), in which case we can be in RCU - * user mode. - */ - user_exit(); - - audit_syscall_exit(regs); - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_exit(regs, regs->ax); - - /* - * If TIF_SYSCALL_EMU is set, we only get here because of - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 
- * We already reported this syscall instruction in - * syscall_trace_enter(). - */ - step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && - !test_thread_flag(TIF_SYSCALL_EMU); - if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); - - user_enter(); -} diff --git a/kernel/arch/x86/kernel/quirks.c b/kernel/arch/x86/kernel/quirks.c index 176a0f99d..cc457ff81 100644 --- a/kernel/arch/x86/kernel/quirks.c +++ b/kernel/arch/x86/kernel/quirks.c @@ -524,7 +524,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU, */ static void force_disable_hpet_msi(struct pci_dev *unused) { - hpet_msi_disable = 1; + hpet_msi_disable = true; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, diff --git a/kernel/arch/x86/kernel/reboot.c b/kernel/arch/x86/kernel/reboot.c index 86db4bcd7..f660d63f4 100644 --- a/kernel/arch/x86/kernel/reboot.c +++ b/kernel/arch/x86/kernel/reboot.c @@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), }, }, + { /* Handle problems with rebooting on the iMac10,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac10,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"), + }, + }, /* ASRock */ { /* Handle problems with rebooting on ASRock Q1900DC-ITX */ @@ -673,7 +681,7 @@ struct machine_ops machine_ops = { .emergency_restart = native_machine_emergency_restart, .restart = native_machine_restart, .halt = native_machine_halt, -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE .crash_shutdown = native_machine_crash_shutdown, #endif }; @@ -703,7 +711,7 @@ void machine_halt(void) machine_ops.halt(); } -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE void machine_crash_shutdown(struct pt_regs *regs) { machine_ops.crash_shutdown(regs); diff --git a/kernel/arch/x86/kernel/rtc.c b/kernel/arch/x86/kernel/rtc.c index cd9685235..4af8d063f 100644 --- a/kernel/arch/x86/kernel/rtc.c +++ b/kernel/arch/x86/kernel/rtc.c @@ -200,6 +200,9 @@ static __init int add_rtc_cmos(void) } #endif + if (paravirt_enabled() && !paravirt_has(RTC)) + return -ENODEV; + platform_device_register(&rtc_device); dev_info(&rtc_device.dev, "registered platform RTC device (no PNP device found)\n"); diff --git a/kernel/arch/x86/kernel/setup.c b/kernel/arch/x86/kernel/setup.c index d74ac3329..d2bbe343f 100644 --- a/kernel/arch/x86/kernel/setup.c +++ b/kernel/arch/x86/kernel/setup.c @@ -111,6 +111,7 @@ #include <asm/mce.h> #include <asm/alternative.h> #include <asm/prom.h> +#include <asm/microcode.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -317,15 +318,12 @@ static u64 __init get_ramdisk_size(void) return ramdisk_size; } -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { /* Assume only end is not page aligned */ u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); - unsigned long slop, clen, mapaddr; - char *p, *q; /* We need to move the initrd down into directly mapped mem */ relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), @@ -343,25 +341,8 @@ static void __init relocate_initrd(void) printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); - q = (char *)initrd_start; - - /* Copy the initrd */ - while (ramdisk_size) { - slop = ramdisk_image & ~PAGE_MASK; - clen = ramdisk_size; - if (clen > 
MAX_MAP_CHUNK-slop) - clen = MAX_MAP_CHUNK-slop; - mapaddr = ramdisk_image & PAGE_MASK; - p = early_memremap(mapaddr, clen+slop); - memcpy(q, p+slop, clen); - early_memunmap(p, clen+slop); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } + copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); - ramdisk_image = get_ramdisk_image(); - ramdisk_size = get_ramdisk_size(); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, @@ -461,19 +442,18 @@ static void __init e820_reserve_setup_data(void) { struct setup_data *data; u64 pa_data; - int found = 0; pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return; + while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN); - found = 1; pa_data = data->next; early_memunmap(data, sizeof(*data)); } - if (!found) - return; sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); memcpy(&e820_saved, &e820, sizeof(struct e820map)); @@ -499,7 +479,10 @@ static void __init memblock_x86_reserve_range_setup_data(void) * --------- Crashkernel reservation ------------------------------ */ -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE + +/* 16M alignment for crash kernel regions */ +#define CRASH_ALIGN (16 << 20) /* * Keep the crash kernel below this limit. On 32 bits earlier kernels @@ -507,81 +490,80 @@ static void __init memblock_x86_reserve_range_setup_data(void) * On 64bit, old kexec-tools need to under 896MiB. */ #ifdef CONFIG_X86_32 -# define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20) -# define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20) +# define CRASH_ADDR_LOW_MAX (512 << 20) +# define CRASH_ADDR_HIGH_MAX (512 << 20) #else -# define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20) -# define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM +# define CRASH_ADDR_LOW_MAX (896UL << 20) +# define CRASH_ADDR_HIGH_MAX MAXMEM #endif -static void __init reserve_crashkernel_low(void) +static int __init reserve_crashkernel_low(void) { #ifdef CONFIG_X86_64 - const unsigned long long alignment = 16<<20; /* 16M */ - unsigned long long low_base = 0, low_size = 0; + unsigned long long base, low_base = 0, low_size = 0; unsigned long total_low_mem; - unsigned long long base; - bool auto_set = false; int ret; - total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); + /* crashkernel=Y,low */ - ret = parse_crashkernel_low(boot_command_line, total_low_mem, - &low_size, &base); - if (ret != 0) { + ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); + if (ret) { /* * two parts from lib/swiotlb.c: - * swiotlb size: user specified with swiotlb= or default. - * swiotlb overflow buffer: now is hardcoded to 32k. - * We round it to 8M for other buffers that - * may need to stay low too. + * -swiotlb size: user-specified with swiotlb= or default. + * + * -swiotlb overflow buffer: now hardcoded to 32k. We round it + * to 8M for other buffers that may need to stay low too. Also + * make sure we allocate enough extra low memory so that we + * don't run out of DMA buffers for 32-bit devices. */ - low_size = swiotlb_size_or_default() + (8UL<<20); - auto_set = true; + low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); } else { /* passed with crashkernel=0,low ? 
*/ if (!low_size) - return; + return 0; } - low_base = memblock_find_in_range(low_size, (1ULL<<32), - low_size, alignment); - + low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN); if (!low_base) { - if (!auto_set) - pr_info("crashkernel low reservation failed - No suitable area found.\n"); + pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", + (unsigned long)(low_size >> 20)); + return -ENOMEM; + } - return; + ret = memblock_reserve(low_base, low_size); + if (ret) { + pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); + return ret; } - memblock_reserve(low_base, low_size); pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", - (unsigned long)(low_size >> 20), - (unsigned long)(low_base >> 20), - (unsigned long)(total_low_mem >> 20)); + (unsigned long)(low_size >> 20), + (unsigned long)(low_base >> 20), + (unsigned long)(total_low_mem >> 20)); + crashk_low_res.start = low_base; crashk_low_res.end = low_base + low_size - 1; insert_resource(&iomem_resource, &crashk_low_res); #endif + return 0; } static void __init reserve_crashkernel(void) { - const unsigned long long alignment = 16<<20; /* 16M */ - unsigned long long total_mem; - unsigned long long crash_size, crash_base; + unsigned long long crash_size, crash_base, total_mem; bool high = false; int ret; total_mem = memblock_phys_mem_size(); /* crashkernel=XM */ - ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); + ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) { /* crashkernel=X,high */ ret = parse_crashkernel_high(boot_command_line, total_mem, - &crash_size, &crash_base); + &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) return; high = true; @@ -592,11 +574,10 @@ static void __init reserve_crashkernel(void) /* * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ - crash_base = memblock_find_in_range(alignment, - high ? CRASH_KERNEL_ADDR_HIGH_MAX : - CRASH_KERNEL_ADDR_LOW_MAX, - crash_size, alignment); - + crash_base = memblock_find_in_range(CRASH_ALIGN, + high ? 
CRASH_ADDR_HIGH_MAX + : CRASH_ADDR_LOW_MAX, + crash_size, CRASH_ALIGN); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; @@ -606,26 +587,32 @@ static void __init reserve_crashkernel(void) unsigned long long start; start = memblock_find_in_range(crash_base, - crash_base + crash_size, crash_size, 1<<20); + crash_base + crash_size, + crash_size, 1 << 20); if (start != crash_base) { pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } - memblock_reserve(crash_base, crash_size); + ret = memblock_reserve(crash_base, crash_size); + if (ret) { + pr_err("%s: Error reserving crashkernel memblock.\n", __func__); + return; + } - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); + if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { + memblock_free(crash_base, crash_size); + return; + } + + pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); - - if (crash_base >= (1ULL<<32)) - reserve_crashkernel_low(); } #else static void __init reserve_crashkernel(void) @@ -834,7 +821,7 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) { if (kaslr_enabled()) { pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", - (unsigned long)&_text - __START_KERNEL, + kaslr_offset(), __START_KERNEL, __START_KERNEL_map, MODULES_VADDR-1); @@ -915,11 +902,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 apm_info.bios = boot_params.apm_bios_info; ist_info = boot_params.ist_info; - if (boot_params.sys_desc_table.length != 0) { - machine_id = boot_params.sys_desc_table.table[0]; - machine_submodel_id = boot_params.sys_desc_table.table[1]; - BIOS_revision = boot_params.sys_desc_table.table[2]; - } #endif saved_video_mode = boot_params.hdr.vid_mode; bootloader_type = boot_params.hdr.type_of_loader; @@ -1103,6 +1085,11 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); + if (efi_enabled(EFI_BOOT)) { + efi_fake_memmap(); + efi_find_mirror(); + } + /* * The EFI specification says that boot service code won't be called * after ExitBootServices(). This is, in fact, a lie. @@ -1194,6 +1181,14 @@ void __init setup_arch(char **cmdline_p) clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. 
+ */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); #endif tboot_probe(); @@ -1222,8 +1217,7 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); init_apic_mappings(); - if (x86_io_apic_ops.init) - x86_io_apic_ops.init(); + io_apic_init_mappings(); kvm_guest_init(); diff --git a/kernel/arch/x86/kernel/signal.c b/kernel/arch/x86/kernel/signal.c index 12c28f79e..cb6282c36 100644 --- a/kernel/arch/x86/kernel/signal.c +++ b/kernel/arch/x86/kernel/signal.c @@ -26,16 +26,16 @@ #include <asm/processor.h> #include <asm/ucontext.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> +#include <asm/fpu/signal.h> #include <asm/vdso.h> #include <asm/mce.h> #include <asm/sighandling.h> +#include <asm/vm86.h> #ifdef CONFIG_X86_64 #include <asm/proto.h> #include <asm/ia32_unistd.h> -#include <asm/sys_ia32.h> #endif /* CONFIG_X86_64 */ #include <asm/syscall.h> @@ -63,6 +63,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { + unsigned long buf_val; void __user *buf; unsigned int tmpflags; unsigned int err = 0; @@ -107,10 +108,11 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); regs->orig_ax = -1; /* disable syscall checks */ - get_user_ex(buf, &sc->fpstate); + get_user_ex(buf_val, &sc->fpstate); + buf = (void __user *)buf_val; } get_user_catch(err); - err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); + err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32)); force_iret(); @@ -196,7 +198,7 @@ static unsigned long align_sigframe(unsigned long sp) return sp; } -static inline void __user * +static void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { @@ -205,6 +207,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long sp = regs->sp; unsigned long buf_fx = 0; int onsigstack = on_sig_stack(sp); + struct fpu *fpu = ¤t->thread.fpu; /* redzone */ if (config_enabled(CONFIG_X86_64)) @@ -224,9 +227,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } } - if (used_math()) { - sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32), - &buf_fx, &math_size); + if (fpu->fpstate_active) { + sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32), + &buf_fx, &math_size); *fpstate = (void __user *)sp; } @@ -240,8 +243,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (used_math() && - save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0) + if (fpu->fpstate_active && + copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; return (void __user *)sp; @@ -298,7 +301,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, if (current->mm->context.vdso) restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_sigreturn; + vdso_image_32.sym___kernel_sigreturn; else restorer = &frame->retcode; if (ksig->ka.sa.sa_flags & SA_RESTORER) @@ -362,7 +365,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, /* Set up to return from userspace. 
*/ restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_rt_sigreturn; + vdso_image_32.sym___kernel_rt_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; put_user_ex(restorer, &frame->pretcode); @@ -589,6 +592,22 @@ badframe: return 0; } +static inline int is_ia32_compat_frame(void) +{ + return config_enabled(CONFIG_IA32_EMULATION) && + test_thread_flag(TIF_IA32); +} + +static inline int is_ia32_frame(void) +{ + return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); +} + +static inline int is_x32_frame(void) +{ + return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); +} + static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) { @@ -613,6 +632,10 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { bool stepping, failed; + struct fpu *fpu = ¤t->thread.fpu; + + if (v8086_mode(regs)) + save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { @@ -661,25 +684,28 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - if (used_math()) - fpu_reset_state(current); + if (fpu->fpstate_active) + fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); } -#ifdef CONFIG_X86_32 -#define NR_restart_syscall __NR_restart_syscall -#else /* !CONFIG_X86_32 */ -#define NR_restart_syscall \ - test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall -#endif /* CONFIG_X86_32 */ +static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) +{ +#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) + return __NR_restart_syscall; +#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ + return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : + __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +} /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. 
*/ -static void do_signal(struct pt_regs *regs) +void do_signal(struct pt_regs *regs) { struct ksignal ksig; @@ -701,7 +727,7 @@ static void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: - regs->ax = NR_restart_syscall; + regs->ax = get_nr_restart_syscall(regs); regs->ip -= 2; break; } @@ -714,40 +740,6 @@ static void do_signal(struct pt_regs *regs) restore_saved_sigmask(); } -/* - * notification of userspace execution resumption - * - triggered by the TIF_WORK_MASK flags - */ -__visible void -do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) -{ - user_exit(); - -#ifdef ARCH_RT_DELAYS_SIGNAL_SEND - if (unlikely(current->forced_info.si_signo)) { - struct task_struct *t = current; - force_sig_info(t->forced_info.si_signo, &t->forced_info, t); - t->forced_info.si_signo = 0; - } -#endif - - if (thread_info_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - } - if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) - fire_user_return_notifiers(); - - user_enter(); -} - void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; diff --git a/kernel/arch/x86/kernel/signal_compat.c b/kernel/arch/x86/kernel/signal_compat.c new file mode 100644 index 000000000..dc3c0b1c8 --- /dev/null +++ b/kernel/arch/x86/kernel/signal_compat.c @@ -0,0 +1,95 @@ +#include <linux/compat.h> +#include <linux/uaccess.h> + +int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) +{ + int err = 0; + bool ia32 = test_thread_flag(TIF_IA32); + + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) + return -EFAULT; + + put_user_try { + /* If you change siginfo_t structure, please make sure that + this code is fixed accordingly. + It should never copy any pad contained in the structure + to avoid security leaks, but must copy the generic + 3 ints plus the relevant union member. */ + put_user_ex(from->si_signo, &to->si_signo); + put_user_ex(from->si_errno, &to->si_errno); + put_user_ex((short)from->si_code, &to->si_code); + + if (from->si_code < 0) { + put_user_ex(from->si_pid, &to->si_pid); + put_user_ex(from->si_uid, &to->si_uid); + put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); + } else { + /* + * First 32bits of unions are always present: + * si_pid === si_band === si_tid === si_addr(LS half) + */ + put_user_ex(from->_sifields._pad[0], + &to->_sifields._pad[0]); + switch (from->si_code >> 16) { + case __SI_FAULT >> 16: + break; + case __SI_SYS >> 16: + put_user_ex(from->si_syscall, &to->si_syscall); + put_user_ex(from->si_arch, &to->si_arch); + break; + case __SI_CHLD >> 16: + if (ia32) { + put_user_ex(from->si_utime, &to->si_utime); + put_user_ex(from->si_stime, &to->si_stime); + } else { + put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); + put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); + } + put_user_ex(from->si_status, &to->si_status); + /* FALL THROUGH */ + default: + case __SI_KILL >> 16: + put_user_ex(from->si_uid, &to->si_uid); + break; + case __SI_POLL >> 16: + put_user_ex(from->si_fd, &to->si_fd); + break; + case __SI_TIMER >> 16: + put_user_ex(from->si_overrun, &to->si_overrun); + put_user_ex(ptr_to_compat(from->si_ptr), + &to->si_ptr); + break; + /* This is not generated by the kernel as of now. 
*/ + case __SI_RT >> 16: + case __SI_MESGQ >> 16: + put_user_ex(from->si_uid, &to->si_uid); + put_user_ex(from->si_int, &to->si_int); + break; + } + } + } put_user_catch(err); + + return err; +} + +int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) +{ + int err = 0; + u32 ptr32; + + if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) + return -EFAULT; + + get_user_try { + get_user_ex(to->si_signo, &from->si_signo); + get_user_ex(to->si_errno, &from->si_errno); + get_user_ex(to->si_code, &from->si_code); + + get_user_ex(to->si_pid, &from->si_pid); + get_user_ex(to->si_uid, &from->si_uid); + get_user_ex(ptr32, &from->si_ptr); + to->si_ptr = compat_ptr(ptr32); + } get_user_catch(err); + + return err; +} diff --git a/kernel/arch/x86/kernel/smp.c b/kernel/arch/x86/kernel/smp.c index be8e1bde0..12c828620 100644 --- a/kernel/arch/x86/kernel/smp.c +++ b/kernel/arch/x86/kernel/smp.c @@ -30,6 +30,7 @@ #include <asm/proto.h> #include <asm/apic.h> #include <asm/nmi.h> +#include <asm/mce.h> #include <asm/trace/irq_vectors.h> /* * Some notes on x86 processor bugs affecting SMP operation: @@ -170,8 +171,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) asmlinkage __visible void smp_reboot_interrupt(void) { - ack_APIC_irq(); - irq_enter(); + ipi_entering_ack_irq(); stop_this_cpu(NULL); irq_exit(); } @@ -244,6 +244,7 @@ static void native_stop_other_cpus(int wait) finish: local_irq_save(flags); disable_local_APIC(); + mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); local_irq_restore(flags); } @@ -265,12 +266,6 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs) */ } -static inline void smp_entering_irq(void) -{ - ack_APIC_irq(); - irq_enter(); -} - __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) { /* @@ -279,7 +274,7 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) * scheduler_ipi(). This is OK, since those functions are allowed * to nest. 
*/ - smp_entering_irq(); + ipi_entering_ack_irq(); trace_reschedule_entry(RESCHEDULE_VECTOR); __smp_reschedule_interrupt(); trace_reschedule_exit(RESCHEDULE_VECTOR); @@ -297,14 +292,14 @@ static inline void __smp_call_function_interrupt(void) __visible void smp_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); __smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); @@ -319,14 +314,14 @@ static inline void __smp_call_function_single_interrupt(void) __visible void smp_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); __smp_call_function_single_interrupt(); exiting_irq(); } __visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) { - smp_entering_irq(); + ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); __smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); diff --git a/kernel/arch/x86/kernel/smpboot.c b/kernel/arch/x86/kernel/smpboot.c index 50e547eac..fbabe4fcc 100644 --- a/kernel/arch/x86/kernel/smpboot.c +++ b/kernel/arch/x86/kernel/smpboot.c @@ -68,8 +68,7 @@ #include <asm/mwait.h> #include <asm/apic.h> #include <asm/io_apic.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -98,8 +97,6 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); -atomic_t init_deasserted; - static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; @@ -147,16 +144,11 @@ static void smp_callin(void) /* * If waken up by an INIT in an 82489DX configuration - * we may get here before an INIT-deassert IPI reaches - * our local APIC. We have to wait for the IPI or we'll - * lock up on an APIC access. - * - * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI. + * cpu_callout_mask guarantees we don't get here before + * an INIT_deassert IPI reaches our local APIC, so it is + * now safe to touch our local APIC. */ cpuid = smp_processor_id(); - if (apic->wait_for_init_deassert && cpuid) - while (!atomic_read(&init_deasserted)) - cpu_relax(); /* * (This works even if the APIC is not enabled.) @@ -172,11 +164,6 @@ static void smp_callin(void) apic_ap_setup(); /* - * Need to setup vector mappings before we enable interrupts. - */ - setup_vector_irq(smp_processor_id()); - - /* * Save our processor parameters. Note: this information * is needed for clock calibration. */ @@ -240,18 +227,13 @@ static void notrace start_secondary(void *unused) check_tsc_sync_target(); /* - * Enable the espfix hack for this CPU - */ -#ifdef CONFIG_X86_ESPFIX64 - init_espfix_ap(); -#endif - - /* - * We need to hold vector_lock so there the set of online cpus - * does not change while we are assigning vectors to cpus. Holding - * this lock ensures we don't half assign or remove an irq from a cpu. + * Lock vector_lock and initialize the vectors on this cpu + * before setting the cpu online. We must set it online with + * vector_lock held to prevent a concurrent setup/teardown + * from seeing a half valid vector space. 
*/ lock_vector_lock(); + setup_vector_irq(smp_processor_id()); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); cpu_set_state_online(smp_processor_id()); @@ -314,10 +296,10 @@ topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); } -#define link_mask(_m, c1, c2) \ +#define link_mask(mfunc, c1, c2) \ do { \ - cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \ - cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ + cpumask_set_cpu((c1), mfunc(c2)); \ + cpumask_set_cpu((c2), mfunc(c1)); \ } while (0) static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) @@ -398,9 +380,9 @@ void set_cpu_sibling_map(int cpu) cpumask_set_cpu(cpu, cpu_sibling_setup_mask); if (!has_mp) { - cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); - cpumask_set_cpu(cpu, cpu_core_mask(cpu)); + cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); c->booted_cores = 1; return; } @@ -409,32 +391,34 @@ void set_cpu_sibling_map(int cpu) o = &cpu_data(i); if ((i == cpu) || (has_smt && match_smt(c, o))) - link_mask(sibling, cpu, i); + link_mask(topology_sibling_cpumask, cpu, i); if ((i == cpu) || (has_mp && match_llc(c, o))) - link_mask(llc_shared, cpu, i); + link_mask(cpu_llc_shared_mask, cpu, i); } /* * This needs a separate iteration over the cpus because we rely on all - * cpu_sibling_mask links to be set-up. + * topology_sibling_cpumask links to be set-up. */ for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); if ((i == cpu) || (has_mp && match_die(c, o))) { - link_mask(core, cpu, i); + link_mask(topology_core_cpumask, cpu, i); /* * Does this new cpu bringup a new core? */ - if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) { + if (cpumask_weight( + topology_sibling_cpumask(cpu)) == 1) { /* * for each core in package, increment * the booted_cores for this new cpu */ - if (cpumask_first(cpu_sibling_mask(i)) == i) + if (cpumask_first( + topology_sibling_cpumask(i)) == i) c->booted_cores++; /* * increment the core count for all @@ -514,6 +498,44 @@ void __inquire_remote_apic(int apicid) } /* + * The Multiprocessor Specification 1.4 (1997) example code suggests + * that there should be a 10ms delay between the BSP asserting INIT + * and de-asserting INIT, when starting a remote processor. + * But that slows boot and resume on modern processors, which include + * many cores and don't require that delay. + * + * Cmdline "init_cpu_udelay=" is available to over-ride this delay. + * Modern processor families are quirked to remove the delay entirely. + */ +#define UDELAY_10MS_DEFAULT 10000 + +static unsigned int init_udelay = UINT_MAX; + +static int __init cpu_init_udelay(char *str) +{ + get_option(&str, &init_udelay); + + return 0; +} +early_param("cpu_init_udelay", cpu_init_udelay); + +static void __init smp_quirk_init_udelay(void) +{ + /* if cmdline changed it from default, leave it alone */ + if (init_udelay != UINT_MAX) + return; + + /* if modern processor, use no delay */ + if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { + init_udelay = 0; + return; + } + /* else, use legacy delay */ + init_udelay = UDELAY_10MS_DEFAULT; +} + +/* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... 
remember to clear down the APIC, etc later. @@ -555,7 +577,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { - unsigned long send_status, accept_status = 0; + unsigned long send_status = 0, accept_status = 0; int maxlvt, num_starts, j; maxlvt = lapic_get_maxlvt(); @@ -583,7 +605,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); - mdelay(10); + udelay(init_udelay); pr_debug("Deasserting INIT\n"); @@ -595,7 +617,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) send_status = safe_apic_wait_icr_idle(); mb(); - atomic_set(&init_deasserted, 1); /* * Should we send STARTUP IPIs ? @@ -640,7 +661,10 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Give the other CPU some time to accept the IPI. */ - udelay(300); + if (init_udelay == 0) + udelay(10); + else + udelay(300); pr_debug("Startup point 1\n"); @@ -650,7 +674,11 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Give the other CPU some time to accept the IPI. */ - udelay(200); + if (init_udelay == 0) + udelay(10); + else + udelay(200); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); @@ -792,8 +820,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); #endif - per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; } /* @@ -820,6 +846,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) initial_code = (unsigned long)start_secondary; stack_start = idle->thread.sp; + /* + * Enable the espfix hack for this CPU + */ +#ifdef CONFIG_X86_ESPFIX64 + init_espfix_ap(cpu); +#endif + /* So we see what's up */ announce_cpu(cpu, apicid); @@ -828,8 +861,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) * the targeted processor. */ - atomic_set(&init_deasserted, 0); - if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { pr_debug("Setting warm reset code and vector.\n"); @@ -867,7 +898,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) if (!boot_error) { /* - * Wait 10s total for a response from AP + * Wait 10s total for first sign of life from AP */ boot_error = -1; timeout = jiffies + 10*HZ; @@ -880,7 +911,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) boot_error = 0; break; } - udelay(100); schedule(); } } @@ -896,7 +926,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) * for the MTRR work(triggered by the AP coming online) * to be completed in the stop machine context. */ - udelay(100); schedule(); } } @@ -961,8 +990,17 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) common_cpu_up(cpu, tidle); + /* + * We have to walk the irq descriptors to setup the vector + * space for the cpu which comes online. Prevent irq + * alloc/free across the bringup. 
+ */ + irq_lock_sparse(); + err = do_boot_cpu(apicid, cpu, tidle); + if (err) { + irq_unlock_sparse(); pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); return -EIO; } @@ -980,6 +1018,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) touch_nmi_watchdog(); } + irq_unlock_sparse(); + return 0; } @@ -1009,8 +1049,8 @@ static __init void disable_smp(void) physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); else physid_set_mask_of_physid(0, &phys_cpu_present_map); - cpumask_set_cpu(0, cpu_sibling_mask(0)); - cpumask_set_cpu(0, cpu_core_mask(0)); + cpumask_set_cpu(0, topology_sibling_cpumask(0)); + cpumask_set_cpu(0, topology_core_cpumask(0)); } enum { @@ -1176,6 +1216,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) uv_system_init(); set_mtrr_aps_delayed_init(); + + smp_quirk_init_udelay(); } void arch_enable_nonboot_cpus_begin(void) @@ -1293,28 +1335,28 @@ static void remove_siblinginfo(int cpu) int sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); - for_each_cpu(sibling, cpu_core_mask(cpu)) { - cpumask_clear_cpu(cpu, cpu_core_mask(sibling)); + for_each_cpu(sibling, topology_core_cpumask(cpu)) { + cpumask_clear_cpu(cpu, topology_core_cpumask(sibling)); /*/ * last thread sibling in this cpu core going down */ - if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) + if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1) cpu_data(sibling).booted_cores--; } - for_each_cpu(sibling, cpu_sibling_mask(cpu)) - cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling)); + for_each_cpu(sibling, topology_sibling_cpumask(cpu)) + cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); - cpumask_clear(cpu_sibling_mask(cpu)); - cpumask_clear(cpu_core_mask(cpu)); + cpumask_clear(topology_sibling_cpumask(cpu)); + cpumask_clear(topology_core_cpumask(cpu)); c->phys_proc_id = 0; c->cpu_core_id = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); } -static void __ref remove_cpu_from_maps(int cpu) +static void remove_cpu_from_maps(int cpu) { set_cpu_online(cpu, false); cpumask_clear_cpu(cpu, cpu_callout_mask); diff --git a/kernel/arch/x86/kernel/step.c b/kernel/arch/x86/kernel/step.c index 0ccb53a9f..c9a073866 100644 --- a/kernel/arch/x86/kernel/step.c +++ b/kernel/arch/x86/kernel/step.c @@ -18,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re return addr; } +#ifdef CONFIG_MODIFY_LDT_SYSCALL /* * We'll assume that the code segments in the GDT * are all zero-based. That is largely true: the @@ -45,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re } mutex_unlock(&child->mm->context.lock); } +#endif return addr; } diff --git a/kernel/arch/x86/kernel/topology.c b/kernel/arch/x86/kernel/topology.c index 649b010da..12cbe2b88 100644 --- a/kernel/arch/x86/kernel/topology.c +++ b/kernel/arch/x86/kernel/topology.c @@ -57,7 +57,7 @@ __setup("cpu0_hotplug", enable_cpu0_hotplug); * * This is only called for debugging CPU offline/online feature. 
*/ -int __ref _debug_hotplug_cpu(int cpu, int action) +int _debug_hotplug_cpu(int cpu, int action) { struct device *dev = get_cpu_device(cpu); int ret; @@ -104,7 +104,7 @@ static int __init debug_hotplug_cpu(void) late_initcall_sync(debug_hotplug_cpu); #endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */ -int __ref arch_register_cpu(int num) +int arch_register_cpu(int num) { struct cpuinfo_x86 *c = &cpu_data(num); diff --git a/kernel/arch/x86/kernel/trace_clock.c b/kernel/arch/x86/kernel/trace_clock.c index 25b993729..80bb24d9b 100644 --- a/kernel/arch/x86/kernel/trace_clock.c +++ b/kernel/arch/x86/kernel/trace_clock.c @@ -12,10 +12,5 @@ */ u64 notrace trace_clock_x86_tsc(void) { - u64 ret; - - rdtsc_barrier(); - rdtscll(ret); - - return ret; + return rdtsc_ordered(); } diff --git a/kernel/arch/x86/kernel/traps.c b/kernel/arch/x86/kernel/traps.c index ebae11893..ade185a46 100644 --- a/kernel/arch/x86/kernel/traps.c +++ b/kernel/arch/x86/kernel/traps.c @@ -54,13 +54,15 @@ #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> #include <asm/alternative.h> +#include <asm/fpu/xstate.h> +#include <asm/trace/mpx.h> #include <asm/mpx.h> +#include <asm/vm86.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -72,8 +74,7 @@ gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include <asm/processor-flags.h> #include <asm/setup.h> - -asmlinkage int system_call(void); +#include <asm/proto.h> #endif /* Must be page-aligned because the real IDT is used in a fixmap. */ @@ -88,21 +89,9 @@ static inline void conditional_sti(struct pt_regs *regs) local_irq_enable(); } -static inline void conditional_sti_ist(struct pt_regs *regs) +static inline void preempt_conditional_sti(struct pt_regs *regs) { -#ifdef CONFIG_X86_64 - /* - * X86_64 uses a per CPU stack on the IST for certain traps - * like int3. The task can not be preempted when using one - * of these stacks, thus preemption must be disabled, otherwise - * the stack can be corrupted if the task is scheduled out, - * and another task comes in and uses this stack. - * - * On x86_32 the task keeps its own stack and it is OK if the - * task schedules out. - */ preempt_count_inc(); -#endif if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } @@ -113,22 +102,17 @@ static inline void conditional_cli(struct pt_regs *regs) local_irq_disable(); } -static inline void conditional_cli_ist(struct pt_regs *regs) +static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); -#ifdef CONFIG_X86_64 preempt_count_dec(); -#endif } -enum ctx_state ist_enter(struct pt_regs *regs) +void ist_enter(struct pt_regs *regs) { - enum ctx_state prev_state; - if (user_mode(regs)) { - /* Other than that, we're just an exception. */ - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); } else { /* * We might have interrupted pretty much anything. In @@ -137,32 +121,25 @@ enum ctx_state ist_enter(struct pt_regs *regs) * but we need to notify RCU. */ rcu_nmi_enter(); - prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */ } /* - * We are atomic because we're on the IST stack (or we're on x86_32, - * in which case we still shouldn't schedule). - * - * This must be after exception_enter(), because exception_enter() - * won't do anything if in_interrupt() returns true. 
+ * We are atomic because we're on the IST stack; or we're on + * x86_32, in which case we still shouldn't schedule; or we're + * on x86_64 and entered from user mode, in which case we're + * still atomic unless ist_begin_non_atomic is called. */ preempt_count_add(HARDIRQ_OFFSET); /* This code is a bit fragile. Test it. */ - rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work"); - - return prev_state; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); } -void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) +void ist_exit(struct pt_regs *regs) { - /* Must be before exception_exit. */ preempt_count_sub(HARDIRQ_OFFSET); - if (user_mode(regs)) - return exception_exit(prev_state); - else + if (!user_mode(regs)) rcu_nmi_exit(); } @@ -176,7 +153,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) * a double fault, it can be safe to schedule. ist_begin_non_atomic() * begins a non-atomic section within an ist_enter()/ist_exit() region. * Callers are responsible for enabling interrupts themselves inside - * the non-atomic section, and callers must call is_end_non_atomic() + * the non-atomic section, and callers must call ist_end_non_atomic() * before ist_exit(). */ void ist_begin_non_atomic(struct pt_regs *regs) @@ -303,17 +280,16 @@ NOKPROBE_SYMBOL(do_trap); static void do_error_trap(struct pt_regs *regs, long error_code, char *str, unsigned long trapnr, int signr) { - enum ctx_state prev_state = exception_enter(); siginfo_t info; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { conditional_sti(regs); do_trap(trapnr, signr, str, regs, error_code, fill_trap_info(regs, signr, trapnr, &info)); } - - exception_exit(prev_state); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -365,7 +341,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) } #endif - ist_enter(regs); /* Discard prev_state because we won't return. */ + ist_enter(regs); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -385,16 +361,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { - struct task_struct *tsk = current; - struct xsave_struct *xsave_buf; - enum ctx_state prev_state; - struct bndcsr *bndcsr; + const struct mpx_bndcsr *bndcsr; siginfo_t *info; - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) - goto exit; + return; conditional_sti(regs); if (!user_mode(regs)) @@ -407,15 +380,15 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) /* * We need to look at BNDSTATUS to resolve this exception. - * It is not directly accessible, though, so we need to - * do an xsave and then pull it out of the xsave buffer. + * A NULL here might mean that it is in its 'init state', + * which is all zeros which indicates MPX was not + * responsible for the exception. 
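do_bounds() above now fetches the BNDCSR area through get_xsave_field_ptr() and switches on the low bits of bndstatus. A small decoding sketch, assuming MPX_BNDSTA_ERROR_CODE selects the low two status bits (consistent with the 0/1/2 cases handled here); the sample values are invented:

#include <stdint.h>
#include <stdio.h>

#define DEMO_BNDSTA_ERROR_CODE 0x3ULL	/* assumption: low two bits carry the code */

static const char *bndstatus_reason(uint64_t bndstatus)
{
	switch (bndstatus & DEMO_BNDSTA_ERROR_CODE) {
	case 0:	return "no MPX exception (init state)";
	case 1:	return "bound violation";
	case 2:	return "bound directory has invalid entry";
	default: return "unexpected status";
	}
}

int main(void)
{
	uint64_t samples[] = { 0x0, 0x1, 0x2, 0x1003 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("bndstatus=%#llx -> %s\n",
		       (unsigned long long)samples[i], bndstatus_reason(samples[i]));
	return 0;
}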
*/ - fpu_save_init(&tsk->thread.fpu); - xsave_buf = &(tsk->thread.fpu.state->xsave); - bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); if (!bndcsr) goto exit_trap; + trace_bounds_exception_mpx(bndcsr); /* * The error code field of the BNDSTATUS register communicates status * information of a bound range exception #BR or operation involving @@ -423,11 +396,11 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) */ switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) { case 2: /* Bound directory has invalid entry. */ - if (mpx_handle_bd_fault(xsave_buf)) + if (mpx_handle_bd_fault()) goto exit_trap; break; /* Success, it was handled */ case 1: /* Bound violation. */ - info = mpx_generate_siginfo(regs, xsave_buf); + info = mpx_generate_siginfo(regs); if (IS_ERR(info)) { /* * We failed to decode the MPX instruction. Act as if @@ -451,9 +424,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) die("bounds", regs, error_code); } -exit: - exception_exit(prev_state); return; + exit_trap: /* * This path out is for all the cases where we could not @@ -463,35 +435,33 @@ exit_trap: * time.. */ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); - exception_exit(prev_state); } dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; - enum ctx_state prev_state; - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); conditional_sti(regs); if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - goto exit; + return; } tsk = current; if (!user_mode(regs)) { if (fixup_exception(regs)) - goto exit; + return; tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) die("general protection fault", regs, error_code); - goto exit; + return; } tsk->thread.error_code = error_code; @@ -507,16 +477,12 @@ do_general_protection(struct pt_regs *regs, long error_code) } force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); -exit: - exception_exit(prev_state); } NOKPROBE_SYMBOL(do_general_protection); /* May run on IST stack. */ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; - #ifdef CONFIG_DYNAMIC_FTRACE /* * ftrace must be first, everything else may cause a recursive crash. @@ -529,7 +495,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) if (poke_int3_handler(regs)) return; - prev_state = ist_enter(regs); + ist_enter(regs); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) @@ -550,12 +517,12 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * as we may switch to the interrupt stack. 
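do_debug() above keeps its shape: read DR6, then decide whether a breakpoint hit or a single-step condition justifies a SIGTRAP. A user-space decoder for that test, using the architectural DR6 layout (B0-B3 in bits 0-3, BS in bit 14) rather than the kernel's DR_* constants; the sample values are invented:

#include <stdio.h>

#define DEMO_DR_TRAP_BITS 0x000fUL	/* B0..B3: which hardware breakpoint fired */
#define DEMO_DR_STEP      0x4000UL	/* BS: single-step trap */

static void decode_dr6(unsigned long dr6)
{
	printf("dr6=%#06lx:", dr6);
	for (int i = 0; i < 4; i++)
		if (dr6 & (1UL << i))
			printf(" breakpoint%d", i);
	if (dr6 & DEMO_DR_STEP)
		printf(" single-step");
	if (!(dr6 & (DEMO_DR_STEP | DEMO_DR_TRAP_BITS)))
		printf(" nothing interesting -> no SIGTRAP");
	printf("\n");
}

int main(void)
{
	unsigned long samples[] = { 0x0, 0x1, 0x4000, 0x4005 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		decode_dr6(samples[i]);
	return 0;
}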
*/ debug_stack_usage_inc(); - conditional_sti_ist(regs); + preempt_conditional_sti(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); - conditional_cli_ist(regs); + preempt_conditional_cli(regs); debug_stack_usage_dec(); exit: - ist_exit(regs, prev_state); + ist_exit(regs); } NOKPROBE_SYMBOL(do_int3); @@ -631,12 +598,11 @@ NOKPROBE_SYMBOL(fixup_bad_iret); dotraplinkage void do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - enum ctx_state prev_state; int user_icebp = 0; unsigned long dr6; int si_code; - prev_state = ist_enter(regs); + ist_enter(regs); get_debugreg(dr6, 6); @@ -682,12 +648,12 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - conditional_sti_ist(regs); + preempt_conditional_sti(regs); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); - conditional_cli_ist(regs); + preempt_conditional_cli(regs); debug_stack_usage_dec(); goto exit; } @@ -707,11 +673,11 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); - conditional_cli_ist(regs); + preempt_conditional_cli(regs); debug_stack_usage_dec(); exit: - ist_exit(regs, prev_state); + ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); @@ -723,8 +689,8 @@ NOKPROBE_SYMBOL(do_debug); static void math_error(struct pt_regs *regs, int error_code, int trapnr) { struct task_struct *task = current; + struct fpu *fpu = &task->thread.fpu; siginfo_t info; - unsigned short err; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; @@ -732,8 +698,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) return; conditional_sti(regs); - if (!user_mode(regs)) - { + if (!user_mode(regs)) { if (!fixup_exception(regs)) { task->thread.error_code = error_code; task->thread.trap_nr = trapnr; @@ -745,150 +710,46 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) /* * Save the info for the exception handler and clear the error. */ - unlazy_fpu(task); - task->thread.trap_nr = trapnr; + fpu__save(fpu); + + task->thread.trap_nr = trapnr; task->thread.error_code = error_code; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_addr = (void __user *)uprobe_get_trap_addr(regs); - if (trapnr == X86_TRAP_MF) { - unsigned short cwd, swd; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_addr = (void __user *)uprobe_get_trap_addr(regs); - err = swd & ~cwd; - } else { - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. 
- */ - unsigned short mxcsr = get_fpu_mxcsr(task); - err = ~(mxcsr >> 7) & mxcsr; - } + info.si_code = fpu__exception_code(fpu, trapnr); - if (err & 0x001) { /* Invalid op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - } else if (err & 0x004) { /* Divide by Zero */ - info.si_code = FPE_FLTDIV; - } else if (err & 0x008) { /* Overflow */ - info.si_code = FPE_FLTOVF; - } else if (err & 0x012) { /* Denormal, Underflow */ - info.si_code = FPE_FLTUND; - } else if (err & 0x020) { /* Precision */ - info.si_code = FPE_FLTRES; - } else { - /* - * If we're using IRQ 13, or supposedly even some trap - * X86_TRAP_MF implementations, it's possible - * we get a spurious trap, which is not an error. - */ + /* Retry when we get spurious exceptions: */ + if (!info.si_code) return; - } + force_sig_info(SIGFPE, &info, task); } dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; - - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); math_error(regs, error_code, X86_TRAP_MF); - exception_exit(prev_state); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; - - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); math_error(regs, error_code, X86_TRAP_XF); - exception_exit(prev_state); } dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { conditional_sti(regs); -#if 0 - /* No need to warn about this any longer. */ - pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) -{ -} - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - * - * Must be called with kernel preemption disabled (eg with local - * local interrupts as in the case of do_device_not_available). - */ -void math_state_restore(void) -{ - struct task_struct *tsk = current; - - if (!tsk_used_math(tsk)) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (init_fpu(tsk)) { - /* - * ran out of memory! 
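The block removed above open-coded the mapping from unmasked x87/SSE exception bits to an FPE_* si_code; the replacement delegates that to fpu__exception_code(). A user-space restatement of the removed decoding, kept purely as an illustration of the bit logic spelled out in the old comments:

#include <signal.h>
#include <stdio.h>

/* Map "unmasked and pending" exception bits to a signal code, in the same
 * priority order the removed code used. */
static int fpe_code(unsigned short err)
{
	if (err & 0x001) return FPE_FLTINV;	/* invalid operation (incl. stack faults) */
	if (err & 0x004) return FPE_FLTDIV;	/* divide by zero */
	if (err & 0x008) return FPE_FLTOVF;	/* overflow */
	if (err & 0x012) return FPE_FLTUND;	/* denormal or underflow */
	if (err & 0x020) return FPE_FLTRES;	/* precision (inexact) */
	return 0;				/* spurious: the caller just retries */
}

static int x87_si_code(unsigned short cwd, unsigned short swd)
{
	return fpe_code(swd & ~cwd);		/* (~cwd & swd): unmasked x87 exceptions */
}

static int sse_si_code(unsigned int mxcsr)
{
	return fpe_code(~(mxcsr >> 7) & mxcsr);	/* mask bits at 0x1f80 over status at 0x3f */
}

int main(void)
{
	/* x87: ZM (bit 2) unmasked in the control word, ZE pending in the status word */
	printf("x87 divide-by-zero -> si_code %d (FPE_FLTDIV is %d)\n",
	       x87_si_code(0x037b, 0x0004), FPE_FLTDIV);
	/* SSE: OM (bit 10) unmasked in MXCSR, OE (bit 3) pending */
	printf("SSE overflow       -> si_code %d (FPE_FLTOVF is %d)\n",
	       sse_si_code(0x1b88), FPE_FLTOVF);
	return 0;
}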
- */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } - - /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */ - kernel_fpu_disable(); - __thread_fpu_begin(tsk); - if (unlikely(restore_fpu_checking(tsk))) { - fpu_reset_state(tsk); - force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); - } else { - tsk->thread.fpu_counter++; - } - kernel_fpu_enable(); } -EXPORT_SYMBOL_GPL(math_state_restore); dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; - - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION @@ -899,15 +760,13 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); - exception_exit(prev_state); return; } #endif - math_state_restore(); /* interrupts still off */ + fpu__restore(¤t->thread.fpu); /* interrupts still off */ #ifdef CONFIG_X86_32 conditional_sti(regs); #endif - exception_exit(prev_state); } NOKPROBE_SYMBOL(do_device_not_available); @@ -915,9 +774,8 @@ NOKPROBE_SYMBOL(do_device_not_available); dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; - enum ctx_state prev_state; - prev_state = exception_enter(); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); local_irq_enable(); info.si_signo = SIGILL; @@ -929,7 +787,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, &info); } - exception_exit(prev_state); } #endif @@ -1006,13 +863,13 @@ void __init trap_init(void) set_bit(i, used_vectors); #ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(SYSCALL_VECTOR, &system_call); - set_bit(SYSCALL_VECTOR, used_vectors); + set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif /* diff --git a/kernel/arch/x86/kernel/tsc.c b/kernel/arch/x86/kernel/tsc.c index 505449700..c7c4d9c51 100644 --- a/kernel/arch/x86/kernel/tsc.c +++ b/kernel/arch/x86/kernel/tsc.c @@ -21,6 +21,7 @@ #include <asm/hypervisor.h> #include <asm/nmi.h> #include <asm/x86_init.h> +#include <asm/geode.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -38,7 +39,7 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; -static struct static_key __use_tsc = STATIC_KEY_INIT; +static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; @@ -167,21 +168,20 @@ static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) * ns = cycles * cyc2ns_scale / SC * * And since SC is a constant power of two, we can convert the div - * into a shift. + * into a shift. The larger SC is, the more accurate the conversion, but + * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication + * (64-bit result) can be used. * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * We can use khz divisor instead of mhz to keep a better precision. * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
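The cyc2ns comment above is the whole trick: ns = cycles * (10^6 / cpu_khz), with the division replaced by a 32-bit multiplier and a right shift so a single 64x32 multiply does the job. A self-contained sketch with a hand-picked shift (the patched code lets clocks_calc_mult_shift() choose it); the 2.9 GHz figure is invented:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* One 64x32 multiply plus a shift, like mul_u64_u32_shr(); relies on the
 * compiler's 128-bit type on a 64-bit gcc/clang target. */
static uint64_t mul_u64_u32_shr_demo(uint64_t cyc, uint32_t mult, unsigned int shift)
{
	return (uint64_t)(((__uint128_t)cyc * mult) >> shift);
}

int main(void)
{
	uint64_t cpu_khz = 2900000;	/* pretend calibration found a 2.9 GHz TSC */
	unsigned int shift = 22;	/* larger shift gives more precision, but mult must stay below 2^32 */
	uint32_t mult = (uint32_t)((((uint64_t)1000000 << shift) + cpu_khz / 2) / cpu_khz);
	uint64_t one_second_of_cycles = cpu_khz * 1000;

	printf("mult=%" PRIu32 " shift=%u\n", mult, shift);
	printf("one second of cycles -> %" PRIu64 " ns (expect ~1000000000)\n",
	       mul_u64_u32_shr_demo(one_second_of_cycles, mult, shift));
	return 0;
}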
*/ -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - static void cyc2ns_data_init(struct cyc2ns_data *data) { data->cyc2ns_mul = 0; - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_shift = 0; data->cyc2ns_offset = 0; data->__count = 0; } @@ -215,14 +215,14 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) if (likely(data == tail)) { ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); } else { data->__count++; barrier(); ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); barrier(); @@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) data = cyc2ns_write_begin(cpu); - rdtscll(tsc_now); + tsc_now = rdtsc(); ns_now = cycles_2_ns(tsc_now); /* @@ -256,12 +256,22 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - data->cyc2ns_mul = - DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, - cpu_khz); - data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz, + NSEC_PER_MSEC, 0); + + /* + * cyc2ns_shift is exported via arch_perf_update_userpage() where it is + * not expected to be greater than 31 due to the original published + * conversion algorithm shifting a 32-bit value (now specifies a 64-bit + * value) - refer perf_event_mmap_page documentation in perf_event.h. + */ + if (data->cyc2ns_shift == 32) { + data->cyc2ns_shift = 31; + data->cyc2ns_mul >>= 1; + } + data->cyc2ns_offset = ns_now - - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift); cyc2ns_write_end(cpu, data); @@ -274,7 +284,12 @@ done: */ u64 native_sched_clock(void) { - u64 tsc_now; + if (static_branch_likely(&__use_tsc)) { + u64 tsc_now = rdtsc(); + + /* return the value in ns */ + return cycles_2_ns(tsc_now); + } /* * Fall back to jiffies if there's no TSC available: @@ -284,16 +299,17 @@ u64 native_sched_clock(void) * very important for it to be as fast as the platform * can achieve it. ) */ - if (!static_key_false(&__use_tsc)) { - /* No locking but a rare wrong value is not a big deal: */ - return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); - } - /* read the Time Stamp Counter: */ - rdtscll(tsc_now); + /* No locking but a rare wrong value is not a big deal: */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); +} - /* return the value in ns */ - return cycles_2_ns(tsc_now); +/* + * Generate a sched_clock if you already have a TSC value. + */ +u64 native_sched_clock_from_tsc(u64 tsc) +{ + return cycles_2_ns(tsc); } /* We need to define a real function for sched_clock, to override the @@ -308,12 +324,6 @@ unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); #endif -unsigned long long native_read_tsc(void) -{ - return __native_read_tsc(); -} -EXPORT_SYMBOL(native_read_tsc); - int check_tsc_unstable(void) { return tsc_unstable; @@ -598,10 +608,19 @@ static unsigned long quick_pit_calibrate(void) if (!pit_expect_msb(0xff-i, &delta, &d2)) break; + delta -= tsc; + + /* + * Extrapolate the error and fail fast if the error will + * never be below 500 ppm. 
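quick_pit_calibrate() above treats "error < delta >> 11" as "better than 500 ppm" (1/2048 is roughly 488 ppm), and the new early exit extrapolates that bound across the remaining iterations to bail out of a hopeless calibration quickly. A tiny numeric check of the shortcut, with made-up delta/d1/d2 values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t delta = 3000000;	/* TSC cycles elapsed so far (invented) */
	uint64_t d1 = 700, d2 = 600;	/* read uncertainty on the two PIT reads */
	uint64_t err = d1 + d2;

	printf("error = %llu cycles (%.1f ppm of delta)\n",
	       (unsigned long long)err, 1e6 * err / delta);
	printf("bound = delta >> 11 = %llu cycles (~488 ppm)\n",
	       (unsigned long long)(delta >> 11));
	printf("%s\n", err < (delta >> 11) ? "good enough: use this calibration"
					   : "too noisy: keep iterating");
	return 0;
}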
+ */ + if (i == 1 && + d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11) + return 0; + /* * Iterate until the error is less than 500 ppm */ - delta -= tsc; if (d1+d2 >= delta >> 11) continue; @@ -967,7 +986,7 @@ static struct clocksource clocksource_tsc; */ static cycle_t read_tsc(struct clocksource *cs) { - return (cycle_t)get_cycles(); + return (cycle_t)rdtsc_ordered(); } /* @@ -1004,15 +1023,17 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable); static void __init check_system_tsc_reliable(void) { -#ifdef CONFIG_MGEODE_LX - /* RTSC counts during suspend */ +#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) + if (is_geode_lx()) { + /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 - unsigned long res_low, res_high; + unsigned long res_low, res_high; - rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); - /* Geode_LX - the OLPC CPU has a very reliable TSC */ - if (res_low & RTSC_SUSP) - tsc_clocksource_reliable = 1; + rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + /* Geode_LX - the OLPC CPU has a very reliable TSC */ + if (res_low & RTSC_SUSP) + tsc_clocksource_reliable = 1; + } #endif if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) tsc_clocksource_reliable = 1; @@ -1201,7 +1222,7 @@ void __init tsc_init(void) /* now allow native_sched_clock() to use rdtsc */ tsc_disabled = 0; - static_key_slow_inc(&__use_tsc); + static_branch_enable(&__use_tsc); if (!no_sched_irq_time) enable_sched_clock_irqtime(); diff --git a/kernel/arch/x86/kernel/tsc_sync.c b/kernel/arch/x86/kernel/tsc_sync.c index 26488487b..78083bf23 100644 --- a/kernel/arch/x86/kernel/tsc_sync.c +++ b/kernel/arch/x86/kernel/tsc_sync.c @@ -39,16 +39,15 @@ static cycles_t max_warp; static int nr_warps; /* - * TSC-warp measurement loop running on both CPUs: + * TSC-warp measurement loop running on both CPUs. This is not called + * if there is no TSC. */ static void check_tsc_warp(unsigned int timeout) { cycles_t start, now, prev, end; int i; - rdtsc_barrier(); - start = get_cycles(); - rdtsc_barrier(); + start = rdtsc_ordered(); /* * The measurement runs for 'timeout' msecs: */ @@ -63,9 +62,7 @@ static void check_tsc_warp(unsigned int timeout) */ arch_spin_lock(&sync_lock); prev = last_tsc; - rdtsc_barrier(); - now = get_cycles(); - rdtsc_barrier(); + now = rdtsc_ordered(); last_tsc = now; arch_spin_unlock(&sync_lock); @@ -113,7 +110,7 @@ static void check_tsc_warp(unsigned int timeout) */ static inline unsigned int loop_timeout(int cpu) { - return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20; + return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20; } /* @@ -126,7 +123,7 @@ void check_tsc_sync_source(int cpu) /* * No need to check if we already know that the TSC is not - * synchronized: + * synchronized or if we have no TSC. */ if (unsynchronized_tsc()) return; @@ -190,6 +187,7 @@ void check_tsc_sync_target(void) { int cpus = 2; + /* Also aborts if there is no TSC. */ if (unsynchronized_tsc() || tsc_clocksource_reliable) return; diff --git a/kernel/arch/x86/kernel/uprobes.c b/kernel/arch/x86/kernel/uprobes.c index 0b81ad67d..bf4db6eae 100644 --- a/kernel/arch/x86/kernel/uprobes.c +++ b/kernel/arch/x86/kernel/uprobes.c @@ -29,6 +29,7 @@ #include <linux/kdebug.h> #include <asm/processor.h> #include <asm/insn.h> +#include <asm/mmu_context.h> /* Post-execution fixups. 
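check_tsc_warp() above has both CPUs take turns publishing their latest ordered TSC read under a spinlock and counting a "warp" whenever the previously published value is ahead of the current one. A user-space model of that handshake, with threads instead of CPUs, a mutex instead of the raw spinlock, and CLOCK_MONOTONIC instead of the TSC; on a healthy system it should report zero warps:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t last_stamp;
static long warps;

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void *warp_check(void *arg)
{
	(void)arg;
	for (int i = 0; i < 200000; i++) {
		pthread_mutex_lock(&lock);
		uint64_t prev = last_stamp;
		uint64_t now = now_ns();

		last_stamp = now;
		if (prev > now)		/* time looked like it went backwards */
			warps++;
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, warp_check, NULL);
	pthread_create(&b, NULL, warp_check, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("observed %ld warps\n", warps);
	return 0;
}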
*/ @@ -312,11 +313,6 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool } #ifdef CONFIG_X86_64 -static inline bool is_64bit_mm(struct mm_struct *mm) -{ - return !config_enabled(CONFIG_IA32_EMULATION) || - !(mm->context.ia32_compat == TIF_IA32); -} /* * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses @@ -497,10 +493,6 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) } } #else /* 32-bit: */ -static inline bool is_64bit_mm(struct mm_struct *mm) -{ - return false; -} /* * No RIP-relative addressing on 32-bit */ @@ -993,3 +985,12 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return -1; } + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */ + return regs->sp < ret->stack; + else + return regs->sp <= ret->stack; +} diff --git a/kernel/arch/x86/kernel/verify_cpu.S b/kernel/arch/x86/kernel/verify_cpu.S index b9242bacb..4cf401f58 100644 --- a/kernel/arch/x86/kernel/verify_cpu.S +++ b/kernel/arch/x86/kernel/verify_cpu.S @@ -34,10 +34,11 @@ #include <asm/msr-index.h> verify_cpu: - pushfl # Save caller passed flags - pushl $0 # Kill any dangerous flags - popfl + pushf # Save caller passed flags + push $0 # Kill any dangerous flags + popf +#ifndef __x86_64__ pushfl # standard way to check for cpuid popl %eax movl %eax,%ebx @@ -48,6 +49,7 @@ verify_cpu: popl %eax cmpl %eax,%ebx jz verify_cpu_no_longmode # cpu has no cpuid +#endif movl $0x0,%eax # See if cpuid 1 is implemented cpuid @@ -130,10 +132,10 @@ verify_cpu_sse_test: jmp verify_cpu_sse_test # try again verify_cpu_no_longmode: - popfl # Restore caller passed flags + popf # Restore caller passed flags movl $1,%eax ret verify_cpu_sse_ok: - popfl # Restore caller passed flags + popf # Restore caller passed flags xorl %eax, %eax ret diff --git a/kernel/arch/x86/kernel/vm86_32.c b/kernel/arch/x86/kernel/vm86_32.c index fc9db6ef2..524619351 100644 --- a/kernel/arch/x86/kernel/vm86_32.c +++ b/kernel/arch/x86/kernel/vm86_32.c @@ -44,11 +44,15 @@ #include <linux/ptrace.h> #include <linux/audit.h> #include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/security.h> #include <asm/uaccess.h> #include <asm/io.h> #include <asm/tlbflush.h> #include <asm/irq.h> +#include <asm/traps.h> +#include <asm/vm86.h> /* * Known problems: @@ -66,10 +70,6 @@ */ -#define KVM86 ((struct kernel_vm86_struct *)regs) -#define VMPI KVM86->vm86plus - - /* * 8- and 16-bit register defines.. */ @@ -81,8 +81,8 @@ /* * virtual flags (16 and 32-bit versions) */ -#define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) -#define VEFLAGS (current->thread.v86flags) +#define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags)) +#define VEFLAGS (current->thread.vm86->veflags) #define set_flags(X, new, mask) \ ((X) = ((X) & ~(mask)) | ((new) & (mask))) @@ -90,46 +90,13 @@ #define SAFE_MASK (0xDD5) #define RETURN_MASK (0xDFF) -/* convert kernel_vm86_regs to vm86_regs */ -static int copy_vm86_regs_to_user(struct vm86_regs __user *user, - const struct kernel_vm86_regs *regs) -{ - int ret = 0; - - /* - * kernel_vm86_regs is missing gs, so copy everything up to - * (but not including) orig_eax, and then rest including orig_eax. 
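The vm86 code above leans on set_flags(), a read-modify-write under a mask: keep X's bits outside the mask, take the new value's bits inside it, with SAFE_MASK deciding which guest-visible EFLAGS bits may pass through. The same merge in isolation (the sample flag words are arbitrary):

#include <stdio.h>

#define DEMO_SAFE_MASK 0x0DD5u	/* same value as SAFE_MASK in this file */

static unsigned int set_flags(unsigned int x, unsigned int new_bits, unsigned int mask)
{
	return (x & ~mask) | (new_bits & mask);
}

int main(void)
{
	unsigned int kernel_side = 0x00000202;	/* bits the kernel insists on keeping */
	unsigned int guest_side  = 0x00000cd5;	/* bits the vm86 guest would like to set */
	unsigned int merged = set_flags(kernel_side, guest_side, DEMO_SAFE_MASK);

	printf("kernel=%#x guest=%#x mask=%#x -> merged=%#x\n",
	       kernel_side, guest_side, DEMO_SAFE_MASK, merged);
	return 0;
}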
- */ - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax)); - ret += copy_to_user(&user->orig_eax, ®s->pt.orig_ax, - sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_ax)); - - return ret; -} - -/* convert vm86_regs to kernel_vm86_regs */ -static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, - const struct vm86_regs __user *user, - unsigned extra) -{ - int ret = 0; - - /* copy ax-fs inclusive */ - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax)); - /* copy orig_ax-__gsh+extra */ - ret += copy_from_user(®s->pt.orig_ax, &user->orig_eax, - sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_ax) + - extra); - return ret; -} - -struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) +void save_v86_state(struct kernel_vm86_regs *regs, int retval) { struct tss_struct *tss; - struct pt_regs *ret; - unsigned long tmp; + struct task_struct *tsk = current; + struct vm86plus_struct __user *user; + struct vm86 *vm86 = current->thread.vm86; + long err = 0; /* * This gets called from entry.S with interrupts disabled, but @@ -138,31 +105,57 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) */ local_irq_enable(); - if (!current->thread.vm86_info) { - pr_alert("no vm86_info: BAD\n"); + if (!vm86 || !vm86->user_vm86) { + pr_alert("no user_vm86: BAD\n"); + do_exit(SIGSEGV); + } + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); + user = vm86->user_vm86; + + if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ? + sizeof(struct vm86plus_struct) : + sizeof(struct vm86_struct))) { + pr_alert("could not access userspace vm86 info\n"); do_exit(SIGSEGV); } - set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); - tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs, regs); - tmp += put_user(current->thread.screen_bitmap, ¤t->thread.vm86_info->screen_bitmap); - if (tmp) { - pr_alert("could not access userspace vm86_info\n"); + + put_user_try { + put_user_ex(regs->pt.bx, &user->regs.ebx); + put_user_ex(regs->pt.cx, &user->regs.ecx); + put_user_ex(regs->pt.dx, &user->regs.edx); + put_user_ex(regs->pt.si, &user->regs.esi); + put_user_ex(regs->pt.di, &user->regs.edi); + put_user_ex(regs->pt.bp, &user->regs.ebp); + put_user_ex(regs->pt.ax, &user->regs.eax); + put_user_ex(regs->pt.ip, &user->regs.eip); + put_user_ex(regs->pt.cs, &user->regs.cs); + put_user_ex(regs->pt.flags, &user->regs.eflags); + put_user_ex(regs->pt.sp, &user->regs.esp); + put_user_ex(regs->pt.ss, &user->regs.ss); + put_user_ex(regs->es, &user->regs.es); + put_user_ex(regs->ds, &user->regs.ds); + put_user_ex(regs->fs, &user->regs.fs); + put_user_ex(regs->gs, &user->regs.gs); + + put_user_ex(vm86->screen_bitmap, &user->screen_bitmap); + } put_user_catch(err); + if (err) { + pr_alert("could not access userspace vm86 info\n"); do_exit(SIGSEGV); } tss = &per_cpu(cpu_tss, get_cpu()); - current->thread.sp0 = current->thread.saved_sp0; - current->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tss, ¤t->thread); - current->thread.saved_sp0 = 0; + tsk->thread.sp0 = vm86->saved_sp0; + tsk->thread.sysenter_cs = __KERNEL_CS; + load_sp0(tss, &tsk->thread); + vm86->saved_sp0 = 0; put_cpu(); - ret = KVM86->regs32; + memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); - ret->fs = current->thread.saved_fs; - set_user_gs(ret, current->thread.saved_gs); + lazy_load_gs(vm86->regs32.gs); - return ret; + regs->pt.ax = retval; } static void mark_screen_rdonly(struct 
mm_struct *mm) @@ -200,45 +193,16 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); -static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); +static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); -SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86) +SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86) { - struct kernel_vm86_struct info; /* declare this _on top_, - * this avoids wasting of stack space. - * This remains on the stack until we - * return to 32 bit user space. - */ - struct task_struct *tsk = current; - int tmp; - - if (tsk->thread.saved_sp0) - return -EPERM; - tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, - offsetof(struct kernel_vm86_struct, vm86plus) - - sizeof(info.regs)); - if (tmp) - return -EFAULT; - memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); - info.regs32 = current_pt_regs(); - tsk->thread.vm86_info = v86; - do_sys_vm86(&info, tsk); - return 0; /* we never return here */ + return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false); } SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) { - struct kernel_vm86_struct info; /* declare this _on top_, - * this avoids wasting of stack space. - * This remains on the stack until we - * return to 32 bit user space. - */ - struct task_struct *tsk; - int tmp; - struct vm86plus_struct __user *v86; - - tsk = current; switch (cmd) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: @@ -256,114 +220,159 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) } /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ - if (tsk->thread.saved_sp0) - return -EPERM; - v86 = (struct vm86plus_struct __user *)arg; - tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, - offsetof(struct kernel_vm86_struct, regs32) - - sizeof(info.regs)); - if (tmp) - return -EFAULT; - info.regs32 = current_pt_regs(); - info.vm86plus.is_vm86pus = 1; - tsk->thread.vm86_info = (struct vm86_struct __user *)v86; - do_sys_vm86(&info, tsk); - return 0; /* we never return here */ + return do_sys_vm86((struct vm86plus_struct __user *) arg, true); } -static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) +static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) { struct tss_struct *tss; -/* - * make sure the vm86() system call doesn't try to do anything silly - */ - info->regs.pt.ds = 0; - info->regs.pt.es = 0; - info->regs.pt.fs = 0; -#ifndef CONFIG_X86_32_LAZY_GS - info->regs.pt.gs = 0; -#endif + struct task_struct *tsk = current; + struct vm86 *vm86 = tsk->thread.vm86; + struct kernel_vm86_regs vm86regs; + struct pt_regs *regs = current_pt_regs(); + unsigned long err = 0; + + err = security_mmap_addr(0); + if (err) { + /* + * vm86 cannot virtualize the address space, so vm86 users + * need to manage the low 1MB themselves using mmap. Given + * that BIOS places important data in the first page, vm86 + * is essentially useless if mmap_min_addr != 0. DOSEMU, + * for example, won't even bother trying to use vm86 if it + * can't map a page at virtual address 0. + * + * To reduce the available kernel attack surface, simply + * disallow vm86(old) for users who cannot mmap at va 0. 
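The new security_mmap_addr(0) check above ties vm86() to the ability to map page zero, which is exactly what a DOS emulator needs for the real-mode IVT and BIOS data area. An emulator can probe the same precondition from user space before calling vm86() at all; this sketch reads vm.mmap_min_addr and then simply tries the low mapping (the precise errno still depends on the active LSM):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	char buf[32] = "";
	int fd = open("/proc/sys/vm/mmap_min_addr", O_RDONLY);

	if (fd >= 0) {
		ssize_t n = read(fd, buf, sizeof(buf) - 1);

		if (n > 0)
			buf[n] = '\0';
		close(fd);
		printf("vm.mmap_min_addr = %s", buf);
	}

	void *p = mmap((void *)0, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
	if (p == MAP_FAILED) {
		perror("mapping page 0 (vm86 would be refused too)");
		return 1;
	}
	printf("page 0 mapped at %p: vm86-style use of low memory is possible\n", p);
	munmap(p, 4096);
	return 0;
}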
+ * + * The implementation of security_mmap_addr will allow + * suitably privileged users to map va 0 even if + * vm.mmap_min_addr is set above 0, and we want this + * behavior for vm86 as well, as it ensures that legacy + * tools like vbetool will not fail just because of + * vm.mmap_min_addr. + */ + pr_info_once("Denied a call to vm86(old) from %s[%d] (uid: %d). Set the vm.mmap_min_addr sysctl to 0 and/or adjust LSM mmap_min_addr policy to enable vm86 if you are using a vm86-based DOS emulator.\n", + current->comm, task_pid_nr(current), + from_kuid_munged(&init_user_ns, current_uid())); + return -EPERM; + } + + if (!vm86) { + if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL))) + return -ENOMEM; + tsk->thread.vm86 = vm86; + } + if (vm86->saved_sp0) + return -EPERM; + + if (!access_ok(VERIFY_READ, user_vm86, plus ? + sizeof(struct vm86_struct) : + sizeof(struct vm86plus_struct))) + return -EFAULT; + + memset(&vm86regs, 0, sizeof(vm86regs)); + get_user_try { + unsigned short seg; + get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx); + get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx); + get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx); + get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi); + get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi); + get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp); + get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax); + get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip); + get_user_ex(seg, &user_vm86->regs.cs); + vm86regs.pt.cs = seg; + get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags); + get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp); + get_user_ex(seg, &user_vm86->regs.ss); + vm86regs.pt.ss = seg; + get_user_ex(vm86regs.es, &user_vm86->regs.es); + get_user_ex(vm86regs.ds, &user_vm86->regs.ds); + get_user_ex(vm86regs.fs, &user_vm86->regs.fs); + get_user_ex(vm86regs.gs, &user_vm86->regs.gs); + + get_user_ex(vm86->flags, &user_vm86->flags); + get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap); + get_user_ex(vm86->cpu_type, &user_vm86->cpu_type); + } get_user_catch(err); + if (err) + return err; + + if (copy_from_user(&vm86->int_revectored, + &user_vm86->int_revectored, + sizeof(struct revectored_struct))) + return -EFAULT; + if (copy_from_user(&vm86->int21_revectored, + &user_vm86->int21_revectored, + sizeof(struct revectored_struct))) + return -EFAULT; + if (plus) { + if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus, + sizeof(struct vm86plus_info_struct))) + return -EFAULT; + vm86->vm86plus.is_vm86pus = 1; + } else + memset(&vm86->vm86plus, 0, + sizeof(struct vm86plus_info_struct)); + + memcpy(&vm86->regs32, regs, sizeof(struct pt_regs)); + vm86->user_vm86 = user_vm86; /* * The flags register is also special: we cannot trust that the user * has set it up safely, so this makes sure interrupt etc flags are * inherited from protected mode. 
*/ - VEFLAGS = info->regs.pt.flags; - info->regs.pt.flags &= SAFE_MASK; - info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; - info->regs.pt.flags |= X86_VM_MASK; + VEFLAGS = vm86regs.pt.flags; + vm86regs.pt.flags &= SAFE_MASK; + vm86regs.pt.flags |= regs->flags & ~SAFE_MASK; + vm86regs.pt.flags |= X86_VM_MASK; + + vm86regs.pt.orig_ax = regs->orig_ax; - switch (info->cpu_type) { + switch (vm86->cpu_type) { case CPU_286: - tsk->thread.v86mask = 0; + vm86->veflags_mask = 0; break; case CPU_386: - tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; case CPU_486: - tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; default: - tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; } /* - * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) + * Save old state */ - info->regs32->ax = VM86_SIGNAL; - tsk->thread.saved_sp0 = tsk->thread.sp0; - tsk->thread.saved_fs = info->regs32->fs; - tsk->thread.saved_gs = get_user_gs(info->regs32); + vm86->saved_sp0 = tsk->thread.sp0; + lazy_save_gs(vm86->regs32.gs); tss = &per_cpu(cpu_tss, get_cpu()); - tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; + /* make room for real-mode segments */ + tsk->thread.sp0 += 16; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); put_cpu(); - tsk->thread.screen_bitmap = info->screen_bitmap; - if (info->flags & VM86_SCREEN_BITMAP) + if (vm86->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call __audit_syscall_exit since we do not exit via the normal paths */ -#ifdef CONFIG_AUDITSYSCALL - if (unlikely(current->audit_context)) - __audit_syscall_exit(1, 0); -#endif - - __asm__ __volatile__( - "movl %0,%%esp\n\t" - "movl %1,%%ebp\n\t" -#ifdef CONFIG_X86_32_LAZY_GS - "mov %2, %%gs\n\t" -#endif - "jmp resume_userspace" - : /* no outputs */ - :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); - /* we never return here */ -} - -static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval) -{ - struct pt_regs *regs32; - - regs32 = save_v86_state(regs16); - regs32->ax = retval; - __asm__ __volatile__("movl %0,%%esp\n\t" - "movl %1,%%ebp\n\t" - "jmp resume_userspace" - : : "r" (regs32), "r" (current_thread_info())); + memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); + force_iret(); + return regs->ax; } static inline void set_IF(struct kernel_vm86_regs *regs) { VEFLAGS |= X86_EFLAGS_VIF; - if (VEFLAGS & X86_EFLAGS_VIP) - return_to_32bit(regs, VM86_STI); } static inline void clear_IF(struct kernel_vm86_regs *regs) @@ -395,7 +404,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs) static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs) { - set_flags(VEFLAGS, flags, current->thread.v86mask); + set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask); set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & X86_EFLAGS_IF) set_IF(regs); @@ -405,7 +414,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs) { - set_flags(VFLAGS, flags, current->thread.v86mask); + set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask); set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & 
X86_EFLAGS_IF) set_IF(regs); @@ -420,7 +429,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs) if (VEFLAGS & X86_EFLAGS_VIF) flags |= X86_EFLAGS_IF; flags |= X86_EFLAGS_IOPL; - return flags | (VEFLAGS & current->thread.v86mask); + return flags | (VEFLAGS & current->thread.vm86->veflags_mask); } static inline int is_revectored(int nr, struct revectored_struct *bitmap) @@ -518,12 +527,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i, { unsigned long __user *intr_ptr; unsigned long segoffs; + struct vm86 *vm86 = current->thread.vm86; if (regs->pt.cs == BIOSSEG) goto cannot_handle; - if (is_revectored(i, &KVM86->int_revectored)) + if (is_revectored(i, &vm86->int_revectored)) goto cannot_handle; - if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored)) + if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored)) goto cannot_handle; intr_ptr = (unsigned long __user *) (i << 2); if (get_user(segoffs, intr_ptr)) @@ -542,18 +552,16 @@ static void do_int(struct kernel_vm86_regs *regs, int i, return; cannot_handle: - return_to_32bit(regs, VM86_INTx + (i << 8)); + save_v86_state(regs, VM86_INTx + (i << 8)); } int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) { - if (VMPI.is_vm86pus) { + struct vm86 *vm86 = current->thread.vm86; + + if (vm86->vm86plus.is_vm86pus) { if ((trapno == 3) || (trapno == 1)) { - KVM86->regs32->ax = VM86_TRAP + (trapno << 8); - /* setting this flag forces the code in entry_32.S to - the path where we call save_v86_state() and change - the stack pointer to KVM86->regs32 */ - set_thread_flag(TIF_NOTIFY_RESUME); + save_v86_state(regs, VM86_TRAP + (trapno << 8)); return 0; } do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); @@ -574,16 +582,11 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) unsigned char __user *ssp; unsigned short ip, sp, orig_flags; int data32, pref_done; + struct vm86plus_info_struct *vmpi = ¤t->thread.vm86->vm86plus; #define CHECK_IF_IN_TRAP \ - if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \ + if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \ newflags |= X86_EFLAGS_TF -#define VM86_FAULT_RETURN do { \ - if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \ - return_to_32bit(regs, VM86_PICRETURN); \ - if (orig_flags & X86_EFLAGS_TF) \ - handle_vm86_trap(regs, 0, 1); \ - return; } while (0) orig_flags = *(unsigned short *)®s->pt.flags; @@ -622,7 +625,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) SP(regs) -= 2; } IP(regs) = ip; - VM86_FAULT_RETURN; + goto vm86_fault_return; /* popf */ case 0x9d: @@ -642,16 +645,18 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) else set_vflags_short(newflags, regs); - VM86_FAULT_RETURN; + goto check_vip; } /* int xx */ case 0xcd: { int intno = popb(csp, ip, simulate_sigsegv); IP(regs) = ip; - if (VMPI.vm86dbg_active) { - if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3]) - return_to_32bit(regs, VM86_INTx + (intno << 8)); + if (vmpi->vm86dbg_active) { + if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) { + save_v86_state(regs, VM86_INTx + (intno << 8)); + return; + } } do_int(regs, intno, ssp, sp); return; @@ -682,14 +687,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) } else { set_vflags_short(newflags, regs); } - VM86_FAULT_RETURN; + goto check_vip; } /* cli */ case 0xfa: IP(regs) = ip; clear_IF(regs); - VM86_FAULT_RETURN; + goto vm86_fault_return; /* 
sti */ /* @@ -701,14 +706,29 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) case 0xfb: IP(regs) = ip; set_IF(regs); - VM86_FAULT_RETURN; + goto check_vip; default: - return_to_32bit(regs, VM86_UNKNOWN); + save_v86_state(regs, VM86_UNKNOWN); } return; +check_vip: + if (VEFLAGS & X86_EFLAGS_VIP) { + save_v86_state(regs, VM86_STI); + return; + } + +vm86_fault_return: + if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) { + save_v86_state(regs, VM86_PICRETURN); + return; + } + if (orig_flags & X86_EFLAGS_TF) + handle_vm86_trap(regs, 0, X86_TRAP_DB); + return; + simulate_sigsegv: /* FIXME: After a long discussion with Stas we finally * agreed, that this is wrong. Here we should @@ -720,7 +740,7 @@ simulate_sigsegv: * should be a mixture of the two, but how do we * get the information? [KD] */ - return_to_32bit(regs, VM86_UNKNOWN); + save_v86_state(regs, VM86_UNKNOWN); } /* ---------------- vm86 special IRQ passing stuff ----------------- */ diff --git a/kernel/arch/x86/kernel/vmlinux.lds.S b/kernel/arch/x86/kernel/vmlinux.lds.S index 00bf300fd..74e4bf11f 100644 --- a/kernel/arch/x86/kernel/vmlinux.lds.S +++ b/kernel/arch/x86/kernel/vmlinux.lds.S @@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union); #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE #include <asm/kexec.h> . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, diff --git a/kernel/arch/x86/kernel/vsmp_64.c b/kernel/arch/x86/kernel/vsmp_64.c index ee22c1d93..b034b1b14 100644 --- a/kernel/arch/x86/kernel/vsmp_64.c +++ b/kernel/arch/x86/kernel/vsmp_64.c @@ -72,7 +72,7 @@ asmlinkage __visible void vsmp_irq_enable(void) } PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable); -static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, +static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { switch (type) { diff --git a/kernel/arch/x86/kernel/x8664_ksyms_64.c b/kernel/arch/x86/kernel/x8664_ksyms_64.c index 37d8fa443..a0695be19 100644 --- a/kernel/arch/x86/kernel/x8664_ksyms_64.c +++ b/kernel/arch/x86/kernel/x8664_ksyms_64.c @@ -75,7 +75,5 @@ EXPORT_SYMBOL(native_load_gs_index); #ifdef CONFIG_PREEMPT EXPORT_SYMBOL(___preempt_schedule); -#ifdef CONFIG_CONTEXT_TRACKING -EXPORT_SYMBOL(___preempt_schedule_context); -#endif +EXPORT_SYMBOL(___preempt_schedule_notrace); #endif diff --git a/kernel/arch/x86/kernel/x86_init.c b/kernel/arch/x86/kernel/x86_init.c index 234b0722d..3839628d9 100644 --- a/kernel/arch/x86/kernel/x86_init.c +++ b/kernel/arch/x86/kernel/x86_init.c @@ -11,7 +11,6 @@ #include <asm/bios_ebda.h> #include <asm/paravirt.h> #include <asm/pci_x86.h> -#include <asm/pci.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> @@ -111,11 +110,9 @@ EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) struct x86_msi_ops x86_msi = { .setup_msi_irqs = native_setup_msi_irqs, - .compose_msi_msg = native_compose_msi_msg, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, - .setup_hpet_msi = default_setup_hpet_msi, }; /* MSI arch specific hooks */ @@ -141,13 +138,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev) #endif struct x86_io_apic_ops x86_io_apic_ops = { - .init = native_io_apic_init_mappings, .read = native_io_apic_read, - .write = native_io_apic_write, - .modify = native_io_apic_modify, .disable = native_disable_io_apic, - .print_entries = native_io_apic_print_entries, - 
.set_affinity = native_ioapic_set_affinity, - .setup_entry = native_setup_ioapic_entry, - .eoi_ioapic_pin = native_eoi_ioapic_pin, }; diff --git a/kernel/arch/x86/kernel/xsave.c b/kernel/arch/x86/kernel/xsave.c deleted file mode 100644 index 87a815b85..000000000 --- a/kernel/arch/x86/kernel/xsave.c +++ /dev/null @@ -1,724 +0,0 @@ -/* - * xsave/xrstor support. - * - * Author: Suresh Siddha <suresh.b.siddha@intel.com> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/bootmem.h> -#include <linux/compat.h> -#include <linux/cpu.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> -#include <asm/sigframe.h> -#include <asm/tlbflush.h> -#include <asm/xcr.h> - -/* - * Supported feature mask by the CPU and the kernel. - */ -u64 pcntxt_mask; - -/* - * Represents init state for the supported extended state. - */ -struct xsave_struct *init_xstate_buf; - -static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; -static unsigned int *xstate_offsets, *xstate_sizes; -static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8]; -static unsigned int xstate_features; - -/* - * If a processor implementation discern that a processor state component is - * in its initialized state it may modify the corresponding bit in the - * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory - * layout in the case of xsaveopt. While presenting the xstate information to - * the user, we always ensure that the memory layout of a feature will be in - * the init state if the corresponding header bit is zero. This is to ensure - * that the user doesn't see some stale state in the memory layout during - * signal handling, debugging etc. - */ -void __sanitize_i387_state(struct task_struct *tsk) -{ - struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; - int feature_bit = 0x2; - u64 xstate_bv; - - if (!fx) - return; - - xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; - - /* - * None of the feature bits are in init state. So nothing else - * to do for us, as the memory layout is up to date. - */ - if ((xstate_bv & pcntxt_mask) == pcntxt_mask) - return; - - /* - * FP is in init state - */ - if (!(xstate_bv & XSTATE_FP)) { - fx->cwd = 0x37f; - fx->swd = 0; - fx->twd = 0; - fx->fop = 0; - fx->rip = 0; - fx->rdp = 0; - memset(&fx->st_space[0], 0, 128); - } - - /* - * SSE is in init state - */ - if (!(xstate_bv & XSTATE_SSE)) - memset(&fx->xmm_space[0], 0, 256); - - xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; - - /* - * Update all the other memory layouts for which the corresponding - * header bit is in the init state. - */ - while (xstate_bv) { - if (xstate_bv & 0x1) { - int offset = xstate_offsets[feature_bit]; - int size = xstate_sizes[feature_bit]; - - memcpy(((void *) fx) + offset, - ((void *) init_xstate_buf) + offset, - size); - } - - xstate_bv >>= 1; - feature_bit++; - } -} - -/* - * Check for the presence of extended state information in the - * user fpstate pointer in the sigcontext. - */ -static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *fx_sw) -{ - int min_xstate_size = sizeof(struct i387_fxsave_struct) + - sizeof(struct xsave_hdr_struct); - unsigned int magic2; - - if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) - return -1; - - /* Check for the first magic field and other error scenarios. 
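__sanitize_i387_state() in the deleted xsave.c above walks every supported feature whose bit is clear in xstate_bv and overwrites its area with init-state contents, so user space never reads stale bytes through ptrace or signal frames. The loop structure on its own; the mask, offsets and sizes below are invented stand-ins for what CPUID leaf 0xD would report:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t supported = 0x07;		/* FP, SSE, "feature 2" (made up)        */
	uint64_t xstate_bv = 0x03;		/* what the last XSAVE actually wrote    */
	unsigned int offsets[] = { 0, 160, 576 };	/* illustrative layout only     */
	unsigned int sizes[]   = { 160, 416, 256 };

	/* features that are supported but were left in their init state */
	uint64_t missing = supported & ~xstate_bv;

	for (int bit = 0; missing; missing >>= 1, bit++) {
		if (!(missing & 1))
			continue;
		printf("feature %d in init state: refresh %u bytes at offset %u from the init image\n",
		       bit, sizes[bit], offsets[bit]);
	}
	return 0;
}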
*/ - if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || - fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > xstate_size || - fx_sw->xstate_size > fx_sw->extended_size) - return -1; - - /* - * Check for the presence of second magic word at the end of memory - * layout. This detects the case where the user just copied the legacy - * fpstate layout with out copying the extended state information - * in the memory layout. - */ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) - || magic2 != FP_XSTATE_MAGIC2) - return -1; - - return 0; -} - -/* - * Signal frame handlers. - */ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) -{ - if (use_fxsr()) { - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; - struct user_i387_ia32_struct env; - struct _fpstate_ia32 __user *fp = buf; - - convert_from_fxsr(&env, tsk); - - if (__copy_to_user(buf, &env, sizeof(env)) || - __put_user(xsave->i387.swd, &fp->status) || - __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; - } else { - struct i387_fsave_struct __user *fp = buf; - u32 swd; - if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; - } - - return 0; -} - -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) -{ - struct xsave_struct __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; - u32 xstate_bv; - int err; - - /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); - - if (!use_xsave()) - return err; - - err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); - - /* - * Read the xstate_bv which we copied (directly from the cpu or - * from the state in task struct) to the user buffers. - */ - err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); - - /* - * For legacy compatible, we always set FP/SSE bits in the bit - * vector while saving the state to the user context. This will - * enable us capturing any changes(during sigreturn) to - * the FP/SSE bits by the legacy applications which don't touch - * xstate_bv in the xsave header. - * - * xsave aware apps can change the xstate_bv in the xsave - * header as well as change any contents in the memory layout. - * xrestore as part of sigreturn will capture all the changes. - */ - xstate_bv |= XSTATE_FPSSE; - - err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); - - return err; -} - -static inline int save_user_xstate(struct xsave_struct __user *buf) -{ - int err; - - if (use_xsave()) - err = xsave_user(buf); - else if (use_fxsr()) - err = fxsave_user((struct i387_fxsave_struct __user *) buf); - else - err = fsave_user((struct i387_fsave_struct __user *) buf); - - if (unlikely(err) && __clear_user(buf, xstate_size)) - err = -EFAULT; - return err; -} - -/* - * Save the fpu, extended register state to the user signal frame. - * - * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save - * state is copied. - * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. - * - * buf == buf_fx for 64-bit frames and 32-bit fsave frame. - * buf != buf_fx for 32-bit frames with fxstate. - * - * If the fpu, extended register state is live, save the state directly - * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, - * copy the thread's fpu state to the user frame starting at 'buf_fx'. 
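save_xstate_epilog() above plants FP_XSTATE_MAGIC2 immediately after the xstate area so that sigreturn, via check_for_xstate(), can tell a genuine extended-state frame from a bare legacy copy. The same trick in miniature; DEMO_MAGIC2 and the frame size merely stand in for FP_XSTATE_MAGIC2 and the boot-time xstate_size:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_MAGIC2      0xF00DCAFEu	/* stand-in for FP_XSTATE_MAGIC2            */
#define DEMO_XSTATE_SIZE 832u		/* stand-in for the kernel-computed xstate_size */

static int frame_has_extended_state(const uint8_t *frame)
{
	uint32_t magic2;

	memcpy(&magic2, frame + DEMO_XSTATE_SIZE, sizeof(magic2));
	return magic2 == DEMO_MAGIC2;
}

int main(void)
{
	uint8_t frame[DEMO_XSTATE_SIZE + sizeof(uint32_t)] = { 0 };
	uint32_t magic2 = DEMO_MAGIC2;

	printf("before epilogue: extended state present? %d\n", frame_has_extended_state(frame));
	memcpy(frame + DEMO_XSTATE_SIZE, &magic2, sizeof(magic2));	/* the "epilogue" */
	printf("after  epilogue: extended state present? %d\n", frame_has_extended_state(frame));
	return 0;
}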
- * - * If this is a 32-bit frame with fxstate, put a fsave header before - * the aligned state at 'buf_fx'. - * - * For [f]xsave state, update the SW reserved fields in the [f]xsave frame - * indicating the absence/presence of the extended state to the user. - */ -int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) -{ - struct xsave_struct *xsave = ¤t->thread.fpu.state->xsave; - struct task_struct *tsk = current; - int ia32_fxstate = (buf != buf_fx); - - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); - - if (!access_ok(VERIFY_WRITE, buf, size)) - return -EACCES; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), NULL, - (struct _fpstate_ia32 __user *) buf) ? -1 : 1; - - if (user_has_fpu()) { - /* Save the live register state to the user directly. */ - if (save_user_xstate(buf_fx)) - return -1; - /* Update the thread's fxstate to save the fsave header. */ - if (ia32_fxstate) - fpu_fxsave(&tsk->thread.fpu); - } else { - sanitize_i387_state(tsk); - if (__copy_to_user(buf_fx, xsave, xstate_size)) - return -1; - } - - /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; - - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; - - return 0; -} - -static inline void -sanitize_restored_xstate(struct task_struct *tsk, - struct user_i387_ia32_struct *ia32_env, - u64 xstate_bv, int fx_only) -{ - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; - struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr; - - if (use_xsave()) { - /* These bits must be zero. */ - memset(xsave_hdr->reserved, 0, 48); - - /* - * Init the state that is not present in the memory - * layout and not enabled by the OS. - */ - if (fx_only) - xsave_hdr->xstate_bv = XSTATE_FPSSE; - else - xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv); - } - - if (use_fxsr()) { - /* - * mscsr reserved bits must be masked to zero for security - * reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - - convert_to_fxsr(tsk, ia32_env); - } -} - -/* - * Restore the extended state if present. Otherwise, restore the FP/SSE state. 
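restore_user_xstate() above picks between two paths: an unaligned (or fx-only) frame forces everything except FP/SSE back to init state followed by FXRSTOR, while a properly aligned frame re-inits only the features the signal frame did not carry and XRSTORs the rest. The decision logic in isolation; the masks and sample pointers are invented:

#include <stdint.h>
#include <stdio.h>

#define DEMO_XSTATE_FPSSE 0x3ULL	/* FP | SSE */

static void plan_restore(uintptr_t buf, uint64_t supported_mask, uint64_t frame_xbv, int fx_only)
{
	if ((buf % 64) || fx_only) {
		uint64_t init_bv = supported_mask & ~DEMO_XSTATE_FPSSE;

		printf("fallback: init features %#llx from the init image, then FXRSTOR\n",
		       (unsigned long long)init_bv);
		return;
	}

	uint64_t init_bv = supported_mask & ~frame_xbv;	/* features the frame did not supply */

	if (init_bv)
		printf("init features %#llx from the init image, ", (unsigned long long)init_bv);
	printf("then XRSTOR features %#llx straight from the frame\n",
	       (unsigned long long)frame_xbv);
}

int main(void)
{
	plan_restore(0x7ffd1000, 0x1f, 0x07, 0);	/* aligned, frame carries FP/SSE/AVX   */
	plan_restore(0x7ffd1008, 0x1f, 0x07, 0);	/* not 64-byte aligned -> legacy path  */
	plan_restore(0x7ffd1000, 0x1f, 0x00, 1);	/* fx_only frame -> legacy path        */
	return 0;
}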
- */ -static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) -{ - if (use_xsave()) { - if ((unsigned long)buf % 64 || fx_only) { - u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE; - xrstor_state(init_xstate_buf, init_bv); - return fxrstor_user(buf); - } else { - u64 init_bv = pcntxt_mask & ~xbv; - if (unlikely(init_bv)) - xrstor_state(init_xstate_buf, init_bv); - return xrestore_user(buf, xbv); - } - } else if (use_fxsr()) { - return fxrstor_user(buf); - } else - return frstor_user(buf); -} - -int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) -{ - int ia32_fxstate = (buf != buf_fx); - struct task_struct *tsk = current; - int state_size = xstate_size; - u64 xstate_bv = 0; - int fx_only = 0; - - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); - - if (!buf) { - fpu_reset_state(tsk); - return 0; - } - - if (!access_ok(VERIFY_READ, buf, size)) - return -EACCES; - - if (!used_math() && init_fpu(tsk)) - return -1; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; - - if (use_xsave()) { - struct _fpx_sw_bytes fx_sw_user; - if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { - /* - * Couldn't find the extended state information in the - * memory layout. Restore just the FP/SSE and init all - * the other extended state. - */ - state_size = sizeof(struct i387_fxsave_struct); - fx_only = 1; - } else { - state_size = fx_sw_user.xstate_size; - xstate_bv = fx_sw_user.xstate_bv; - } - } - - if (ia32_fxstate) { - /* - * For 32-bit frames with fxstate, copy the user state to the - * thread's fpu state, reconstruct fxstate from the fsave - * header. Sanitize the copied state etc. - */ - struct fpu *fpu = &tsk->thread.fpu; - struct user_i387_ia32_struct env; - int err = 0; - - /* - * Drop the current fpu which clears used_math(). This ensures - * that any context-switch during the copy of the new state, - * avoids the intermediate state from getting restored/saved. - * Thus avoiding the new restored state from getting corrupted. - * We will be ready to restore/save the state only after - * set_used_math() is again set. - */ - drop_fpu(tsk); - - if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) { - fpu_finit(fpu); - err = -1; - } else { - sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); - } - - set_used_math(); - if (use_eager_fpu()) { - preempt_disable(); - math_state_restore(); - preempt_enable(); - } - - return err; - } else { - /* - * For 64-bit frames and 32-bit fsave frames, restore the user - * state to the registers directly (with exceptions handled). - */ - user_fpu_begin(); - if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { - fpu_reset_state(tsk); - return -1; - } - } - - return 0; -} - -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. 
- */ -static void prepare_fx_sw_frame(void) -{ - int fsave_header_size = sizeof(struct i387_fsave_struct); - int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - - if (config_enabled(CONFIG_X86_32)) - size += fsave_header_size; - - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xstate_bv = pcntxt_mask; - fx_sw_reserved.xstate_size = xstate_size; - - if (config_enabled(CONFIG_IA32_EMULATION)) { - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size += fsave_header_size; - } -} - -/* - * Enable the extended processor state save/restore feature - */ -static inline void xstate_enable(void) -{ - cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); -} - -/* - * Record the offsets and sizes of different state managed by the xsave - * memory layout. - */ -static void __init setup_xstate_features(void) -{ - int eax, ebx, ecx, edx, leaf = 0x2; - - xstate_features = fls64(pcntxt_mask); - xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); - xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); - - do { - cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); - - if (eax == 0) - break; - - xstate_offsets[leaf] = ebx; - xstate_sizes[leaf] = eax; - - leaf++; - } while (1); -} - -/* - * This function sets up offsets and sizes of all extended states in - * xsave area. This supports both standard format and compacted format - * of the xsave aread. - * - * Input: void - * Output: void - */ -void setup_xstate_comp(void) -{ - unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8]; - int i; - - /* - * The FP xstates and SSE xstates are legacy states. They are always - * in the fixed offsets in the xsave area in either compacted form - * or standard form. - */ - xstate_comp_offsets[0] = 0; - xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space); - - if (!cpu_has_xsaves) { - for (i = 2; i < xstate_features; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) { - xstate_comp_offsets[i] = xstate_offsets[i]; - xstate_comp_sizes[i] = xstate_sizes[i]; - } - } - return; - } - - xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; - - for (i = 2; i < xstate_features; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) - xstate_comp_sizes[i] = xstate_sizes[i]; - else - xstate_comp_sizes[i] = 0; - - if (i > 2) - xstate_comp_offsets[i] = xstate_comp_offsets[i-1] - + xstate_comp_sizes[i-1]; - - } -} - -/* - * setup the xstate image representing the init state - */ -static void __init setup_init_fpu_buf(void) -{ - /* - * Setup init_xstate_buf to represent the init state of - * all the features managed by the xsave - */ - init_xstate_buf = alloc_bootmem_align(xstate_size, - __alignof__(struct xsave_struct)); - fx_finit(&init_xstate_buf->i387); - - if (!cpu_has_xsave) - return; - - setup_xstate_features(); - - if (cpu_has_xsaves) { - init_xstate_buf->xsave_hdr.xcomp_bv = - (u64)1 << 63 | pcntxt_mask; - init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask; - } - - /* - * Init all the features state with header_bv being 0x0 - */ - xrstor_state_booting(init_xstate_buf, -1); - /* - * Dump the init state again. This is to identify the init state - * of any feature which is not represented by all zero's. 
- */ - xsave_state_booting(init_xstate_buf, -1); -} - -static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; -static int __init eager_fpu_setup(char *s) -{ - if (!strcmp(s, "on")) - eagerfpu = ENABLE; - else if (!strcmp(s, "off")) - eagerfpu = DISABLE; - else if (!strcmp(s, "auto")) - eagerfpu = AUTO; - return 1; -} -__setup("eagerfpu=", eager_fpu_setup); - - -/* - * Calculate total size of enabled xstates in XCR0/pcntxt_mask. - */ -static void __init init_xstate_size(void) -{ - unsigned int eax, ebx, ecx, edx; - int i; - - if (!cpu_has_xsaves) { - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xstate_size = ebx; - return; - } - - xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = 2; i < 64; i++) { - if (test_bit(i, (unsigned long *)&pcntxt_mask)) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_size += eax; - } - } -} - -/* - * Enable and initialize the xsave feature. - */ -static void __init xstate_enable_boot_cpu(void) -{ - unsigned int eax, ebx, ecx, edx; - - if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { - WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); - return; - } - - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - pcntxt_mask = eax + ((u64)edx << 32); - - if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { - pr_err("FP/SSE not shown under xsave features 0x%llx\n", - pcntxt_mask); - BUG(); - } - - /* - * Support only the state known to OS. - */ - pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - - xstate_enable(); - - /* - * Recompute the context size for enabled features - */ - init_xstate_size(); - - update_regset_xstate_info(xstate_size, pcntxt_mask); - prepare_fx_sw_frame(); - setup_init_fpu_buf(); - - /* Auto enable eagerfpu for xsaveopt */ - if (cpu_has_xsaveopt && eagerfpu != DISABLE) - eagerfpu = ENABLE; - - if (pcntxt_mask & XSTATE_EAGER) { - if (eagerfpu == DISABLE) { - pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n", - pcntxt_mask & XSTATE_EAGER); - pcntxt_mask &= ~XSTATE_EAGER; - } else { - eagerfpu = ENABLE; - } - } - - pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n", - pcntxt_mask, xstate_size, - cpu_has_xsaves ? "compacted form" : "standard form"); -} - -/* - * For the very first instance, this calls xstate_enable_boot_cpu(); - * for all subsequent instances, this calls xstate_enable(). - * - * This is somewhat obfuscated due to the lack of powerful enough - * overrides for the section checks. - */ -void xsave_init(void) -{ - static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; - void (*this_func)(void); - - if (!cpu_has_xsave) - return; - - this_func = next_func; - next_func = xstate_enable; - this_func(); -} - -/* - * setup_init_fpu_buf() is __init and it is OK to call it here because - * init_xstate_buf will be unset only once during boot. - */ -void __init_refok eager_fpu_init(void) -{ - WARN_ON(used_math()); - current_thread_info()->status = 0; - - if (eagerfpu == ENABLE) - setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); - - if (!cpu_has_eager_fpu) { - stts(); - return; - } - - if (!init_xstate_buf) - setup_init_fpu_buf(); -} - -/* - * Given the xsave area and a state inside, this function returns the - * address of the state. - * - * This is the API that is called to get xstate address in either - * standard format or compacted format of xsave area. - * - * Inputs: - * xsave: base address of the xsave area; - * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE, - * etc.) - * Output: - * address of the state in the xsave area. 
- */ -void *get_xsave_addr(struct xsave_struct *xsave, int xstate) -{ - int feature = fls64(xstate) - 1; - if (!test_bit(feature, (unsigned long *)&pcntxt_mask)) - return NULL; - - return (void *)xsave + xstate_comp_offsets[feature]; -} -EXPORT_SYMBOL_GPL(get_xsave_addr); diff --git a/kernel/arch/x86/kvm/Kconfig b/kernel/arch/x86/kvm/Kconfig index 413a7bf9e..639a6e345 100644 --- a/kernel/arch/x86/kvm/Kconfig +++ b/kernel/arch/x86/kvm/Kconfig @@ -28,6 +28,8 @@ config KVM select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE @@ -86,15 +88,16 @@ config KVM_MMU_AUDIT auditing of KVM MMU events at runtime. config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support" + bool "KVM legacy PCI device assignment support (DEPRECATED)" depends on KVM && PCI && IOMMU_API - default y + default n ---help--- Provide support for legacy PCI device assignment through KVM. The kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes much of this support. + framework through VFIO, which supersedes this support and provides + better security. - If unsure, say Y. + If unsure, say N. # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. diff --git a/kernel/arch/x86/kvm/Makefile b/kernel/arch/x86/kvm/Makefile index 16e8f962e..a1ff508bb 100644 --- a/kernel/arch/x86/kvm/Makefile +++ b/kernel/arch/x86/kvm/Makefile @@ -12,10 +12,12 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o + i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ + hyperv.o + kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o -kvm-intel-y += vmx.o -kvm-amd-y += svm.o +kvm-intel-y += vmx.o pmu_intel.o +kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/kernel/arch/x86/kvm/assigned-dev.c b/kernel/arch/x86/kvm/assigned-dev.c index d090ecf08..9dc091acd 100644 --- a/kernel/arch/x86/kvm/assigned-dev.c +++ b/kernel/arch/x86/kvm/assigned-dev.c @@ -21,6 +21,7 @@ #include <linux/fs.h> #include "irq.h" #include "assigned-dev.h" +#include "trace/events/kvm.h" struct kvm_assigned_dev_kernel { struct kvm_irq_ack_notifier ack_notifier; @@ -131,7 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) return IRQ_HANDLED; } -#ifdef __KVM_HAVE_MSI +/* + * Deliver an IRQ in an atomic context if we can, or return a failure, + * user can retry in a process context. + * Return value: + * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. + * Other values - No need to retry. + */ +static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, + int level) +{ + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; + struct kvm_kernel_irq_routing_entry *e; + int ret = -EINVAL; + int idx; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* + * Injection into either PIC or IOAPIC might need to scan all CPUs, + * which would need to be retried from thread context; when same GSI + * is connected to both PIC and IOAPIC, we'd have to report a + * partial failure here. + * Since there's no easy way to do this, we only support injecting MSI + * which is limited to 1:1 GSI mapping. 
+ */ + idx = srcu_read_lock(&kvm->irq_srcu); + if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { + e = &entries[0]; + ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, + irq, level); + } + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + + static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -150,9 +186,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) return IRQ_HANDLED; } -#endif -#ifdef __KVM_HAVE_MSIX static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -183,7 +217,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) return IRQ_HANDLED; } -#endif /* Ack the irq line for an assigned device */ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) @@ -386,7 +419,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_host_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -408,9 +440,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_host_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -443,8 +473,6 @@ err: return r; } -#endif - static int assigned_device_enable_guest_intx(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -454,7 +482,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_guest_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -463,9 +490,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_guest_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -474,7 +499,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif static int assign_host_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, @@ -492,16 +516,12 @@ static int assign_host_irq(struct kvm *kvm, case KVM_DEV_IRQ_HOST_INTX: r = assigned_device_enable_host_intx(kvm, dev); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_HOST_MSI: r = assigned_device_enable_host_msi(kvm, dev); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_HOST_MSIX: r = assigned_device_enable_host_msix(kvm, dev); break; -#endif default: r = -EINVAL; } @@ -534,16 +554,12 @@ static int assign_guest_irq(struct kvm *kvm, case KVM_DEV_IRQ_GUEST_INTX: r = assigned_device_enable_guest_intx(kvm, dev, irq); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_GUEST_MSI: r = assigned_device_enable_guest_msi(kvm, dev, irq); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_GUEST_MSIX: r = assigned_device_enable_guest_msix(kvm, dev, irq); break; -#endif default: r = -EINVAL; } @@ -826,7 +842,6 @@ out: } -#ifdef __KVM_HAVE_MSIX static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, struct kvm_assigned_msix_nr *entry_nr) { @@ -906,7 +921,6 @@ msix_entry_out: return r; } -#endif static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) @@ -1012,7 +1026,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#ifdef __KVM_HAVE_MSIX case KVM_ASSIGN_SET_MSIX_NR: { struct kvm_assigned_msix_nr entry_nr; r 
= -EFAULT; @@ -1033,7 +1046,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#endif case KVM_ASSIGN_SET_INTX_MASK: { struct kvm_assigned_pci_dev assigned_dev; diff --git a/kernel/arch/x86/kvm/cpuid.c b/kernel/arch/x86/kvm/cpuid.c index 1d08ad358..6525e926f 100644 --- a/kernel/arch/x86/kvm/cpuid.c +++ b/kernel/arch/x86/kvm/cpuid.c @@ -16,21 +16,21 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> -#include <asm/i387.h> /* For use_eager_fpu. Ugh! */ -#include <asm/fpu-internal.h> /* For use_eager_fpu. Ugh! */ +#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */ #include <asm/user.h> -#include <asm/xsave.h> +#include <asm/fpu/xstate.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" #include "trace.h" +#include "pmu.h" static u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - xstate_bv &= XSTATE_EXTEND_MASK; + xstate_bv &= XFEATURE_MASK_EXTEND; while (xstate_bv) { if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx, offset; @@ -51,7 +51,7 @@ u64 kvm_supported_xcr0(void) u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; if (!kvm_x86_ops->mpx_supported()) - xcr0 &= ~(XSTATE_BNDREGS | XSTATE_BNDCSR); + xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); return xcr0; } @@ -97,7 +97,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - vcpu->arch.eager_fpu = guest_cpuid_has_mpx(vcpu); + vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu); + if (vcpu->arch.eager_fpu) + kvm_x86_ops->fpu_activate(vcpu); /* * The existing code assumes virtual address is 48-bit in the canonical @@ -111,7 +113,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) /* Update physical-address width */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - kvm_pmu_cpuid_update(vcpu); + kvm_pmu_refresh(vcpu); return 0; } @@ -346,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ const u32 kvm_supported_word10_x86_features = @@ -415,6 +417,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 6: /* Thermal management */ + entry->eax = 0x4; /* allow ARAT */ + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; case 7: { entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ @@ -591,7 +599,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ - case 6: /* Thermal management */ case 0xC0000002: case 0xC0000003: case 0xC0000004: diff --git a/kernel/arch/x86/kvm/cpuid.h b/kernel/arch/x86/kvm/cpuid.h index 496b3695d..3f5c48ddb 100644 --- a/kernel/arch/x86/kvm/cpuid.h +++ b/kernel/arch/x86/kvm/cpuid.h @@ -38,6 +38,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } +static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->edx & bit(X86_FEATURE_MTRR)); +} + static inline bool 
guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -70,6 +78,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_LM)); +} + static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -125,4 +141,41 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_MPX)); } + +static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); +} + +static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_RDTSCP)); +} + +/* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 + */ +#define BIT_NRIPS 3 + +static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0); + + /* + * NRIPS is a scattered cpuid feature, so we can't use + * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit + * position 8, not 3). + */ + return best && (best->edx & bit(BIT_NRIPS)); +} +#undef BIT_NRIPS + #endif diff --git a/kernel/arch/x86/kvm/emulate.c b/kernel/arch/x86/kvm/emulate.c index 630bcb0d7..b9b09fec1 100644 --- a/kernel/arch/x86/kvm/emulate.c +++ b/kernel/arch/x86/kvm/emulate.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <asm/kvm_emulate.h> #include <linux/stringify.h> +#include <asm/debugreg.h> #include "x86.h" #include "tss.h" @@ -523,13 +524,9 @@ static void masked_increment(ulong *reg, ulong mask, int inc) static inline void register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc) { - ulong mask; + ulong *preg = reg_rmw(ctxt, reg); - if (ctxt->ad_bytes == sizeof(unsigned long)) - mask = ~0UL; - else - mask = ad_mask(ctxt); - masked_increment(reg_rmw(ctxt, reg), mask, inc); + assign_register(preg, *preg + inc, ctxt->ad_bytes); } static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) @@ -656,6 +653,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, *max_size = 0; switch (mode) { case X86EMUL_MODE_PROT64: + *linear = la; if (is_noncanonical_address(la)) goto bad; @@ -664,6 +662,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, goto bad; break; default: + *linear = la = (u32)la; usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, addr.seg); if (!usable) @@ -691,12 +690,10 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, if (size > *max_size) goto bad; } - la &= (u32)-1; break; } if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) return emulate_gp(ctxt, 0); - *linear = la; return X86EMUL_CONTINUE; bad: if (addr.seg == VCPU_SREG_SS) @@ -2262,6 +2259,283 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) return rc; } +static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ebx, ecx, edx; + + eax = 0x80000001; + ecx = 0; + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + return edx & bit(X86_FEATURE_LM); +} + +#define GET_SMSTATE(type, smbase, offset) \ 
+ ({ \ + type __val; \ + int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ + sizeof(__val)); \ + if (r != X86EMUL_CONTINUE) \ + return X86EMUL_UNHANDLEABLE; \ + __val; \ + }) + +static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) +{ + desc->g = (flags >> 23) & 1; + desc->d = (flags >> 22) & 1; + desc->l = (flags >> 21) & 1; + desc->avl = (flags >> 20) & 1; + desc->p = (flags >> 15) & 1; + desc->dpl = (flags >> 13) & 3; + desc->s = (flags >> 12) & 1; + desc->type = (flags >> 8) & 15; +} + +static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +{ + struct desc_struct desc; + int offset; + u16 selector; + + selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4); + + if (n < 3) + offset = 0x7f84 + n * 12; + else + offset = 0x7f2c + (n - 3) * 12; + + set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); + return X86EMUL_CONTINUE; +} + +static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +{ + struct desc_struct desc; + int offset; + u16 selector; + u32 base3; + + offset = 0x7e00 + n * 16; + + selector = GET_SMSTATE(u16, smbase, offset); + rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); + base3 = GET_SMSTATE(u32, smbase, offset + 12); + + ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); + return X86EMUL_CONTINUE; +} + +static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, + u64 cr0, u64 cr4) +{ + int bad; + + /* + * First enable PAE, long mode needs it before CR0.PG = 1 is set. + * Then enable protected mode. However, PCID cannot be enabled + * if EFER.LMA=0, so set it separately. 
+ */ + bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + if (bad) + return X86EMUL_UNHANDLEABLE; + + bad = ctxt->ops->set_cr(ctxt, 0, cr0); + if (bad) + return X86EMUL_UNHANDLEABLE; + + if (cr4 & X86_CR4_PCIDE) { + bad = ctxt->ops->set_cr(ctxt, 4, cr4); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + + return X86EMUL_CONTINUE; +} + +static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct desc_struct desc; + struct desc_ptr dt; + u16 selector; + u32 val, cr0, cr4; + int i; + + cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); + ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); + ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); + + for (i = 0; i < 8; i++) + *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4); + + val = GET_SMSTATE(u32, smbase, 0x7fcc); + ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + val = GET_SMSTATE(u32, smbase, 0x7fc8); + ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + selector = GET_SMSTATE(u32, smbase, 0x7fc4); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); + + selector = GET_SMSTATE(u32, smbase, 0x7fc0); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); + + dt.address = GET_SMSTATE(u32, smbase, 0x7f74); + dt.size = GET_SMSTATE(u32, smbase, 0x7f70); + ctxt->ops->set_gdt(ctxt, &dt); + + dt.address = GET_SMSTATE(u32, smbase, 0x7f58); + dt.size = GET_SMSTATE(u32, smbase, 0x7f54); + ctxt->ops->set_idt(ctxt, &dt); + + for (i = 0; i < 6; i++) { + int r = rsm_load_seg_32(ctxt, smbase, i); + if (r != X86EMUL_CONTINUE) + return r; + } + + cr4 = GET_SMSTATE(u32, smbase, 0x7f14); + + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); + + return rsm_enter_protected_mode(ctxt, cr0, cr4); +} + +static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct desc_struct desc; + struct desc_ptr dt; + u64 val, cr0, cr4; + u32 base3; + u16 selector; + int i, r; + + for (i = 0; i < 16; i++) + *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8); + + ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78); + ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED; + + val = GET_SMSTATE(u32, smbase, 0x7f68); + ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + val = GET_SMSTATE(u32, smbase, 0x7f60); + ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + cr0 = GET_SMSTATE(u64, smbase, 0x7f58); + ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); + cr4 = GET_SMSTATE(u64, smbase, 0x7f48); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); + val = GET_SMSTATE(u64, smbase, 0x7ed0); + ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); + + selector = GET_SMSTATE(u32, smbase, 0x7e90); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98)); + base3 = GET_SMSTATE(u32, smbase, 0x7e9c); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); + + dt.size = GET_SMSTATE(u32, smbase, 0x7e84); + dt.address = GET_SMSTATE(u64, 
smbase, 0x7e88); + ctxt->ops->set_idt(ctxt, &dt); + + selector = GET_SMSTATE(u32, smbase, 0x7e70); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78)); + base3 = GET_SMSTATE(u32, smbase, 0x7e7c); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); + + dt.size = GET_SMSTATE(u32, smbase, 0x7e64); + dt.address = GET_SMSTATE(u64, smbase, 0x7e68); + ctxt->ops->set_gdt(ctxt, &dt); + + r = rsm_enter_protected_mode(ctxt, cr0, cr4); + if (r != X86EMUL_CONTINUE) + return r; + + for (i = 0; i < 6; i++) { + r = rsm_load_seg_64(ctxt, smbase, i); + if (r != X86EMUL_CONTINUE) + return r; + } + + return X86EMUL_CONTINUE; +} + +static int em_rsm(struct x86_emulate_ctxt *ctxt) +{ + unsigned long cr0, cr4, efer; + u64 smbase; + int ret; + + if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) + return emulate_ud(ctxt); + + /* + * Get back to real mode, to prepare a safe state in which to load + * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU + * supports long mode. + */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (emulator_has_longmode(ctxt)) { + struct desc_struct cs_desc; + + /* Zero CR4.PCIDE before CR0.PG. */ + if (cr4 & X86_CR4_PCIDE) { + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + cr4 &= ~X86_CR4_PCIDE; + } + + /* A 32-bit code segment is required to clear EFER.LMA. */ + memset(&cs_desc, 0, sizeof(cs_desc)); + cs_desc.type = 0xb; + cs_desc.s = cs_desc.g = cs_desc.p = 1; + ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); + } + + /* For the 64-bit case, this will clear EFER.LMA. */ + cr0 = ctxt->ops->get_cr(ctxt, 0); + if (cr0 & X86_CR0_PE) + ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); + + /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ + if (cr4 & X86_CR4_PAE) + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + + /* And finally go back to 32-bit mode. */ + efer = 0; + ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + + smbase = ctxt->ops->get_smbase(ctxt); + if (emulator_has_longmode(ctxt)) + ret = rsm_load_state_64(ctxt, smbase + 0x8000); + else + ret = rsm_load_state_32(ctxt, smbase + 0x8000); + + if (ret != X86EMUL_CONTINUE) { + /* FIXME: should triple fault */ + return X86EMUL_UNHANDLEABLE; + } + + if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + ctxt->ops->set_nmi_mask(ctxt, false); + + ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK; + ctxt->emul_flags &= ~X86EMUL_SMM_MASK; + return X86EMUL_CONTINUE; +} + static void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct desc_struct *cs, struct desc_struct *ss) @@ -2573,6 +2847,30 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, return true; } +static void string_registers_quirk(struct x86_emulate_ctxt *ctxt) +{ + /* + * Intel CPUs mask the counter and pointers in quite strange + * manner when ECX is zero due to REP-string optimizations. 
+ */ +#ifdef CONFIG_X86_64 + if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt)) + return; + + *reg_write(ctxt, VCPU_REGS_RCX) = 0; + + switch (ctxt->b) { + case 0xa4: /* movsb */ + case 0xa5: /* movsd/w */ + *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1; + /* fall through */ + case 0xaa: /* stosb */ + case 0xab: /* stosd/w */ + *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1; + } +#endif +} + static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { @@ -2849,7 +3147,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ulong old_tss_base = ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; - ulong desc_addr; + ulong desc_addr, dr7; /* FIXME: old_tss_base == ~0 ? */ @@ -2934,6 +3232,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ret = em_push(ctxt); } + ops->get_dr(ctxt, 7, &dr7); + ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN)); + return ret; } @@ -3840,7 +4141,7 @@ static const struct opcode group5[] = { F(DstMem | SrcNone | Lock, em_inc), F(DstMem | SrcNone | Lock, em_dec), I(SrcMem | NearBranch, em_call_near_abs), - I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), + I(SrcMemFAddr | ImplicitOps, em_call_far), I(SrcMem | NearBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps, em_jmp_far), I(SrcMem | Stack, em_push), D(Undefined), @@ -4173,7 +4474,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - DI(ImplicitOps, rsm), + II(EmulateOnUD | ImplicitOps, em_rsm, rsm), F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), @@ -4871,7 +5172,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -4900,7 +5201,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -4910,6 +5211,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if (ctxt->rep_prefix && (ctxt->d & String)) { /* All REP prefixes have the same first termination condition */ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { + string_registers_quirk(ctxt); ctxt->eip = ctxt->_eip; ctxt->eflags &= ~X86_EFLAGS_RF; goto done; @@ -4953,7 +5255,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/kernel/arch/x86/kvm/hyperv.c b/kernel/arch/x86/kvm/hyperv.c new file mode 100644 index 000000000..62cf8c915 --- /dev/null +++ b/kernel/arch/x86/kvm/hyperv.c @@ -0,0 +1,404 @@ +/* + * KVM Microsoft Hyper-V emulation + * + * derived from arch/x86/kvm/x86.c + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. 
+ * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * Amit Shah <amit.shah@qumranet.com> + * Ben-Ami Yassour <benami@il.ibm.com> + * Andrey Smetanin <asmetanin@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "x86.h" +#include "lapic.h" +#include "hyperv.h" + +#include <linux/kvm_host.h> +#include <trace/events/kvm.h> + +#include "trace.h" + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + case HV_X64_MSR_REFERENCE_TSC: + case HV_X64_MSR_TIME_REF_COUNT: + case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_RESET: + r = true; + break; + } + + return r; +} + +static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, + u32 index, u64 *pdata) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + return -EINVAL; + + *pdata = hv->hv_crash_param[index]; + return 0; +} + +static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + *pdata = hv->hv_crash_ctl; + return 0; +} + +static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (host) + hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY; + + if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) { + + vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", + hv->hv_crash_param[0], + hv->hv_crash_param[1], + hv->hv_crash_param[2], + hv->hv_crash_param[3], + hv->hv_crash_param[4]); + + /* Send notification about crash to user space */ + kvm_make_request(KVM_REQ_HV_CRASH, vcpu); + } + + return 0; +} + +static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, + u32 index, u64 data) +{ + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + + if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + return -EINVAL; + + hv->hv_crash_param[index] = data; + return 0; +} + +static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, + bool host) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + hv->hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!hv->hv_guest_os_id) + hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[4]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!hv->hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + hv->hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + kvm_x86_ops->patch_hypercall(vcpu, instructions); + ((unsigned char *)instructions)[3] = 0xc3; /* ret */ + if (__copy_to_user((void __user *)addr, instructions, 4)) + return 1; + hv->hv_hypercall = data; + mark_page_dirty(kvm, gfn); + break; + } + case HV_X64_MSR_REFERENCE_TSC: { + u64 gfn; + HV_REFERENCE_TSC_PAGE tsc_ref; + + memset(&tsc_ref, 0, sizeof(tsc_ref)); + hv->hv_tsc_page = data; + if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + 
break; + gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + if (kvm_write_guest( + kvm, + gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, + &tsc_ref, sizeof(tsc_ref))) + return 1; + mark_page_dirty(kvm, gfn); + break; + } + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + return kvm_hv_msr_set_crash_data(vcpu, + msr - HV_X64_MSR_CRASH_P0, + data); + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + case HV_X64_MSR_RESET: + if (data == 1) { + vcpu_debug(vcpu, "hyper-v reset requested\n"); + kvm_make_request(KVM_REQ_HV_RESET, vcpu); + } + break; + default: + vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + msr, data); + return 1; + } + return 0; +} + +/* Calculate cpu time spent by current task in 100ns units */ +static u64 current_task_runtime_100ns(void) +{ + cputime_t utime, stime; + + task_cputime_adjusted(current, &utime, &stime); + return div_u64(cputime_to_nsecs(utime + stime), 100); +} + +static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) +{ + struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_APIC_ASSIST_PAGE: { + u64 gfn; + unsigned long addr; + + if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { + hv->hv_vapic = data; + if (kvm_lapic_enable_pv_eoi(vcpu, 0)) + return 1; + break; + } + gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; + addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); + if (kvm_is_error_hva(addr)) + return 1; + if (__clear_user((void __user *)addr, PAGE_SIZE)) + return 1; + hv->hv_vapic = data; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (kvm_lapic_enable_pv_eoi(vcpu, + gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) + return 1; + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + case HV_X64_MSR_VP_RUNTIME: + if (!host) + return 1; + hv->runtime_offset = data - current_task_runtime_100ns(); + break; + default: + vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + msr, data); + return 1; + } + + return 0; +} + +static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = hv->hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = hv->hv_hypercall; + break; + case HV_X64_MSR_TIME_REF_COUNT: { + data = + div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + break; + } + case HV_X64_MSR_REFERENCE_TSC: + data = hv->hv_tsc_page; + break; + case HV_X64_MSR_CRASH_P0 ... 
HV_X64_MSR_CRASH_P4: + return kvm_hv_msr_get_crash_data(vcpu, + msr - HV_X64_MSR_CRASH_P0, + pdata); + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + case HV_X64_MSR_RESET: + data = 0; + break; + default: + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + + kvm_for_each_vcpu(r, v, vcpu->kvm) { + if (v == vcpu) { + data = r; + break; + } + } + break; + } + case HV_X64_MSR_EOI: + return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); + case HV_X64_MSR_ICR: + return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); + case HV_X64_MSR_TPR: + return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + case HV_X64_MSR_APIC_ASSIST_PAGE: + data = hv->hv_vapic; + break; + case HV_X64_MSR_VP_RUNTIME: + data = current_task_runtime_100ns() + hv->runtime_offset; + break; + default: + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + +int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) +{ + if (kvm_hv_msr_partition_wide(msr)) { + int r; + + mutex_lock(&vcpu->kvm->lock); + r = kvm_hv_set_msr_pw(vcpu, msr, data, host); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return kvm_hv_set_msr(vcpu, msr, data, host); +} + +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + if (kvm_hv_msr_partition_wide(msr)) { + int r; + + mutex_lock(&vcpu->kvm->lock); + r = kvm_hv_get_msr_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return kvm_hv_get_msr(vcpu, msr, pdata); +} + +bool kvm_hv_hypercall_enabled(struct kvm *kvm) +{ + return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; +} + +int kvm_hv_hypercall(struct kvm_vcpu *vcpu) +{ + u64 param, ingpa, outgpa, ret; + uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; + bool fast, longmode; + + /* + * hypercall generates UD from non zero cpl and real mode + * per HYPER-V spec + */ + if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 0; + } + + longmode = is_64_bit_mode(vcpu); + + if (!longmode) { + param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); + ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); + outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | + (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); + } +#ifdef CONFIG_X86_64 + else { + param = kvm_register_read(vcpu, VCPU_REGS_RCX); + ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); + outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); + } +#endif + + code = param & 0xffff; + fast = (param >> 16) & 0x1; + rep_cnt = (param >> 32) & 0xfff; + rep_idx = (param >> 48) & 0xfff; + + trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + + switch (code) { + case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + kvm_vcpu_on_spin(vcpu); + break; + default: + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + + ret = res | (((u64)rep_done & 0xfff) << 32); + if (longmode) { + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + } else { + kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); 
+ } + + return 1; +} diff --git a/kernel/arch/x86/kvm/hyperv.h b/kernel/arch/x86/kvm/hyperv.h new file mode 100644 index 000000000..c7bce559f --- /dev/null +++ b/kernel/arch/x86/kvm/hyperv.h @@ -0,0 +1,32 @@ +/* + * KVM Microsoft Hyper-V emulation + * + * derived from arch/x86/kvm/x86.c + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> + * + * Authors: + * Avi Kivity <avi@qumranet.com> + * Yaniv Kamay <yaniv@qumranet.com> + * Amit Shah <amit.shah@qumranet.com> + * Ben-Ami Yassour <benami@il.ibm.com> + * Andrey Smetanin <asmetanin@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef __ARCH_X86_KVM_HYPERV_H__ +#define __ARCH_X86_KVM_HYPERV_H__ + +int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +bool kvm_hv_hypercall_enabled(struct kvm *kvm); +int kvm_hv_hypercall(struct kvm_vcpu *vcpu); + +#endif diff --git a/kernel/arch/x86/kvm/i8254.c b/kernel/arch/x86/kvm/i8254.c index f90952f64..b0ea42b78 100644 --- a/kernel/arch/x86/kvm/i8254.c +++ b/kernel/arch/x86/kvm/i8254.c @@ -35,6 +35,7 @@ #include <linux/kvm_host.h> #include <linux/slab.h> +#include "ioapic.h" #include "irq.h" #include "i8254.h" #include "x86.h" @@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; s64 interval; - if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) + if (!ioapic_in_kernel(kvm) || + ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); @@ -418,6 +420,7 @@ void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_s u8 saved_mode; if (hpet_legacy_start) { /* save existing mode for later reenablement */ + WARN_ON(channel != 0); saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ pit_load_count(kvm, channel, val); diff --git a/kernel/arch/x86/kvm/i8259.c b/kernel/arch/x86/kvm/i8259.c index fef922ff2..7cc2360f1 100644 --- a/kernel/arch/x86/kvm/i8259.c +++ b/kernel/arch/x86/kvm/i8259.c @@ -651,15 +651,10 @@ fail_unlock: return NULL; } -void kvm_destroy_pic(struct kvm *kvm) +void kvm_destroy_pic(struct kvm_pic *vpic) { - struct kvm_pic *vpic = kvm->arch.vpic; - - if (vpic) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr); - kvm->arch.vpic = NULL; - kfree(vpic); - } + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + kfree(vpic); } diff --git a/kernel/arch/x86/kvm/ioapic.c b/kernel/arch/x86/kvm/ioapic.c index 28146f03c..88d0a92d3 100644 --- a/kernel/arch/x86/kvm/ioapic.c +++ b/kernel/arch/x86/kvm/ioapic.c @@ -233,21 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) } -static void update_handled_vectors(struct kvm_ioapic *ioapic) -{ - DECLARE_BITMAP(handled_vectors, 256); - int i; - - memset(handled_vectors, 0, sizeof(handled_vectors)); - 
for (i = 0; i < IOAPIC_NUM_PINS; ++i) - __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); - memcpy(ioapic->handled_vectors, handled_vectors, - sizeof(handled_vectors)); - smp_wmb(); -} - -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr) +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; union kvm_ioapic_redirect_entry *e; @@ -260,13 +246,11 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode)) { + e->fields.dest_id, e->fields.dest_mode) || + (e->fields.trig_mode == IOAPIC_EDGE_TRIG && + kvm_apic_pending_eoi(vcpu, e->fields.vector))) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); - if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) - __set_bit(e->fields.vector, - (unsigned long *)tmr); - } } } spin_unlock(&ioapic->lock); @@ -315,7 +299,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits |= (u32) val; e->fields.remote_irr = 0; } - update_handled_vectors(ioapic); mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); @@ -349,6 +332,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.delivery_mode = entry->fields.delivery_mode << 8; irqe.level = 1; irqe.shorthand = 0; + irqe.msi_redir_hint = false; if (irqe.trig_mode == IOAPIC_EDGE_TRIG) ioapic->irr_delivered |= 1 << irq; @@ -598,7 +582,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->id = 0; memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); rtc_irq_eoi_tracking_reset(ioapic); - update_handled_vectors(ioapic); } static const struct kvm_io_device_ops ioapic_mmio_ops = { @@ -627,8 +610,10 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); + return ret; } + kvm_vcpu_request_scan_ioapic(kvm); return ret; } @@ -637,11 +622,9 @@ void kvm_ioapic_destroy(struct kvm *kvm) struct kvm_ioapic *ioapic = kvm->arch.vioapic; cancel_delayed_work_sync(&ioapic->eoi_inject); - if (ioapic) { - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); - kvm->arch.vioapic = NULL; - kfree(ioapic); - } + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm->arch.vioapic = NULL; + kfree(ioapic); } int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) @@ -667,7 +650,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - update_handled_vectors(ioapic); kvm_vcpu_request_scan_ioapic(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); diff --git a/kernel/arch/x86/kvm/ioapic.h b/kernel/arch/x86/kvm/ioapic.h index ca0b0b4e6..084617d37 100644 --- a/kernel/arch/x86/kvm/ioapic.h +++ b/kernel/arch/x86/kvm/ioapic.h @@ -9,6 +9,7 @@ struct kvm; struct kvm_vcpu; #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ #define IOAPIC_EDGE_TRIG 0 #define IOAPIC_LEVEL_TRIG 1 @@ -73,7 +74,6 @@ struct kvm_ioapic { struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; - DECLARE_BITMAP(handled_vectors, 256); struct rtc_status rtc_status; struct delayed_work eoi_inject; u32 
irq_eoi[IOAPIC_NUM_PINS]; @@ -98,11 +98,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } -static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) +static inline int ioapic_in_kernel(struct kvm *kvm) { - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - smp_rmb(); - return test_bit(vector, ioapic->handled_vectors); + int ret; + + ret = (ioapic_irqchip(kvm) != NULL); + return ret; } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); @@ -120,7 +121,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr); +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); #endif diff --git a/kernel/arch/x86/kvm/iommu.c b/kernel/arch/x86/kvm/iommu.c index 7dbced309..5c520ebf6 100644 --- a/kernel/arch/x86/kvm/iommu.c +++ b/kernel/arch/x86/kvm/iommu.c @@ -200,6 +200,7 @@ int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) goto out_unmap; } + kvm_arch_start_assignment(kvm); pci_set_dev_assigned(pdev); dev_info(&pdev->dev, "kvm assign device\n"); @@ -224,6 +225,7 @@ int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) iommu_detach_device(domain, &pdev->dev); pci_clear_dev_assigned(pdev); + kvm_arch_end_assignment(kvm); dev_info(&pdev->dev, "kvm deassign device\n"); diff --git a/kernel/arch/x86/kvm/irq.c b/kernel/arch/x86/kvm/irq.c index a1ec6a50a..097060e33 100644 --- a/kernel/arch/x86/kvm/irq.c +++ b/kernel/arch/x86/kvm/irq.c @@ -38,14 +38,27 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* + * check if there is a pending userspace external interrupt + */ +static int pending_userspace_extint(struct kvm_vcpu *v) +{ + return v->arch.pending_external_vector != -1; +} + +/* * check if there is pending interrupt from * non-APIC source without intack. 
*/ static int kvm_cpu_has_extint(struct kvm_vcpu *v) { - if (kvm_apic_accept_pic_intr(v)) - return pic_irqchip(v->kvm)->output; /* PIC */ - else + u8 accept = kvm_apic_accept_pic_intr(v); + + if (accept) { + if (irqchip_split(v->kvm)) + return pending_userspace_extint(v); + else + return pic_irqchip(v->kvm)->output; + } else return 0; } @@ -57,13 +70,13 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) */ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) return 1; - if (kvm_apic_vid_enabled(v->kvm)) + if (kvm_vcpu_apic_vid_enabled(v)) return 0; return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ @@ -75,7 +88,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) @@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); */ static int kvm_cpu_get_extint(struct kvm_vcpu *v) { - if (kvm_cpu_has_extint(v)) - return kvm_pic_read_irq(v->kvm); /* PIC */ - return -1; + if (kvm_cpu_has_extint(v)) { + if (irqchip_split(v->kvm)) { + int vector = v->arch.pending_external_vector; + + v->arch.pending_external_vector = -1; + return vector; + } else + return kvm_pic_read_irq(v->kvm); /* PIC */ + } else + return -1; } /* @@ -103,7 +123,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { int vector; - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; vector = kvm_cpu_get_extint(v); diff --git a/kernel/arch/x86/kvm/irq.h b/kernel/arch/x86/kvm/irq.h index ad68c7300..ae5c78f23 100644 --- a/kernel/arch/x86/kvm/irq.h +++ b/kernel/arch/x86/kvm/irq.h @@ -74,7 +74,7 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm *kvm); +void kvm_destroy_pic(struct kvm_pic *vpic); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); @@ -83,15 +83,40 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) return kvm->arch.vpic; } -static inline int irqchip_in_kernel(struct kvm *kvm) +static inline int pic_in_kernel(struct kvm *kvm) { int ret; ret = (pic_irqchip(kvm) != NULL); + return ret; +} + +static inline int irqchip_split(struct kvm *kvm) +{ + return kvm->arch.irqchip_split; +} + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + struct kvm_pic *vpic = pic_irqchip(kvm); + bool ret; + + ret = (vpic != NULL); + ret |= irqchip_split(kvm); + + /* Read vpic before kvm->irq_routing. */ smp_rmb(); return ret; } +static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) +{ + /* Same as irqchip_in_kernel(vcpu->kvm), but with less + * pointer chasing and no unnecessary memory barriers. 
+ */ + return vcpu->arch.apic != NULL; +} + void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/kernel/arch/x86/kvm/irq_comm.c b/kernel/arch/x86/kvm/irq_comm.c index 72298b3ac..84b96d319 100644 --- a/kernel/arch/x86/kvm/irq_comm.c +++ b/kernel/arch/x86/kvm/irq_comm.c @@ -31,6 +31,8 @@ #include "ioapic.h" +#include "lapic.h" + static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -48,11 +50,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, line_status); } -inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) -{ - return irq->delivery_mode == APIC_DM_LOWEST; -} - int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map) { @@ -60,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_vcpu *vcpu, *lowest = NULL; if (irq->dest_mode == 0 && irq->dest_id == 0xff && - kvm_is_dm_lowest_prio(irq)) { + kvm_lowest_prio_delivery(irq)) { printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } @@ -76,7 +73,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, irq->dest_id, irq->dest_mode)) continue; - if (!kvm_is_dm_lowest_prio(irq)) { + if (!kvm_lowest_prio_delivery(irq)) { if (r < 0) r = 0; r += kvm_apic_set_irq(vcpu, irq, dest_map); @@ -94,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq) +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) { trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); @@ -106,10 +103,12 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; irq->delivery_mode = e->msi.data & 0x700; + irq->msi_redir_hint = ((e->msi.address_lo + & MSI_ADDR_REDIRECTION_LOWPRI) > 0); irq->level = 1; irq->shorthand = 0; - /* TODO Deal with RH bit of MSI message address */ } +EXPORT_SYMBOL_GPL(kvm_set_msi_irq); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -125,12 +124,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, } -static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm) +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) { struct kvm_lapic_irq irq; int r; + if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) + return -EWOULDBLOCK; + kvm_set_msi_irq(e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) @@ -139,42 +142,6 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, return -EWOULDBLOCK; } -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. 
- */ -int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - if (likely(e->type == KVM_IRQ_ROUTING_MSI)) - ret = kvm_set_msi_inatomic(e, kvm); - else - ret = -EWOULDBLOCK; - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - int kvm_request_irq_source_id(struct kvm *kvm) { unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; @@ -210,7 +177,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_in_kernel(kvm)) + if (!ioapic_in_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); @@ -299,6 +266,33 @@ out: return r; } +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int i, r = 0; + struct kvm_vcpu *vcpu; + + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) + return true; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (++r == 2) + return false; + + *dest_vcpu = vcpu; + } + + return r == 1; +} +EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); + #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } @@ -330,3 +324,54 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) return kvm_set_irq_routing(kvm, default_routing, ARRAY_SIZE(default_routing), 0); } + +static const struct kvm_irq_routing_entry empty_routing[] = {}; + +int kvm_setup_empty_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, empty_routing, 0, 0); +} + +void kvm_arch_irq_routing_update(struct kvm *kvm) +{ + if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + return; + kvm_make_scan_ioapic_request(kvm); +} + +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_kernel_irq_routing_entry *entry; + struct kvm_irq_routing_table *table; + u32 i, nr_ioapic_pins; + int idx; + + /* kvm->irq_routing must be read after clearing + * KVM_SCAN_IOAPIC. 
*/ + smp_mb(); + idx = srcu_read_lock(&kvm->irq_srcu); + table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + nr_ioapic_pins = min_t(u32, table->nr_rt_entries, + kvm->arch.nr_reserved_ioapic_pins); + for (i = 0; i < nr_ioapic_pins; ++i) { + hlist_for_each_entry(entry, &table->map[i], link) { + u32 dest_id, dest_mode; + bool level; + + if (entry->type != KVM_IRQ_ROUTING_MSI) + continue; + dest_id = (entry->msi.address_lo >> 12) & 0xff; + dest_mode = (entry->msi.address_lo >> 2) & 0x1; + level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; + if (level && kvm_apic_match_dest(vcpu, NULL, 0, + dest_id, dest_mode)) { + u32 vector = entry->msi.data & 0xff; + + __set_bit(vector, + (unsigned long *) eoi_exit_bitmap); + } + } + } + srcu_read_unlock(&kvm->irq_srcu, idx); +} diff --git a/kernel/arch/x86/kvm/kvm_cache_regs.h b/kernel/arch/x86/kvm/kvm_cache_regs.h index 544076c4f..e1e89ee4a 100644 --- a/kernel/arch/x86/kvm/kvm_cache_regs.h +++ b/kernel/arch/x86/kvm/kvm_cache_regs.h @@ -99,4 +99,9 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu) return vcpu->arch.hflags & HF_GUEST_MASK; } +static inline bool is_smm(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hflags & HF_SMM_MASK; +} + #endif diff --git a/kernel/arch/x86/kvm/lapic.c b/kernel/arch/x86/kvm/lapic.c index 525db8be0..20d9e9fb3 100644 --- a/kernel/arch/x86/kvm/lapic.c +++ b/kernel/arch/x86/kvm/lapic.c @@ -209,7 +209,7 @@ out: if (old) kfree_rcu(old, rcu); - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) @@ -240,6 +240,15 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) recalculate_apic_map(apic->vcpu->kvm); } +static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id) +{ + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + + apic_set_reg(apic, APIC_ID, id << 24); + apic_set_reg(apic, APIC_LDR, ldr); + recalculate_apic_map(apic->vcpu->kvm); +} + static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); @@ -339,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) struct kvm_lapic *apic = vcpu->arch.apic; __kvm_apic_update_irr(pir, apic->regs); + + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); @@ -381,7 +392,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { + if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) { /* try to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -542,15 +553,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int i; - - for (i = 0; i < 8; i++) - apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]); -} - static void apic_update_ppr(struct kvm_lapic *apic) { u32 tpr, isrv, ppr, old_ppr; @@ -728,7 +730,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, dst = map->logical_map[cid]; - if (irq->delivery_mode == APIC_DM_LOWEST) { + if (kvm_lowest_prio_delivery(irq)) { int l = -1; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) @@ -755,6 +757,65 @@ out: return ret; } +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + struct kvm_apic_map *map; + bool ret 
= false; + struct kvm_lapic *dst = NULL; + + if (irq->shorthand) + return false; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (!map) + goto out; + + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id == 0xFF) + goto out; + + if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) + goto out; + + dst = map->phys_map[irq->dest_id]; + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } else { + u16 cid; + unsigned long bitmap = 1; + int i, r = 0; + + if (!kvm_apic_logical_map_valid(map)) + goto out; + + apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); + + if (cid >= ARRAY_SIZE(map->logical_map)) + goto out; + + for_each_set_bit(i, &bitmap, 16) { + dst = map->logical_map[cid][i]; + if (++r == 2) + goto out; + } + + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } + + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. @@ -772,6 +833,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; case APIC_DM_FIXED: + if (unlikely(trig_mode && !level)) + break; + /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; @@ -781,6 +845,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); + if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { + if (trig_mode) + apic_set_vector(vector, apic->regs + APIC_TMR); + else + apic_clear_vector(vector, apic->regs + APIC_TMR); + } + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { @@ -799,7 +870,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_SMI: - apic_debug("Ignoring guest SMI\n"); + result = 1; + kvm_make_request(KVM_REQ_SMI, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_NMI: @@ -857,16 +930,32 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) +{ + return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap); +} + static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { - int trigger_mode; - if (apic_test_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); + int trigger_mode; + + /* Eoi the ioapic only if the ioapic doesn't own the vector. */ + if (!kvm_ioapic_handles_vector(apic, vector)) + return; + + /* Request a KVM exit to inform the userspace IOAPIC. 
*/ + if (irqchip_split(apic->vcpu->kvm)) { + apic->vcpu->arch.pending_ioapic_eoi = vector; + kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); + return; } + + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } static int apic_set_eoi(struct kvm_lapic *apic) @@ -914,9 +1003,10 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.vector = icr_low & APIC_VECTOR_MASK; irq.delivery_mode = icr_low & APIC_MODE_MASK; irq.dest_mode = icr_low & APIC_DEST_MASK; - irq.level = icr_low & APIC_INT_ASSERT; + irq.level = (icr_low & APIC_INT_ASSERT) != 0; irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; irq.shorthand = icr_low & APIC_SHORT_MASK; + irq.msi_redir_hint = false; if (apic_x2apic_mode(apic)) irq.dest_id = icr_high; else @@ -926,10 +1016,11 @@ static void apic_send_ipi(struct kvm_lapic *apic) apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " - "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", + "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, " + "msi_redir_hint 0x%x\n", icr_high, icr_low, irq.shorthand, irq.dest_id, irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, - irq.vector); + irq.vector, irq.msi_redir_hint); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } @@ -1104,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; - struct swait_head *q = &vcpu->wq; + struct swait_queue_head *q = &vcpu->wq; struct kvm_timer *ktimer = &apic->lapic_timer; if (atomic_read(&apic->lapic_timer.pending)) @@ -1113,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); - if (swaitqueue_active(q)) - swait_wake_interruptible(q); + if (swait_active(q)) + swake_up(q); if (apic_lvtt_tscdeadline(apic)) ktimer->expired_tscdeadline = ktimer->tscdeadline; @@ -1159,7 +1250,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); /* __delay is delay_tsc whenever the hardware has TSC, thus always. 
*/ @@ -1167,36 +1258,8 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) __delay(tsc_deadline - guest_tsc); } -static enum hrtimer_restart apic_timer_fn(struct hrtimer *data); - -static void __apic_timer_expired(struct hrtimer *data) -{ - int ret, i = 0; - enum hrtimer_restart r; - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - - r = apic_timer_fn(data); - - if (r == HRTIMER_RESTART) { - do { - ret = hrtimer_start_expires(data, HRTIMER_MODE_ABS); - if (ret == -ETIME) - hrtimer_add_expires_ns(&ktimer->timer, - ktimer->period); - i++; - } while (ret == -ETIME && i < 10); - - if (ret == -ETIME) { - printk_once(KERN_ERR "%s: failed to reprogram timer\n", - __func__); - WARN_ON_ONCE(1); - } - } -} - static void start_apic_timer(struct kvm_lapic *apic) { - int ret; ktime_t now; atomic_set(&apic->lapic_timer.pending, 0); @@ -1227,11 +1290,9 @@ static void start_apic_timer(struct kvm_lapic *apic) } } - ret = hrtimer_start(&apic->lapic_timer.timer, + hrtimer_start(&apic->lapic_timer.timer, ktime_add_ns(now, apic->lapic_timer.period), HRTIMER_MODE_ABS); - if (ret == -ETIME) - __apic_timer_expired(&apic->lapic_timer.timer); apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " @@ -1257,16 +1318,14 @@ static void start_apic_timer(struct kvm_lapic *apic) local_irq_save(flags); now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - ret = hrtimer_start(&apic->lapic_timer.timer, + hrtimer_start(&apic->lapic_timer.timer, expire, HRTIMER_MODE_ABS); - if (ret == -ETIME) - __apic_timer_expired(&apic->lapic_timer.timer); } else apic_timer_expired(apic); @@ -1276,16 +1335,17 @@ static void start_apic_timer(struct kvm_lapic *apic) static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) { - int nmi_wd_enabled = apic_lvt_nmi_mode(kvm_apic_get_reg(apic, APIC_LVT0)); + bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val); - if (apic_lvt_nmi_mode(lvt0_val)) { - if (!nmi_wd_enabled) { + if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) { + apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode; + if (lvt0_in_nmi_mode) { apic_debug("Receive NMI setting on APIC_LVT0 " "for cpu %d\n", apic->vcpu->vcpu_id); atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); - } - } else if (nmi_wd_enabled) - atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); + } else + atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); + } } static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) @@ -1573,9 +1633,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((old_value ^ value) & X2APIC_ENABLE) { if (value & X2APIC_ENABLE) { - u32 id = kvm_apic_id(apic); - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_apic_set_ldr(apic, ldr); + kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); } else kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); @@ -1594,7 +1652,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } -void kvm_lapic_reset(struct kvm_vcpu *vcpu) +void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic; int i; @@ -1608,19 +1666,23 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); 
- kvm_apic_set_id(apic, vcpu->vcpu_id); + if (!init_event) + kvm_apic_set_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); - apic_set_reg(apic, APIC_LVT0, - SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) + apic_set_reg(apic, APIC_LVT0, + SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); + apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0)); apic_set_reg(apic, APIC_DFR, 0xffffffffU); apic_set_spiv(apic, 0xff); apic_set_reg(apic, APIC_TASKPRI, 0); - kvm_apic_set_ldr(apic, 0); + if (!apic_x2apic_mode(apic)) + kvm_apic_set_ldr(apic, 0); apic_set_reg(apic, APIC_ESR, 0); apic_set_reg(apic, APIC_ICR, 0); apic_set_reg(apic, APIC_ICR2, 0); @@ -1631,7 +1693,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); + apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu); apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0; apic->highest_isr_cache = -1; update_divide_count(apic); @@ -1750,7 +1812,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ - kvm_lapic_reset(vcpu); + kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); return 0; @@ -1855,7 +1917,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_rtc_eoi_tracking_restore_one(vcpu); + if (ioapic_in_kernel(vcpu->kvm)) + kvm_rtc_eoi_tracking_restore_one(vcpu); + + vcpu->arch.apic_arb_prio = 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) @@ -1867,8 +1932,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) - if (hrtimer_start_expires(timer, HRTIMER_MODE_ABS) == -ETIME) - __apic_timer_expired(timer); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* @@ -1918,8 +1982,9 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) + return; apic_set_tpr(vcpu->arch.apic, data & 0xff); } @@ -1939,7 +2004,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, /* Cache not set: could be safe but we don't bother. */ apic->highest_isr_cache == -1 || /* Need EOI to update ioapic. */ - kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { /* * PV EOI was disabled by apic_sync_pv_eoi_from_guest * so we need not do anything here. 
@@ -1995,7 +2060,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_ICR2) @@ -2012,7 +2077,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_DFR || reg == APIC_ICR2) { @@ -2084,11 +2149,22 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events) return; - pe = xchg(&apic->pending_events, 0); + /* + * INITs are latched while in SMM. Because an SMM CPU cannot + * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs + * and delay processing of INIT until the next RSM. + */ + if (is_smm(vcpu)) { + WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); + if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) + clear_bit(KVM_APIC_SIPI, &apic->pending_events); + return; + } + pe = xchg(&apic->pending_events, 0); if (test_bit(KVM_APIC_INIT, &pe)) { - kvm_lapic_reset(vcpu); - kvm_vcpu_reset(vcpu); + kvm_lapic_reset(vcpu, true); + kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else diff --git a/kernel/arch/x86/kvm/lapic.h b/kernel/arch/x86/kvm/lapic.h index c4ea87eed..fde8e35d5 100644 --- a/kernel/arch/x86/kvm/lapic.h +++ b/kernel/arch/x86/kvm/lapic.h @@ -26,6 +26,7 @@ struct kvm_lapic { struct kvm_vcpu *vcpu; bool sw_enabled; bool irr_pending; + bool lvt0_in_nmi_mode; /* Number of bits set in ISR. */ s16 isr_count; /* The highest vector set in ISR; if -1 - invalid, must scan ISR. 
*/ @@ -48,7 +49,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); void kvm_apic_accept_events(struct kvm_vcpu *vcpu); -void kvm_lapic_reset(struct kvm_vcpu *vcpu); +void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); @@ -56,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, @@ -90,7 +90,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) { - return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; + return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; } int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); @@ -143,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) return apic->vcpu->arch.apic_base & X2APIC_ENABLE; } -static inline bool kvm_apic_vid_enabled(struct kvm *kvm) +static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->vm_has_apicv(kvm); + return kvm_x86_ops->cpu_uses_apicv(vcpu); } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) @@ -153,8 +153,21 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; } +static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) +{ + return (irq->delivery_mode == APIC_DM_LOWEST || + irq->msi_redir_hint); +} + +static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); +} + bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); #endif diff --git a/kernel/arch/x86/kvm/mmu.c b/kernel/arch/x86/kvm/mmu.c index 554e877e0..8eb8a934b 100644 --- a/kernel/arch/x86/kvm/mmu.c +++ b/kernel/arch/x86/kvm/mmu.c @@ -223,15 +223,15 @@ static unsigned int get_mmio_spte_generation(u64 spte) return gen; } -static unsigned int kvm_current_mmio_generation(struct kvm *kvm) +static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) { - return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; + return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; } -static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, +static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - unsigned int gen = kvm_current_mmio_generation(kvm); + unsigned int gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; @@ -258,22 +258,22 @@ static unsigned get_mmio_spte_access(u64 spte) return (spte & ~mask) & ~PAGE_MASK; } -static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) { if 
(unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(kvm, sptep, gfn, access); + mark_mmio_spte(vcpu, sptep, gfn, access); return true; } return false; } -static bool check_mmio_spte(struct kvm *kvm, u64 spte) +static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { unsigned int kvm_gen, spte_gen; - kvm_gen = kvm_current_mmio_generation(kvm); + kvm_gen = kvm_current_mmio_generation(vcpu); spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); @@ -781,30 +781,36 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } -static void account_shadowed(struct kvm *kvm, gfn_t gfn) +static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; + gfn_t gfn; int i; - slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + gfn = sp->gfn; + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count += 1; } kvm->arch.indirect_shadow_pages++; } -static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) +static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; + gfn_t gfn; int i; - slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + gfn = sp->gfn; + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count -= 1; WARN_ON(linfo->write_count < 0); @@ -812,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) kvm->arch.indirect_shadow_pages--; } -static int has_wrprotected_page(struct kvm *kvm, - gfn_t gfn, - int level) +static int __has_wrprotected_page(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = gfn_to_memslot(kvm, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); return linfo->write_count; @@ -828,6 +831,14 @@ static int has_wrprotected_page(struct kvm *kvm, return 1; } +static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return __has_wrprotected_page(gfn, level, slot); +} + static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { unsigned long page_size; @@ -835,8 +846,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) page_size = kvm_host_page_size(kvm, gfn); - for (i = PT_PAGE_TABLE_LEVEL; - i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { if (page_size >= KVM_HPAGE_SIZE(i)) ret = i; else @@ -846,28 +856,43 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } +static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, + bool no_dirty_log) +{ + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return false; + if (no_dirty_log && slot->dirty_bitmap) + return false; + + return true; +} + static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; - slot 
= gfn_to_memslot(vcpu->kvm, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!memslot_valid_for_gpte(slot, no_dirty_log)) slot = NULL; return slot; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); -} - -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, + bool *force_pt_level) { int host_level, level, max_level; + struct kvm_memory_slot *slot; + + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -877,7 +902,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) + if (__has_wrprotected_page(large_gfn, level, slot)) break; return level - 1; @@ -1019,12 +1044,14 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, /* * Take gfn and return the reverse mapping to it. */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; - slot = gfn_to_memslot(kvm, gfn); - return __gfn_to_rmap(gfn, level, slot); + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); + return __gfn_to_rmap(gfn, sp->role.level, slot); } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1042,7 +1069,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); return pte_list_add(vcpu, spte, rmapp); } @@ -1054,7 +1081,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) sp = page_header(__pa(spte)); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(kvm, gfn, sp); pte_list_remove(spte, rmapp); } @@ -1119,6 +1146,11 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) return NULL; } +#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \ + for (_spte_ = rmap_get_first(*_rmap_, _iter_); \ + _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \ + _spte_ = rmap_get_next(_iter_)) + static void drop_spte(struct kvm *kvm, u64 *sptep) { if (mmu_spte_clear_track_bits(sptep)) @@ -1182,12 +1214,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_write_protect(kvm, sptep, pt_protect); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1209,12 +1237,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_clear_dirty(kvm, 
sptep); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1236,12 +1260,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_set_dirty(kvm, sptep); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1319,42 +1339,45 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static bool rmap_write_protect(struct kvm *kvm, u64 gfn) +static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; unsigned long *rmapp; int i; bool write_protected = false; - slot = gfn_to_memslot(kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, true); + write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true); } return write_protected; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp) { u64 *sptep; struct rmap_iterator iter; - int need_tlb_flush = 0; + bool flush = false; while ((sptep = rmap_get_first(*rmapp, &iter))) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); drop_spte(kvm, sptep); - need_tlb_flush = 1; + flush = true; } - return need_tlb_flush; + return flush; +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) +{ + return kvm_zap_rmapp(kvm, rmapp); } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, @@ -1371,8 +1394,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!is_shadow_present_pte(*sptep)); +restart: + for_each_rmap_spte(rmapp, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); @@ -1380,7 +1403,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, if (pte_write(*ptep)) { drop_spte(kvm, sptep); - sptep = rmap_get_first(*rmapp, &iter); + goto restart; } else { new_spte = *sptep & ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; @@ -1391,7 +1414,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); - sptep = rmap_get_next(&iter); } } @@ -1401,6 +1423,74 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, return 0; } +struct slot_rmap_walk_iterator { + /* input fields. */ + struct kvm_memory_slot *slot; + gfn_t start_gfn; + gfn_t end_gfn; + int start_level; + int end_level; + + /* output fields. */ + gfn_t gfn; + unsigned long *rmap; + int level; + + /* private field. 
*/ + unsigned long *end_rmap; +}; + +static void +rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) +{ + iterator->level = level; + iterator->gfn = iterator->start_gfn; + iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); + iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, + iterator->slot); +} + +static void +slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, + struct kvm_memory_slot *slot, int start_level, + int end_level, gfn_t start_gfn, gfn_t end_gfn) +{ + iterator->slot = slot; + iterator->start_level = start_level; + iterator->end_level = end_level; + iterator->start_gfn = start_gfn; + iterator->end_gfn = end_gfn; + + rmap_walk_init_level(iterator, iterator->start_level); +} + +static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) +{ + return !!iterator->rmap; +} + +static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) +{ + if (++iterator->rmap <= iterator->end_rmap) { + iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + return; + } + + if (++iterator->level > iterator->end_level) { + iterator->rmap = NULL; + return; + } + + rmap_walk_init_level(iterator, iterator->level); +} + +#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \ + _start_gfn, _end_gfn, _iter_) \ + for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \ + _end_level_, _start_gfn, _end_gfn); \ + slot_rmap_walk_okay(_iter_); \ + slot_rmap_walk_next(_iter_)) + static int kvm_handle_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, @@ -1412,48 +1502,36 @@ static int kvm_handle_hva_range(struct kvm *kvm, int level, unsigned long data)) { - int j; - int ret = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + struct slot_rmap_walk_iterator iterator; + int ret = 0; + int i; - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for (j = PT_PAGE_TABLE_LEVEL; - j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { - unsigned long idx, idx_end; - unsigned long *rmapp; - gfn_t gfn = gfn_start; + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; /* - * {idx(page_j) | page_j intersects with - * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}. + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. 
*/ - idx = gfn_to_index(gfn_start, memslot->base_gfn, j); - idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j); - - rmapp = __gfn_to_rmap(gfn_start, j, memslot); - - for (; idx <= idx_end; - ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j))) - ret |= handler(kvm, rmapp++, memslot, - gfn, j, data); + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, + gfn_start, gfn_end - 1, + &iterator) + ret |= handler(kvm, iterator.rmap, memslot, + iterator.gfn, iterator.level, data); } } @@ -1495,16 +1573,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, BUG_ON(!shadow_accessed_mask); - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { - BUG_ON(!is_shadow_present_pte(*sptep)); - + for_each_rmap_spte(rmapp, &iter, sptep) if (*sptep & shadow_accessed_mask) { young = 1; clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } - } + trace_kvm_age_page(gfn, level, slot, young); return young; } @@ -1525,15 +1600,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { - BUG_ON(!is_shadow_present_pte(*sptep)); - + for_each_rmap_spte(rmapp, &iter, sptep) if (*sptep & shadow_accessed_mask) { young = 1; break; } - } out: return young; } @@ -1547,7 +1618,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); @@ -1967,7 +2038,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, bool protected = false; for_each_sp(pages, sp, parents, i) - protected |= rmap_write_protect(vcpu->kvm, sp->gfn); + protected |= rmap_write_protect(vcpu, sp->gfn); if (protected) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2065,12 +2136,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_add_head(&sp->hash_link, &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { - if (rmap_write_protect(vcpu->kvm, gfn)) + if (rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); if (level > PT_PAGE_TABLE_LEVEL && need_sync) kvm_sync_pages(vcpu, gfn); - account_shadowed(vcpu->kvm, gfn); + account_shadowed(vcpu->kvm, sp); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; init_shadow_page_table(sp); @@ -2251,7 +2322,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_mmu_unlink_parents(kvm, sp); if (!sp->role.invalid && !sp->role.direct) - unaccount_shadowed(kvm, sp->gfn); + unaccount_shadowed(kvm, sp); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); @@ -2363,111 +2434,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -/* - * The function is based on mtrr_type_lookup() in - * arch/x86/kernel/cpu/mtrr/generic.c - */ -static int get_mtrr_type(struct mtrr_state_type *mtrr_state, - u64 start, u64 end) -{ - int i; - u64 base, mask; - u8 prev_match, curr_match; - int num_var_ranges = KVM_NR_VAR_MTRR; - - if (!mtrr_state->enabled) - return 0xFF; - - /* Make end inclusive end, instead of exclusive */ - end--; - - /* Look in fixed ranges. 
Just return the type as per start */ - if (mtrr_state->have_fixed && (start < 0x100000)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0x1000000) { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state->fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ - if (!(mtrr_state->enabled & 2)) - return mtrr_state->def_type; - - prev_match = 0xFF; - for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; - - if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) - continue; - - base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + - (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); - mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + - (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); - - start_state = ((start & mask) == (base & mask)); - end_state = ((end & mask) == (base & mask)); - if (start_state != end_state) - return 0xFE; - - if ((start & mask) != (base & mask)) - continue; - - curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { - prev_match = curr_match; - continue; - } - - if (prev_match == MTRR_TYPE_UNCACHABLE || - curr_match == MTRR_TYPE_UNCACHABLE) - return MTRR_TYPE_UNCACHABLE; - - if ((prev_match == MTRR_TYPE_WRBACK && - curr_match == MTRR_TYPE_WRTHROUGH) || - (prev_match == MTRR_TYPE_WRTHROUGH && - curr_match == MTRR_TYPE_WRBACK)) { - prev_match = MTRR_TYPE_WRTHROUGH; - curr_match = MTRR_TYPE_WRTHROUGH; - } - - if (prev_match != curr_match) - return MTRR_TYPE_UNCACHABLE; - } - - if (prev_match != 0xFF) - return prev_match; - - return mtrr_state->def_type; -} - -u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u8 mtrr; - - mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, - (gfn << PAGE_SHIFT) + PAGE_SIZE); - if (mtrr == 0xfe || mtrr == 0xff) - mtrr = MTRR_TYPE_WRBACK; - return mtrr; -} -EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); - static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); @@ -2510,6 +2476,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 0; } +static bool kvm_is_mmio_pfn(pfn_t pfn) +{ + if (pfn_valid(pfn)) + return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); + + return true; +} + static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int level, gfn_t gfn, pfn_t pfn, bool speculative, @@ -2518,7 +2492,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte; int ret = 0; - if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) + if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; spte = PT_PRESENT_MASK; @@ -2537,7 +2511,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, - kvm_is_reserved_pfn(pfn)); + kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= SPTE_HOST_WRITEABLE; @@ -2555,7 +2529,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * be fixed if guest refault. 
*/ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu->kvm, gfn, level)) + has_wrprotected_page(vcpu, gfn, level)) goto done; spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; @@ -2579,7 +2553,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } if (pte_access & ACC_WRITE_MASK) { - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); spte |= shadow_dirty_mask; } @@ -2669,15 +2643,17 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, u64 *start, u64 *end) { struct page *pages[PTE_PREFETCH_NUM]; + struct kvm_memory_slot *slot; unsigned access = sp->role.access; int i, ret; gfn_t gfn; gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); - if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); + if (!slot) return -1; - ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); + ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); if (ret <= 0) return -1; @@ -2795,7 +2771,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) return 1; if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); + kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); return 0; } @@ -2818,7 +2794,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && - !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { + !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; /* * mmu_notifier_retry was successful and we hold the @@ -2910,7 +2886,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * Compare with set_spte where instead shadow_dirty_mask is set. */ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); return true; } @@ -3006,14 +2982,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - int force_pt_level; + bool force_pt_level = false; pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); /* * This path builds a PAE pagetable - so we can map * 2mb pages at maximum. 
Therefore check if the level @@ -3023,8 +2998,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, v, level, error_code)) return 0; @@ -3312,6 +3286,25 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); } +static bool +__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) +{ + int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f; + + return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) | + ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0); +} + +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level); +} + +static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) +{ + return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); +} + static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) { if (direct) @@ -3320,37 +3313,69 @@ static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) return vcpu_match_mmio_gva(vcpu, addr); } -static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) +/* return true if reserved bit is detected on spte. */ +static bool +walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { struct kvm_shadow_walk_iterator iterator; - u64 spte = 0ull; + u64 sptes[PT64_ROOT_LEVEL], spte = 0ull; + int root, leaf; + bool reserved = false; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return spte; + goto exit; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) + + for (shadow_walk_init(&iterator, vcpu, addr), + leaf = root = iterator.level; + shadow_walk_okay(&iterator); + __shadow_walk_next(&iterator, spte)) { + spte = mmu_spte_get_lockless(iterator.sptep); + + sptes[leaf - 1] = spte; + leaf--; + if (!is_shadow_present_pte(spte)) break; + + reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte, + iterator.level); + } + walk_shadow_page_lockless_end(vcpu); - return spte; + if (reserved) { + pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", + __func__, addr); + while (root > leaf) { + pr_err("------ spte 0x%llx level %d.\n", + sptes[root - 1], root); + root--; + } + } +exit: + *sptep = spte; + return reserved; } -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) +int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; + bool reserved; if (quickly_check_mmio_pf(vcpu, addr, direct)) return RET_MMIO_PF_EMULATE; - spte = walk_shadow_page_get_mmio_spte(vcpu, addr); + reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); + if (WARN_ON(reserved)) + return RET_MMIO_PF_BUG; if (is_mmio_spte(spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); - if (!check_mmio_spte(vcpu->kvm, spte)) + if (!check_mmio_spte(vcpu, spte)) return RET_MMIO_PF_INVALID; if (direct) @@ -3367,17 +3392,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) */ return RET_MMIO_PF_RETRY; } -EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); - -static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, - u32 error_code, bool direct) -{ - int ret; - - ret = handle_mmio_page_fault_common(vcpu, addr, direct); - WARN_ON(ret == 
RET_MMIO_PF_BUG); - return ret; -} +EXPORT_SYMBOL_GPL(handle_mmio_page_fault); static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) @@ -3388,7 +3403,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gva, error_code, true); + r = handle_mmio_page_fault(vcpu, gva, true); if (likely(r != RET_MMIO_PF_INVALID)) return r; @@ -3415,12 +3430,12 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); + return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) { - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; @@ -3430,10 +3445,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu) static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable) { + struct kvm_memory_slot *slot; bool async; - *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); - + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + async = false; + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); if (!async) return false; /* *pfn has correct page already */ @@ -3447,18 +3464,27 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return true; } - *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); - + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); return false; } +static bool +check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + int page_num = KVM_PAGES_PER_HPAGE(level); + + gfn &= ~(page_num - 1); + + return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); +} + static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, bool prefault) { pfn_t pfn; int r; int level; - int force_pt_level; + bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -3467,7 +3493,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gpa, error_code, true); + r = handle_mmio_page_fault(vcpu, gpa, true); if (likely(r != RET_MMIO_PF_INVALID)) return r; @@ -3477,12 +3503,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, + PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); + if (level > PT_DIRECTORY_LEVEL && + !check_hugepage_cache_consistency(vcpu, gfn, level)) + level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, gpa, level, error_code)) return 0; @@ -3545,7 +3574,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, vcpu->arch.mmu.inject_page_fault(vcpu, fault); } -static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +static bool 
sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { @@ -3555,7 +3584,7 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, } (*nr_present)++; - mark_mmio_spte(kvm, sptep, gfn, access); + mark_mmio_spte(vcpu, sptep, gfn, access); return true; } @@ -3584,111 +3613,197 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp #include "paging_tmpl.h" #undef PTTYPE -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void +__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct rsvd_bits_validate *rsvd_check, + int maxphyaddr, int level, bool nx, bool gbpages, + bool pse, bool amd) { - int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; - context->bad_mt_xwr = 0; + rsvd_check->bad_mt_xwr = 0; - if (!context->nx) + if (!nx) exb_bit_rsvd = rsvd_bits(63, 63); - if (!guest_cpuid_has_gbpages(vcpu)) + if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); /* * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for * leaf entries) on AMD CPUs only. */ - if (guest_cpuid_is_amd(vcpu)) + if (amd) nonleaf_bit8_rsvd = rsvd_bits(8, 8); - switch (context->root_level) { + switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ - context->rsvd_bits_mask[0][1] = 0; - context->rsvd_bits_mask[0][0] = 0; - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[0][1] = 0; + rsvd_check->rsvd_bits_mask[0][0] = 0; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; - if (!is_pse(vcpu)) { - context->rsvd_bits_mask[1][1] = 0; + if (!pse) { + rsvd_check->rsvd_bits_mask[1][1] = 0; break; } if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: - context->rsvd_bits_mask[0][2] = + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 63) | rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PDE */ - context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PTE */ - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_LEVEL: - context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | + nonleaf_bit8_rsvd | rsvd_bits(7, 7) | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | + nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); - 
context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[1][3] = + rsvd_check->rsvd_bits_mask[0][3]; + rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 29); - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; break; } } -static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly) +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - int maxphyaddr = cpuid_maxphyaddr(vcpu); - int pte; + __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, + cpuid_maxphyaddr(vcpu), context->root_level, + context->nx, guest_cpuid_has_gbpages(vcpu), + is_pse(vcpu), guest_cpuid_is_amd(vcpu)); +} + +static void +__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, + int maxphyaddr, bool execonly) +{ + u64 bad_mt_xwr; - context->rsvd_bits_mask[0][3] = + rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - context->rsvd_bits_mask[0][2] = + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - context->rsvd_bits_mask[0][1] = + rsvd_check->rsvd_bits_mask[0][1] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); /* large page */ - context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = + rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; + rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); - context->rsvd_bits_mask[1][1] = + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; - - for (pte = 0; pte < 64; pte++) { - int rwx_bits = pte & 7; - int mt = pte >> 3; - if (mt == 0x2 || mt == 0x3 || mt == 0x7 || - rwx_bits == 0x2 || rwx_bits == 0x6 || - (rwx_bits == 0x4 && !execonly)) - context->bad_mt_xwr |= (1ull << pte); + rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; + + bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ + bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ + bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ + if (!execonly) { + /* bits 0..2 must not be 100 unless VMX capabilities allow it */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } + rsvd_check->bad_mt_xwr = bad_mt_xwr; +} + +static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, + cpuid_maxphyaddr(vcpu), execonly); +} + +/* + * the page table on host is the shadow page table for the page + * table in guest or amd nested guest, its mmu features completely + * follow the features in guest. 
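The EPT variant above replaces the old per-PTE loop with a handful of byte-wide ORs: each byte of bad_mt_xwr corresponds to one memory-type value and each bit within the byte to one XWR combination. A small self-contained check that the vectorised construction and the old loop agree (REPEAT_BYTE assumed to be the usual 0x0101...01 multiply):

#include <stdint.h>
#include <stdio.h>

#define REPEAT_BYTE(x)  (0x0101010101010101ull * (x))

int main(void)
{
	uint64_t fast, slow = 0;
	int execonly = 0, pte;

	/* construction as in __reset_rsvds_bits_mask_ept() */
	fast  = 0xFFull << (2 * 8);             /* memory type 2 is reserved */
	fast |= 0xFFull << (3 * 8);             /* memory type 3 is reserved */
	fast |= 0xFFull << (7 * 8);             /* memory type 7 is reserved */
	fast |= REPEAT_BYTE(1ull << 2);         /* XWR == 010: write-only */
	fast |= REPEAT_BYTE(1ull << 6);         /* XWR == 110: write+exec, no read */
	if (!execonly)
		fast |= REPEAT_BYTE(1ull << 4); /* XWR == 100 needs exec-only support */

	/* the loop the old reset_rsvds_bits_mask_ept() used */
	for (pte = 0; pte < 64; pte++) {
		int rwx = pte & 7, mt = pte >> 3;

		if (mt == 2 || mt == 3 || mt == 7 ||
		    rwx == 2 || rwx == 6 || (rwx == 4 && !execonly))
			slow |= 1ull << pte;
	}

	printf("fast=%#llx slow=%#llx match=%d\n",
	       (unsigned long long)fast, (unsigned long long)slow, fast == slow);
	return 0;
}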
+ */ +void +reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +{ + bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + + /* + * Passing "true" to the last argument is okay; it adds a check + * on bit 8 of the SPTEs which KVM doesn't use anyway. + */ + __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + context->shadow_root_level, uses_nx, + guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), + true); +} +EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); + +static inline bool boot_cpu_is_amd(void) +{ + WARN_ON_ONCE(!tdp_enabled); + return shadow_x_mask == 0; +} + +/* + * the direct page table on host, use as much mmu features as + * possible, however, kvm currently does not do execution-protection. + */ +static void +reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + if (boot_cpu_is_amd()) + __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + context->shadow_root_level, false, + cpu_has_gbpages, true, true); + else + __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + false); + +} + +/* + * as the comments in reset_shadow_zero_bits_mask() except it + * is the shadow page table for intel nested guest. + */ +static void +reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, execonly); } static void update_permission_bitmask(struct kvm_vcpu *vcpu, @@ -3833,6 +3948,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; + context->base_role.smm = is_smm(vcpu); context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -3868,6 +3984,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); + reset_tdp_shadow_zero_bits_mask(vcpu, context); } void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) @@ -3894,6 +4011,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) = smep && !is_write_protection(vcpu); context->base_role.smap_andnot_wp = smap && !is_write_protection(vcpu); + context->base_role.smm = is_smm(vcpu); + reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -3917,6 +4036,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) update_permission_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); + reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); @@ -4065,7 +4185,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); + r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8); if (r) gentry = 0; new = (const u8 *)&gentry; @@ -4177,6 +4297,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.nxe = 1; mask.smep_andnot_wp = 1; mask.smap_andnot_wp = 1; + mask.smm = 1; /* * If we don't have indirect shadow pages, it means no page is @@ -4375,36 +4496,115 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot) +/* The return value indicates if tlb flush on all vcpus is 
needed. */ +typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap); + +/* The caller should hold mmu-lock before calling this function. */ +static bool +slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) { - gfn_t last_gfn; - int i; + struct slot_rmap_walk_iterator iterator; bool flush = false; - last_gfn = memslot->base_gfn + memslot->npages - 1; + for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap); - spin_lock(&kvm->mmu_lock); + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + cond_resched_lock(&kvm->mmu_lock); + } + } + + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; + return flush; +} + +static bool +slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + bool lock_flush_tlb) +{ + return slot_handle_level_range(kvm, memslot, fn, start_level, + end_level, memslot->base_gfn, + memslot->base_gfn + memslot->npages - 1, + lock_flush_tlb); +} + +static bool +slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static bool +slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static bool +slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_PAGE_TABLE_LEVEL, lock_flush_tlb); +} - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int i; - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_write_protect(kvm, rmapp, - false); + spin_lock(&kvm->mmu_lock); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); + slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); } } spin_unlock(&kvm->mmu_lock); +} + +static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp) +{ + return __rmap_write_protect(kvm, rmapp, false); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + bool flush; + + spin_lock(&kvm->mmu_lock); + flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect, + false); + spin_unlock(&kvm->mmu_lock); /* * 
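The slot_handle_*() helpers introduced above collapse the previously open-coded memslot loops into one walker driven by a per-rmap callback; the walker accumulates whether anything changed so the TLB flush can be batched and mmu_lock dropped at resched points. A deliberately reduced userspace sketch of the callback shape (no locking, bucket layout invented for illustration; the real handlers also take the struct kvm pointer):

#include <stdbool.h>
#include <stdio.h>

typedef bool (*slot_level_handler)(unsigned long *rmap);

static bool walk_buckets(unsigned long *rmaps, int nr, slot_level_handler fn)
{
	bool flush = false;

	for (int i = 0; i < nr; i++)
		if (rmaps[i])                /* skip empty buckets, as the walker does */
			flush |= fn(&rmaps[i]);

	return flush;                        /* caller decides when to flush */
}

static bool clear_bucket(unsigned long *rmap)
{
	*rmap = 0;
	return true;                         /* something changed */
}

int main(void)
{
	unsigned long rmaps[4] = { 0, 3, 0, 7 };

	printf("flush=%d\n", walk_buckets(rmaps, 4, clear_bucket));
	return 0;
}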
kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() @@ -4437,9 +4637,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, pfn_t pfn; struct kvm_mmu_page *sp; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - +restart: + for_each_rmap_spte(rmapp, &iter, sptep) { sp = page_header(__pa(sptep)); pfn = spte_to_pfn(*sptep); @@ -4454,71 +4653,31 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, !kvm_is_reserved_pfn(pfn) && PageTransCompound(pfn_to_page(pfn))) { drop_spte(kvm, sptep); - sptep = rmap_get_first(*rmapp, &iter); need_tlb_flush = 1; - } else - sptep = rmap_get_next(&iter); + goto restart; + } } return need_tlb_flush; } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { - bool flush = false; - unsigned long *rmapp; - unsigned long last_index, index; - + /* FIXME: const-ify all uses of struct kvm_memory_slot. */ spin_lock(&kvm->mmu_lock); - - rmapp = memslot->arch.rmap[0]; - last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1, - memslot->base_gfn, PT_PAGE_TABLE_LEVEL); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - if (flush) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - cond_resched_lock(&kvm->mmu_lock); - } - } - - if (flush) - kvm_flush_remote_tlbs(kvm); - + slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, + kvm_mmu_zap_collapsible_spte, true); spin_unlock(&kvm->mmu_lock); } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - unsigned long *rmapp; - unsigned long last_index, index; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, - PT_PAGE_TABLE_LEVEL); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_clear_dirty(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - + flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); spin_unlock(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); @@ -4537,31 +4696,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - int i; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; - - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_write_protect(kvm, rmapp, - false); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - } + flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, + false); spin_unlock(&kvm->mmu_lock); /* see kvm_mmu_slot_remove_write_access */ @@ -4575,31 +4714,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); void kvm_mmu_slot_set_dirty(struct kvm *kvm, 
struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - int i; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; - - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_set_dirty(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - } - + flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); spin_unlock(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); @@ -4696,13 +4814,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) { /* * The very rare case: if the generation-number is round, * zap all shadow pages. */ - if (unlikely(kvm_current_mmio_generation(kvm) == 0)) { + if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } @@ -4824,41 +4942,22 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) unsigned int nr_pages = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + int i; - slots = kvm_memslots(kvm); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) - nr_pages += memslot->npages; + kvm_for_each_memslot(memslot, slots) + nr_pages += memslot->npages; + } nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); return nr_mmu_pages; } -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) -{ - struct kvm_shadow_walk_iterator iterator; - u64 spte; - int nr_sptes = 0; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return nr_sptes; - - walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { - sptes[iterator.level-1] = spte; - nr_sptes++; - if (!is_shadow_present_pte(spte)) - break; - } - walk_shadow_page_lockless_end(vcpu); - - return nr_sptes; -} -EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); - void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); diff --git a/kernel/arch/x86/kvm/mmu.h b/kernel/arch/x86/kvm/mmu.h index 0ada65ecd..55ffb7b0f 100644 --- a/kernel/arch/x86/kvm/mmu.h +++ b/kernel/arch/x86/kvm/mmu.h @@ -43,23 +43,26 @@ #define PT_PDPE_LEVEL 3 #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 +#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1) static inline u64 rsvd_bits(int s, int e) { return ((1ULL << (e - s + 1)) - 1) << s; } -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); +void +reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); + /* - * Return values of handle_mmio_page_fault_common: + * Return values of handle_mmio_page_fault: * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction * directly. 
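kvm_mmu_calculate_mmu_pages() above now sums pages across every address space before applying the same budget formula: a fixed per-mille of guest pages, clamped to a minimum. A quick sketch of the arithmetic; the two constants are quoted from the kernel headers of this period and shown here only for illustration:

#include <stdio.h>

#define KVM_PERMILLE_MMU_PAGES  20
#define KVM_MIN_ALLOC_MMU_PAGES 64

static unsigned int calc_mmu_pages(unsigned long long guest_pages)
{
	unsigned long long nr = guest_pages * KVM_PERMILLE_MMU_PAGES / 1000;

	return nr < KVM_MIN_ALLOC_MMU_PAGES ? KVM_MIN_ALLOC_MMU_PAGES : nr;
}

int main(void)
{
	/* a 4 GiB guest has 1048576 4K pages across all its memslots */
	printf("%u shadow pages budgeted\n", calc_mmu_pages(1048576ull));
	return 0;
}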
* RET_MMIO_PF_INVALID: invalid spte is detected then let the real page * fault path update the mmio spte. * RET_MMIO_PF_RETRY: let CPU fault again on the address. - * RET_MMIO_PF_BUG: bug is detected. + * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). */ enum { RET_MMIO_PF_EMULATE = 1, @@ -68,7 +71,7 @@ enum { RET_MMIO_PF_BUG = -1 }; -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); +int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); @@ -170,4 +173,5 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); #endif diff --git a/kernel/arch/x86/kvm/mmu_audit.c b/kernel/arch/x86/kvm/mmu_audit.c index 9ade5cfb5..03d518e49 100644 --- a/kernel/arch/x86/kvm/mmu_audit.c +++ b/kernel/arch/x86/kvm/mmu_audit.c @@ -114,7 +114,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) return; gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn); if (is_error_pfn(pfn)) return; @@ -131,12 +131,16 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); unsigned long *rmapp; struct kvm_mmu_page *rev_sp; + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; gfn_t gfn; rev_sp = page_header(__pa(sptep)); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - if (!gfn_to_memslot(kvm, gfn)) { + slots = kvm_memslots_for_spte_role(kvm, rev_sp->role); + slot = __gfn_to_memslot(slots, gfn); + if (!slot) { if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no memslot for gfn %llx\n", gfn); @@ -146,7 +150,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); + rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot); if (!*rmapp) { if (!__ratelimit(&ratelimit_state)) return; @@ -191,19 +195,21 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) unsigned long *rmapp; u64 *sptep; struct rmap_iterator iter; + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; if (sp->role.direct || sp->unsync || sp->role.invalid) return; - rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL); + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, sp->gfn); + rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { + for_each_rmap_spte(rmapp, &iter, sptep) if (is_writable_pte(*sptep)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); - } } static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -291,7 +297,7 @@ static int mmu_audit_set(const char *val, const struct kernel_param *kp) return 0; } -static struct kernel_param_ops audit_param_ops = { +static const struct kernel_param_ops audit_param_ops = { .set = mmu_audit_set, .get = param_get_bool, }; diff --git a/kernel/arch/x86/kvm/mmutrace.h b/kernel/arch/x86/kvm/mmutrace.h index ce463a9cc..5a24b846a 100644 --- a/kernel/arch/x86/kvm/mmutrace.h +++ b/kernel/arch/x86/kvm/mmutrace.h @@ -2,7 +2,7 @@ #define _TRACE_KVMMMU_H #include <linux/tracepoint.h> -#include 
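All of the reserved-bit masks in this file are built from the rsvd_bits() helper kept in mmu.h above, which just produces a mask with an inclusive range of bits set. A quick standalone check of two typical uses:

#include <stdint.h>
#include <stdio.h>

/* same helper as in mmu.h: bits s..e (inclusive) set */
static uint64_t rsvd_bits(int s, int e)
{
	return ((1ull << (e - s + 1)) - 1) << s;
}

int main(void)
{
	/* a CPU with 40 physical address bits reserves PTE bits 40..51 */
	printf("rsvd_bits(40, 51) = %#llx\n",
	       (unsigned long long)rsvd_bits(40, 51));
	/* bits 13..20 are reserved in a 2M large-page PDE */
	printf("rsvd_bits(13, 20) = %#llx\n",
	       (unsigned long long)rsvd_bits(13, 20));
	return 0;
}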
<linux/ftrace_event.h> +#include <linux/trace_events.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu diff --git a/kernel/arch/x86/kvm/mtrr.c b/kernel/arch/x86/kvm/mtrr.c new file mode 100644 index 000000000..3f8c73211 --- /dev/null +++ b/kernel/arch/x86/kvm/mtrr.c @@ -0,0 +1,732 @@ +/* + * vMTRR implementation + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright(C) 2015 Intel Corporation. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * Marcelo Tosatti <mtosatti@redhat.com> + * Paolo Bonzini <pbonzini@redhat.com> + * Xiao Guangrong <guangrong.xiao@linux.intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/kvm_host.h> +#include <asm/mtrr.h> + +#include "cpuid.h" +#include "mmu.h" + +#define IA32_MTRR_DEF_TYPE_E (1ULL << 11) +#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10) +#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff) + +static bool msr_mtrr_valid(unsigned msr) +{ + switch (msr) { + case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MSR_MTRRfix64K_00000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_CR_PAT: + return true; + case 0x2f8: + return true; + } + return false; +} + +static bool valid_pat_type(unsigned t) +{ + return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ +} + +static bool valid_mtrr_type(unsigned t) +{ + return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ +} + +bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int i; + u64 mask; + + if (!msr_mtrr_valid(msr)) + return false; + + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return false; + return valid_mtrr_type(data & 0xff); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8 ; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } + + /* variable MTRRs */ + WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); + + mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + if ((msr & 1) == 0) { + /* MTRR base */ + if (!valid_mtrr_type(data & 0xff)) + return false; + mask |= 0xf00; + } else + /* MTRR mask */ + mask |= 0x7ff; + if (data & mask) { + kvm_inject_gp(vcpu, 0); + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(kvm_mtrr_valid); + +static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state) +{ + return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E); +} + +static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state) +{ + return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE); +} + +static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state) +{ + return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK; +} + +static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu) +{ + /* + * Intel SDM 11.11.2.2: all MTRRs are disabled when + * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC + * memory type is applied to all of physical memory. + * + * However, virtual machines can be run with CPUID such that + * there are no MTRRs. 
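kvm_mtrr_valid() above rejects any write that sets bits above the guest's physical address width, plus the per-MSR reserved bits: 8..11 in a PhysBase register and 0..10 in a PhysMask register. A hedged userspace sketch of just that mask logic (the memory-type validation for PhysBase is left out):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool var_mtrr_write_ok(uint32_t msr, uint64_t data, int maxphyaddr)
{
	uint64_t rsvd = ~0ull << maxphyaddr;      /* bits above the address width */

	if ((msr & 1) == 0)                       /* PhysBase: bits 8..11 reserved */
		rsvd |= 0xf00;
	else                                      /* PhysMask: bits 0..10 reserved */
		rsvd |= 0x7ff;

	return (data & rsvd) == 0;                /* any reserved bit set => #GP */
}

int main(void)
{
	int maxphyaddr = 36;

	/* PhysMask0 (0x201): valid bit 11 set, mask covering a 256M region */
	printf("%d\n", var_mtrr_write_ok(0x201, 0xff0000800ull, maxphyaddr));
	/* same write with a bit above maxphyaddr set: rejected */
	printf("%d\n", var_mtrr_write_ok(0x201, (1ull << 40) | 0x800, maxphyaddr));
	return 0;
}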
In that case, the firmware will never + * enable MTRRs and it is obviously undesirable to run the + * guest entirely with UC memory and we use WB. + */ + if (guest_cpuid_has_mtrr(vcpu)) + return MTRR_TYPE_UNCACHABLE; + else + return MTRR_TYPE_WRBACK; +} + +/* +* Three terms are used in the following code: +* - segment, it indicates the address segments covered by fixed MTRRs. +* - unit, it corresponds to the MSR entry in the segment. +* - range, a range is covered in one memory cache type. +*/ +struct fixed_mtrr_segment { + u64 start; + u64 end; + + int range_shift; + + /* the start position in kvm_mtrr.fixed_ranges[]. */ + int range_start; +}; + +static struct fixed_mtrr_segment fixed_seg_table[] = { + /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */ + { + .start = 0x0, + .end = 0x80000, + .range_shift = 16, /* 64K */ + .range_start = 0, + }, + + /* + * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units, + * 16K fixed mtrr. + */ + { + .start = 0x80000, + .end = 0xc0000, + .range_shift = 14, /* 16K */ + .range_start = 8, + }, + + /* + * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units, + * 4K fixed mtrr. + */ + { + .start = 0xc0000, + .end = 0x100000, + .range_shift = 12, /* 12K */ + .range_start = 24, + } +}; + +/* + * The size of unit is covered in one MSR, one MSR entry contains + * 8 ranges so that unit size is always 8 * 2^range_shift. + */ +static u64 fixed_mtrr_seg_unit_size(int seg) +{ + return 8 << fixed_seg_table[seg].range_shift; +} + +static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit) +{ + switch (msr) { + case MSR_MTRRfix64K_00000: + *seg = 0; + *unit = 0; + break; + case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: + *seg = 1; + *unit = msr - MSR_MTRRfix16K_80000; + break; + case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: + *seg = 2; + *unit = msr - MSR_MTRRfix4K_C0000; + break; + default: + return false; + } + + return true; +} + +static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + u64 unit_size = fixed_mtrr_seg_unit_size(seg); + + *start = mtrr_seg->start + unit * unit_size; + *end = *start + unit_size; + WARN_ON(*end > mtrr_seg->end); +} + +static int fixed_mtrr_seg_unit_range_index(int seg, int unit) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + + WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg) + > mtrr_seg->end); + + /* each unit has 8 ranges. 
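The fixed_seg_table[] above describes the three fixed-MTRR regions (8 x 64K below 512K, 16 x 16K up to 768K, 64 x 4K up to 1M), each mapping onto a contiguous slice of fixed_ranges[]. A standalone sketch of the address-to-range-index arithmetic using the same table values:

#include <stdint.h>
#include <stdio.h>

struct seg { uint64_t start, end; int range_shift, range_start; };

static const struct seg segs[] = {
	{ 0x00000, 0x80000,  16, 0  },   /* MSR_MTRRfix64K_00000: 8 x 64K  */
	{ 0x80000, 0xc0000,  14, 8  },   /* fix16K_80000/A0000:  16 x 16K  */
	{ 0xc0000, 0x100000, 12, 24 },   /* fix4K_C0000..F8000:  64 x 4K   */
};

static int addr_to_range_index(uint64_t addr)
{
	for (unsigned int i = 0; i < sizeof(segs) / sizeof(segs[0]); i++)
		if (addr >= segs[i].start && addr < segs[i].end)
			return segs[i].range_start +
			       (int)((addr - segs[i].start) >> segs[i].range_shift);
	return -1;                       /* above 1M: fixed MTRRs do not apply */
}

int main(void)
{
	printf("%d\n", addr_to_range_index(0xC8000));   /* 24 + 8  = 32 */
	printf("%d\n", addr_to_range_index(0x90000));   /*  8 + 4  = 12 */
	return 0;
}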
*/ + return mtrr_seg->range_start + 8 * unit; +} + +static int fixed_mtrr_seg_end_range_index(int seg) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + int n; + + n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift; + return mtrr_seg->range_start + n - 1; +} + +static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end) +{ + int seg, unit; + + if (!fixed_msr_to_seg_unit(msr, &seg, &unit)) + return false; + + fixed_mtrr_seg_unit_range(seg, unit, start, end); + return true; +} + +static int fixed_msr_to_range_index(u32 msr) +{ + int seg, unit; + + if (!fixed_msr_to_seg_unit(msr, &seg, &unit)) + return -1; + + return fixed_mtrr_seg_unit_range_index(seg, unit); +} + +static int fixed_mtrr_addr_to_seg(u64 addr) +{ + struct fixed_mtrr_segment *mtrr_seg; + int seg, seg_num = ARRAY_SIZE(fixed_seg_table); + + for (seg = 0; seg < seg_num; seg++) { + mtrr_seg = &fixed_seg_table[seg]; + if (mtrr_seg->start <= addr && addr < mtrr_seg->end) + return seg; + } + + return -1; +} + +static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg) +{ + struct fixed_mtrr_segment *mtrr_seg; + int index; + + mtrr_seg = &fixed_seg_table[seg]; + index = mtrr_seg->range_start; + index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift; + return index; +} + +static u64 fixed_mtrr_range_end_addr(int seg, int index) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + int pos = index - mtrr_seg->range_start; + + return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift); +} + +static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end) +{ + u64 mask; + + *start = range->base & PAGE_MASK; + + mask = range->mask & PAGE_MASK; + + /* This cannot overflow because writing to the reserved bits of + * variable MTRRs causes a #GP. + */ + *end = (*start | ~mask) + 1; +} + +static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; + gfn_t start, end; + int index; + + if (msr == MSR_IA32_CR_PAT || !tdp_enabled || + !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + return; + + if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) + return; + + /* fixed MTRRs. */ + if (fixed_msr_to_range(msr, &start, &end)) { + if (!fixed_mtrr_is_enabled(mtrr_state)) + return; + } else if (msr == MSR_MTRRdefType) { + start = 0x0; + end = ~0ULL; + } else { + /* variable range MTRRs. */ + index = (msr - 0x200) / 2; + var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end); + } + + kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); +} + +static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range) +{ + return (range->mask & (1 << 11)) != 0; +} + +static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; + struct kvm_mtrr_range *tmp, *cur; + int index, is_mtrr_mask; + + index = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * index; + cur = &mtrr_state->var_ranges[index]; + + /* remove the entry if it's in the list. */ + if (var_mtrr_range_is_valid(cur)) + list_del(&mtrr_state->var_ranges[index].node); + + /* Extend the mask with all 1 bits to the left, since those + * bits must implicitly be 0. The bits are then cleared + * when reading them. + */ + if (!is_mtrr_mask) + cur->base = data; + else + cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu)); + + /* add it to the list if it's enabled. 
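var_mtrr_range() above turns a PhysBase/PhysMask pair into a [start, end) interval: an address matches the MTRR iff (addr & mask) == (base & mask), so the first non-matching address is (base | ~mask) + 1. This relies on set_var_mtrr_msr() having extended the stored mask with 1s above maxphyaddr. A sketch with invented register values, assuming a 36-bit physical address width:

#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK (~0xfffull)

static void var_mtrr_range(uint64_t base_msr, uint64_t mask_msr,
			   int maxphyaddr, uint64_t *start, uint64_t *end)
{
	/* mimic KVM storing the mask with all bits above maxphyaddr set */
	uint64_t mask = (mask_msr | (~0ull << maxphyaddr)) & PAGE_MASK;

	*start = base_msr & PAGE_MASK;
	*end = (*start | ~mask) + 1;     /* first address past the range */
}

int main(void)
{
	uint64_t start, end;

	/* base 0xE0000000 with type WB, mask covering 256M, valid bit set */
	var_mtrr_range(0xE0000006ull, 0xFF0000800ull, 36, &start, &end);
	printf("[%#llx, %#llx)\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}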
*/ + if (var_mtrr_range_is_valid(cur)) { + list_for_each_entry(tmp, &mtrr_state->head, node) + if (cur->base >= tmp->base) + break; + list_add_tail(&cur->node, &tmp->node); + } +} + +int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int index; + + if (!kvm_mtrr_valid(vcpu, msr, data)) + return 1; + + index = fixed_msr_to_range_index(msr); + if (index >= 0) + *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data; + else if (msr == MSR_MTRRdefType) + vcpu->arch.mtrr_state.deftype = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat = data; + else + set_var_mtrr_msr(vcpu, msr, data); + + update_mtrr(vcpu, msr); + return 0; +} + +int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + int index; + + /* MSR_MTRRcap is a readonly MSR. */ + if (msr == MSR_MTRRcap) { + /* + * SMRR = 0 + * WC = 1 + * FIX = 1 + * VCNT = KVM_NR_VAR_MTRR + */ + *pdata = 0x500 | KVM_NR_VAR_MTRR; + return 0; + } + + if (!msr_mtrr_valid(msr)) + return 1; + + index = fixed_msr_to_range_index(msr); + if (index >= 0) + *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index]; + else if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.deftype; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int is_mtrr_mask; + + index = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * index; + if (!is_mtrr_mask) + *pdata = vcpu->arch.mtrr_state.var_ranges[index].base; + else + *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; + + *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1; + } + + return 0; +} + +void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu) +{ + INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head); +} + +struct mtrr_iter { + /* input fields. */ + struct kvm_mtrr *mtrr_state; + u64 start; + u64 end; + + /* output fields. */ + int mem_type; + /* mtrr is completely disabled? */ + bool mtrr_disabled; + /* [start, end) is not fully covered in MTRRs? */ + bool partial_map; + + /* private fields. */ + union { + /* used for fixed MTRRs. */ + struct { + int index; + int seg; + }; + + /* used for var MTRRs. */ + struct { + struct kvm_mtrr_range *range; + /* max address has been covered in var MTRRs. */ + u64 start_max; + }; + }; + + bool fixed; +}; + +static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter) +{ + int seg, index; + + if (!fixed_mtrr_is_enabled(iter->mtrr_state)) + return false; + + seg = fixed_mtrr_addr_to_seg(iter->start); + if (seg < 0) + return false; + + iter->fixed = true; + index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg); + iter->index = index; + iter->seg = seg; + return true; +} + +static bool match_var_range(struct mtrr_iter *iter, + struct kvm_mtrr_range *range) +{ + u64 start, end; + + var_mtrr_range(range, &start, &end); + if (!(start >= iter->end || end <= iter->start)) { + iter->range = range; + + /* + * the function is called when we do kvm_mtrr.head walking. + * Range has the minimum base address which interleaves + * [looker->start_max, looker->end). + */ + iter->partial_map |= iter->start_max < start; + + /* update the max address has been covered. 
*/ + iter->start_max = max(iter->start_max, end); + return true; + } + + return false; +} + +static void __mtrr_lookup_var_next(struct mtrr_iter *iter) +{ + struct kvm_mtrr *mtrr_state = iter->mtrr_state; + + list_for_each_entry_continue(iter->range, &mtrr_state->head, node) + if (match_var_range(iter, iter->range)) + return; + + iter->range = NULL; + iter->partial_map |= iter->start_max < iter->end; +} + +static void mtrr_lookup_var_start(struct mtrr_iter *iter) +{ + struct kvm_mtrr *mtrr_state = iter->mtrr_state; + + iter->fixed = false; + iter->start_max = iter->start; + iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node); + + __mtrr_lookup_var_next(iter); +} + +static void mtrr_lookup_fixed_next(struct mtrr_iter *iter) +{ + /* terminate the lookup. */ + if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) { + iter->fixed = false; + iter->range = NULL; + return; + } + + iter->index++; + + /* have looked up for all fixed MTRRs. */ + if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges)) + return mtrr_lookup_var_start(iter); + + /* switch to next segment. */ + if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg)) + iter->seg++; +} + +static void mtrr_lookup_var_next(struct mtrr_iter *iter) +{ + __mtrr_lookup_var_next(iter); +} + +static void mtrr_lookup_start(struct mtrr_iter *iter) +{ + if (!mtrr_is_enabled(iter->mtrr_state)) { + iter->mtrr_disabled = true; + return; + } + + if (!mtrr_lookup_fixed_start(iter)) + mtrr_lookup_var_start(iter); +} + +static void mtrr_lookup_init(struct mtrr_iter *iter, + struct kvm_mtrr *mtrr_state, u64 start, u64 end) +{ + iter->mtrr_state = mtrr_state; + iter->start = start; + iter->end = end; + iter->mtrr_disabled = false; + iter->partial_map = false; + iter->fixed = false; + iter->range = NULL; + + mtrr_lookup_start(iter); +} + +static bool mtrr_lookup_okay(struct mtrr_iter *iter) +{ + if (iter->fixed) { + iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index]; + return true; + } + + if (iter->range) { + iter->mem_type = iter->range->base & 0xff; + return true; + } + + return false; +} + +static void mtrr_lookup_next(struct mtrr_iter *iter) +{ + if (iter->fixed) + mtrr_lookup_fixed_next(iter); + else + mtrr_lookup_var_next(iter); +} + +#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \ + for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \ + mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_)) + +u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; + struct mtrr_iter iter; + u64 start, end; + int type = -1; + const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK) + | (1 << MTRR_TYPE_WRTHROUGH); + + start = gfn_to_gpa(gfn); + end = start + PAGE_SIZE; + + mtrr_for_each_mem_type(&iter, mtrr_state, start, end) { + int curr_type = iter.mem_type; + + /* + * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR + * Precedences. + */ + + if (type == -1) { + type = curr_type; + continue; + } + + /* + * If two or more variable memory ranges match and the + * memory types are identical, then that memory type is + * used. + */ + if (type == curr_type) + continue; + + /* + * If two or more variable memory ranges match and one of + * the memory types is UC, the UC memory type used. + */ + if (curr_type == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + /* + * If two or more variable memory ranges match and the + * memory types are WT and WB, the WT memory type is used. 
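When several variable ranges overlap one page, kvm_mtrr_get_guest_memory_type() above merges their types with the SDM precedence rules: identical types pass through, UC dominates everything, WT beats WB, and any other overlap is formally undefined (KVM settles on WB). The same decision table in a few lines of standalone C, with the numeric type encodings taken from the MTRR definitions:

#include <stdio.h>

enum { UC = 0, WC = 1, WT = 4, WB = 6 };

static int combine(int a, int b)
{
	if (a == b)
		return a;
	if (a == UC || b == UC)
		return UC;                          /* UC always wins */
	if ((a == WT || a == WB) && (b == WT || b == WB))
		return WT;                          /* WT beats WB */
	return WB;                                  /* undefined overlap; KVM picks WB */
}

int main(void)
{
	printf("%d %d %d\n", combine(WB, WT), combine(WC, UC), combine(WB, WC));
	return 0;
}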
+ */ + if (((1 << type) & wt_wb_mask) && + ((1 << curr_type) & wt_wb_mask)) { + type = MTRR_TYPE_WRTHROUGH; + continue; + } + + /* + * For overlaps not defined by the above rules, processor + * behavior is undefined. + */ + + /* We use WB for this undefined behavior. :( */ + return MTRR_TYPE_WRBACK; + } + + if (iter.mtrr_disabled) + return mtrr_disabled_type(vcpu); + + /* not contained in any MTRRs. */ + if (type == -1) + return mtrr_default_type(mtrr_state); + + /* + * We just check one page, partially covered by MTRRs is + * impossible. + */ + WARN_ON(iter.partial_map); + + return type; +} +EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); + +bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, + int page_num) +{ + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; + struct mtrr_iter iter; + u64 start, end; + int type = -1; + + start = gfn_to_gpa(gfn); + end = gfn_to_gpa(gfn + page_num); + mtrr_for_each_mem_type(&iter, mtrr_state, start, end) { + if (type == -1) { + type = iter.mem_type; + continue; + } + + if (type != iter.mem_type) + return false; + } + + if (iter.mtrr_disabled) + return true; + + if (!iter.partial_map) + return true; + + if (type == -1) + return true; + + return type == mtrr_default_type(mtrr_state); +} diff --git a/kernel/arch/x86/kvm/paging_tmpl.h b/kernel/arch/x86/kvm/paging_tmpl.h index 6e6d115fe..7be8a2513 100644 --- a/kernel/arch/x86/kvm/paging_tmpl.h +++ b/kernel/arch/x86/kvm/paging_tmpl.h @@ -128,14 +128,6 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) *access &= mask; } -static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f; - - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) | - ((mmu->bad_mt_xwr & (1ull << low6)) != 0); -} - static inline int FNAME(is_present_gpte)(unsigned long pte) { #if PTTYPE != PTTYPE_EPT @@ -172,7 +164,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) goto no_present; if (!FNAME(is_present_gpte)(gpte)) @@ -256,8 +248,8 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (ret) return ret; - mark_page_dirty(vcpu->kvm, table_gfn); - walker->ptes[level] = pte; + kvm_vcpu_mark_page_dirty(vcpu, table_gfn); + walker->ptes[level - 1] = pte; } return 0; } @@ -338,7 +330,7 @@ retry_walk: real_gfn = gpa_to_gfn(real_gfn); - host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn, + host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn, &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; @@ -353,8 +345,7 @@ retry_walk: if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, - walker->level))) { + if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } @@ -511,11 +502,11 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, base_gpa = pte_gpa & ~mask; index = (pte_gpa - base_gpa) / sizeof(pt_element_t); - r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, + r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa, gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); curr_pte = gw->prefetch_ptes[index]; } else - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, + r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &curr_pte, sizeof(curr_pte)); return r || 
curr_pte != gw->ptes[level - 1]; @@ -707,15 +698,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - int force_pt_level; + bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, addr, error_code, - mmu_is_nested(vcpu)); + r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu)); if (likely(r != RET_MMIO_PF_INVALID)) return r; @@ -752,15 +742,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || is_self_change_mapping; - else - force_pt_level = 1; - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } + if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { + level = mapping_level(vcpu, walker.gfn, &force_pt_level); + if (likely(!force_pt_level)) { + level = min(walker.level, level); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + } else + force_pt_level = true; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -869,8 +858,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (!rmap_can_add(vcpu)) break; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) break; FNAME(update_pte)(vcpu, sp, sptep, &gpte); @@ -956,8 +945,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) return -EINVAL; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { @@ -970,7 +959,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_access &= FNAME(gpte_access)(vcpu, gpte); FNAME(protect_clean_gpte)(&pte_access, gpte); - if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, &nr_present)) continue; diff --git a/kernel/arch/x86/kvm/pmu.c b/kernel/arch/x86/kvm/pmu.c index 29fbf9dfd..31aa2c85d 100644 --- a/kernel/arch/x86/kvm/pmu.c +++ b/kernel/arch/x86/kvm/pmu.c @@ -1,11 +1,12 @@ /* * Kernel-based Virtual Machine -- Performance Monitoring Unit support * - * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * Copyright 2015 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@redhat.com> * Gleb Natapov <gleb@redhat.com> + * Wei Huang <wei@redhat.com> * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -19,88 +20,39 @@ #include "x86.h" #include "cpuid.h" #include "lapic.h" +#include "pmu.h" + +/* NOTE: + * - Each perf counter is defined as "struct kvm_pmc"; + * - There are two types of perf counters: general purpose (gp) and fixed. + * gp counters are stored in gp_counters[] and fixed counters are stored + * in fixed_counters[] respectively. 
Both of them are part of "struct + * kvm_pmu"; + * - pmu.c understands the difference between gp counters and fixed counters. + * However AMD doesn't support fixed-counters; + * - There are three types of index to access perf counters (PMC): + * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD + * has MSR_K7_PERFCTRn. + * 2. MSR Index (named idx): This normally is used by RDPMC instruction. + * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access + * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except + * that it also supports fixed counters. idx can be used to as index to + * gp and fixed counters. + * 3. Global PMC Index (named pmc): pmc is an index specific to PMU + * code. Each pmc, stored in kvm_pmc.idx field, is unique across + * all perf counters (both gp and fixed). The mapping relationship + * between pmc and perf counters is as the following: + * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters + * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed + * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters + */ -static struct kvm_arch_event_perf_mapping { - u8 eventsel; - u8 unit_mask; - unsigned event_type; - bool inexact; -} arch_events[] = { - /* Index must match CPUID 0x0A.EBX bit vector */ - [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, - [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, - [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, - [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, -}; - -/* mapping between fixed pmc index and arch_events array */ -static int fixed_pmc_events[] = {1, 0, 7}; - -static bool pmc_is_gp(struct kvm_pmc *pmc) -{ - return pmc->type == KVM_PMC_GP; -} - -static inline u64 pmc_bitmask(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - - return pmu->counter_bitmask[pmc->type]; -} - -static inline bool pmc_enabled(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - -static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, - u32 base) -{ - if (msr >= base && msr < base + pmu->nr_arch_gp_counters) - return &pmu->gp_counters[msr - base]; - return NULL; -} - -static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) -{ - int base = MSR_CORE_PERF_FIXED_CTR0; - if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) - return &pmu->fixed_counters[msr - base]; - return NULL; -} - -static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) -{ - return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx); -} - -static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) -{ - if (idx < INTEL_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); - else - return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED); -} - -void kvm_deliver_pmi(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.apic) - kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); -} - -static void trigger_pmi(struct irq_work *irq_work) +static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { - struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, - irq_work); - struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, - arch.pmu); + struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); + struct 
kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - kvm_deliver_pmi(vcpu); + kvm_pmu_deliver_pmi(vcpu); } static void kvm_perf_overflow(struct perf_event *perf_event, @@ -108,63 +60,46 @@ static void kvm_perf_overflow(struct perf_event *perf_event, struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (!test_and_set_bit(pmc->idx, + (unsigned long *)&pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } } static void kvm_perf_overflow_intr(struct perf_event *perf_event, - struct perf_sample_data *data, struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (!test_and_set_bit(pmc->idx, + (unsigned long *)&pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + /* * Inject PMI. If vcpu was in a guest mode during NMI PMI * can be ejected on a guest mode re-entry. Otherwise we can't * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until, + * time of vmexit and is not going to re-enter guest mode until * woken up. So we should wake it, but this is impossible from * NMI context. Do it from irq work instead. */ if (!kvm_is_in_guest()) - irq_work_queue(&pmc->vcpu->arch.pmu.irq_work); + irq_work_queue(&pmc_to_pmu(pmc)->irq_work); else kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } } -static u64 read_pmc(struct kvm_pmc *pmc) -{ - u64 counter, enabled, running; - - counter = pmc->counter; - - if (pmc->perf_event) - counter += perf_event_read_value(pmc->perf_event, - &enabled, &running); - - /* FIXME: Scaling needed? */ - - return counter & pmc_bitmask(pmc); -} - -static void stop_counter(struct kvm_pmc *pmc) -{ - if (pmc->perf_event) { - pmc->counter = read_pmc(pmc); - perf_event_release_kernel(pmc->perf_event); - pmc->perf_event = NULL; - } -} - -static void reprogram_counter(struct kvm_pmc *pmc, u32 type, - unsigned config, bool exclude_user, bool exclude_kernel, - bool intr, bool in_tx, bool in_tx_cp) +static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, + unsigned config, bool exclude_user, + bool exclude_kernel, bool intr, + bool in_tx, bool in_tx_cp) { struct perf_event *event; struct perf_event_attr attr = { @@ -177,6 +112,7 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + if (in_tx) attr.config |= HSW_IN_TX; if (in_tx_cp) @@ -188,33 +124,16 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type, intr ? 
kvm_perf_overflow_intr : kvm_perf_overflow, pmc); if (IS_ERR(event)) { - printk_once("kvm: pmu event creation failed %ld\n", - PTR_ERR(event)); + printk_once("kvm_pmu: event creation failed %ld\n", + PTR_ERR(event)); return; } pmc->perf_event = event; - clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi); -} - -static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select, - u8 unit_mask) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(arch_events); i++) - if (arch_events[i].eventsel == event_select - && arch_events[i].unit_mask == unit_mask - && (pmu->available_event_types & (1 << i))) - break; - - if (i == ARRAY_SIZE(arch_events)) - return PERF_COUNT_HW_MAX; - - return arch_events[i].event_type; + clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi); } -static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; @@ -224,21 +143,22 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) pmc->eventsel = eventsel; - stop_counter(pmc); + pmc_stop_counter(pmc); - if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc)) + if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) return; event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK | - HSW_IN_TX | - HSW_IN_TX_CHECKPOINTED))) { - config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, - unit_mask); + ARCH_PERFMON_EVENTSEL_INV | + ARCH_PERFMON_EVENTSEL_CMASK | + HSW_IN_TX | + HSW_IN_TX_CHECKPOINTED))) { + config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc), + event_select, + unit_mask); if (config != PERF_COUNT_HW_MAX) type = PERF_TYPE_HARDWARE; } @@ -246,56 +166,36 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (type == PERF_TYPE_RAW) config = eventsel & X86_RAW_EVENT_MASK; - reprogram_counter(pmc, type, config, - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT, - (eventsel & HSW_IN_TX), - (eventsel & HSW_IN_TX_CHECKPOINTED)); + pmc_reprogram_counter(pmc, type, config, + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT, + (eventsel & HSW_IN_TX), + (eventsel & HSW_IN_TX_CHECKPOINTED)); } +EXPORT_SYMBOL_GPL(reprogram_gp_counter); -static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) +void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) { - unsigned en = en_pmi & 0x3; - bool pmi = en_pmi & 0x8; + unsigned en_field = ctrl & 0x3; + bool pmi = ctrl & 0x8; - stop_counter(pmc); + pmc_stop_counter(pmc); - if (!en || !pmc_enabled(pmc)) + if (!en_field || !pmc_is_enabled(pmc)) return; - reprogram_counter(pmc, PERF_TYPE_HARDWARE, - arch_events[fixed_pmc_events[idx]].event_type, - !(en & 0x2), /* exclude user */ - !(en & 0x1), /* exclude kernel */ - pmi, false, false); + pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, + kvm_x86_ops->pmu_ops->find_fixed_event(idx), + !(en_field & 0x2), /* exclude user */ + !(en_field & 0x1), /* exclude kernel */ + pmi, false, false); } +EXPORT_SYMBOL_GPL(reprogram_fixed_counter); -static inline u8 fixed_en_pmi(u64 ctrl, int idx) +void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) { - return (ctrl >> (idx * 4)) 
& 0xf; -} - -static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) -{ - int i; - - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - u8 en_pmi = fixed_en_pmi(data, i); - struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i); - - if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi) - continue; - - reprogram_fixed_counter(pmc, en_pmi, i); - } - - pmu->fixed_ctr_ctrl = data; -} - -static void reprogram_idx(struct kvm_pmu *pmu, int idx) -{ - struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx); + struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx); if (!pmc) return; @@ -303,274 +203,107 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx) if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc, pmc->eventsel); else { - int fidx = idx - INTEL_PMC_IDX_FIXED; - reprogram_fixed_counter(pmc, - fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); + int idx = pmc_idx - INTEL_PMC_IDX_FIXED; + u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); + + reprogram_fixed_counter(pmc, ctrl, idx); } } +EXPORT_SYMBOL_GPL(reprogram_counter); -static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 bitmask; int bit; - u64 diff = pmu->global_ctrl ^ data; - - pmu->global_ctrl = data; - - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_idx(pmu, bit); -} -bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - int ret; - - switch (msr) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - case MSR_CORE_PERF_GLOBAL_STATUS: - case MSR_CORE_PERF_GLOBAL_CTRL: - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - ret = pmu->version > 1; - break; - default: - ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) - || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) - || get_fixed_pmc(pmu, msr); - break; - } - return ret; -} + bitmask = pmu->reprogram_pmi; -int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; + for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { + struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit); - switch (index) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - *data = pmu->fixed_ctr_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_STATUS: - *data = pmu->global_status; - return 0; - case MSR_CORE_PERF_GLOBAL_CTRL: - *data = pmu->global_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - *data = pmu->global_ovf_ctrl; - return 0; - default: - if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, index))) { - *data = read_pmc(pmc); - return 0; - } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { - *data = pmc->eventsel; - return 0; + if (unlikely(!pmc || !pmc->perf_event)) { + clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); + continue; } - } - return 1; -} -int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - u32 index = msr_info->index; - u64 data = msr_info->data; - - switch (index) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (pmu->fixed_ctr_ctrl == data) - return 0; - if (!(data & 0xfffffffffffff444ull)) { - reprogram_fixed_counters(pmu, data); - return 0; - } - break; - case MSR_CORE_PERF_GLOBAL_STATUS: - if (msr_info->host_initiated) { - pmu->global_status = data; - return 0; - } - break; /* RO MSR */ - case MSR_CORE_PERF_GLOBAL_CTRL: - if (pmu->global_ctrl == data) - return 0; - if (!(data & pmu->global_ctrl_mask)) { - 
global_ctrl_changed(pmu, data); - return 0; - } - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - pmu->global_ovf_ctrl = data; - return 0; - } - break; - default: - if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, index))) { - if (!msr_info->host_initiated) - data = (s64)(s32)data; - pmc->counter += data - read_pmc(pmc); - return 0; - } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { - if (data == pmc->eventsel) - return 0; - if (!(data & pmu->reserved_bits)) { - reprogram_gp_counter(pmc, data); - return 0; - } - } + reprogram_counter(pmu, bit); } - return 1; } -int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc) +/* check if idx is a valid index to access PMU */ +int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; - bool fixed = pmc & (1u << 30); - pmc &= ~(3u << 30); - return (!fixed && pmc >= pmu->nr_arch_gp_counters) || - (fixed && pmc >= pmu->nr_arch_fixed_counters); + return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx); } -int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) +int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; - bool fast_mode = pmc & (1u << 31); - bool fixed = pmc & (1u << 30); - struct kvm_pmc *counters; - u64 ctr; - - pmc &= ~(3u << 30); - if (!fixed && pmc >= pmu->nr_arch_gp_counters) - return 1; - if (fixed && pmc >= pmu->nr_arch_fixed_counters) + bool fast_mode = idx & (1u << 31); + struct kvm_pmc *pmc; + u64 ctr_val; + + pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx); + if (!pmc) return 1; - counters = fixed ? 
pmu->fixed_counters : pmu->gp_counters; - ctr = read_pmc(&counters[pmc]); + + ctr_val = pmc_read_counter(pmc); if (fast_mode) - ctr = (u32)ctr; - *data = ctr; + ctr_val = (u32)ctr_val; + *data = ctr_val; return 0; } -void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) +void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_cpuid_entry2 *entry; - union cpuid10_eax eax; - union cpuid10_edx edx; - - pmu->nr_arch_gp_counters = 0; - pmu->nr_arch_fixed_counters = 0; - pmu->counter_bitmask[KVM_PMC_GP] = 0; - pmu->counter_bitmask[KVM_PMC_FIXED] = 0; - pmu->version = 0; - pmu->reserved_bits = 0xffffffff00200000ull; - - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) - return; - eax.full = entry->eax; - edx.full = entry->edx; - - pmu->version = eax.split.version_id; - if (!pmu->version) - return; - - pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - INTEL_PMC_MAX_GENERIC); - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; - pmu->available_event_types = ~entry->ebx & - ((1ull << eax.split.mask_length) - 1); - - if (pmu->version == 1) { - pmu->nr_arch_fixed_counters = 0; - } else { - pmu->nr_arch_fixed_counters = - min_t(int, edx.split.num_counters_fixed, - INTEL_PMC_MAX_FIXED); - pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << edx.split.bit_width_fixed) - 1; - } + if (vcpu->arch.apic) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); +} - pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr); +} - entry = kvm_find_cpuid_entry(vcpu, 7, 0); - if (entry && - (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && - (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) - pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data); } -void kvm_pmu_init(struct kvm_vcpu *vcpu) +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; + return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info); +} - memset(pmu, 0, sizeof(*pmu)); - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { - pmu->gp_counters[i].type = KVM_PMC_GP; - pmu->gp_counters[i].vcpu = vcpu; - pmu->gp_counters[i].idx = i; - } - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { - pmu->fixed_counters[i].type = KVM_PMC_FIXED; - pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; - } - init_irq_work(&pmu->irq_work, trigger_pmi); - kvm_pmu_cpuid_update(vcpu); +/* refresh PMU settings. This function generally is called when underlying + * settings are changed (such as changes of PMU CPUID by guest VMs), which + * should rarely happen. 
+ */ +void kvm_pmu_refresh(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->pmu_ops->refresh(vcpu); } void kvm_pmu_reset(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; - int i; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); irq_work_sync(&pmu->irq_work); - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { - struct kvm_pmc *pmc = &pmu->gp_counters[i]; - stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; - } + kvm_x86_ops->pmu_ops->reset(vcpu); +} - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) - stop_counter(&pmu->fixed_counters[i]); +void kvm_pmu_init(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = - pmu->global_ovf_ctrl = 0; + memset(pmu, 0, sizeof(*pmu)); + kvm_x86_ops->pmu_ops->init(vcpu); + init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); + kvm_pmu_refresh(vcpu); } void kvm_pmu_destroy(struct kvm_vcpu *vcpu) { kvm_pmu_reset(vcpu); } - -void kvm_handle_pmu_event(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - u64 bitmask; - int bit; - - bitmask = pmu->reprogram_pmi; - - for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit); - - if (unlikely(!pmc || !pmc->perf_event)) { - clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); - continue; - } - - reprogram_idx(pmu, bit); - } -} diff --git a/kernel/arch/x86/kvm/pmu.h b/kernel/arch/x86/kvm/pmu.h new file mode 100644 index 000000000..f96e1f962 --- /dev/null +++ b/kernel/arch/x86/kvm/pmu.h @@ -0,0 +1,118 @@ +#ifndef __KVM_X86_PMU_H +#define __KVM_X86_PMU_H + +#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) +#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) +#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) + +/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ +#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) + +struct kvm_event_hw_type_mapping { + u8 eventsel; + u8 unit_mask; + unsigned event_type; +}; + +struct kvm_pmu_ops { + unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select, + u8 unit_mask); + unsigned (*find_fixed_event)(int idx); + bool (*pmc_is_enabled)(struct kvm_pmc *pmc); + struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); + struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx); + int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx); + bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); + void (*refresh)(struct kvm_vcpu *vcpu); + void (*init)(struct kvm_vcpu *vcpu); + void (*reset)(struct kvm_vcpu *vcpu); +}; + +static inline u64 pmc_bitmask(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + return pmu->counter_bitmask[pmc->type]; +} + +static inline u64 pmc_read_counter(struct kvm_pmc *pmc) +{ + u64 counter, enabled, running; + + counter = pmc->counter; + if (pmc->perf_event) + counter += perf_event_read_value(pmc->perf_event, + &enabled, &running); + /* FIXME: Scaling needed? 
*/ + return counter & pmc_bitmask(pmc); +} + +static inline void pmc_stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = pmc_read_counter(pmc); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +static inline bool pmc_is_gp(struct kvm_pmc *pmc) +{ + return pmc->type == KVM_PMC_GP; +} + +static inline bool pmc_is_fixed(struct kvm_pmc *pmc) +{ + return pmc->type == KVM_PMC_FIXED; +} + +static inline bool pmc_is_enabled(struct kvm_pmc *pmc) +{ + return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc); +} + +/* returns general purpose PMC with the specified MSR. Note that it can be + * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a + * paramenter to tell them apart. + */ +static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, + u32 base) +{ + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) + return &pmu->gp_counters[msr - base]; + + return NULL; +} + +/* returns fixed PMC with the specified MSR */ +static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) +{ + int base = MSR_CORE_PERF_FIXED_CTR0; + + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) + return &pmu->fixed_counters[msr - base]; + + return NULL; +} + +void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); +void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); +void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); + +void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); +void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); +int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); +int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx); +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +void kvm_pmu_refresh(struct kvm_vcpu *vcpu); +void kvm_pmu_reset(struct kvm_vcpu *vcpu); +void kvm_pmu_init(struct kvm_vcpu *vcpu); +void kvm_pmu_destroy(struct kvm_vcpu *vcpu); + +extern struct kvm_pmu_ops intel_pmu_ops; +extern struct kvm_pmu_ops amd_pmu_ops; +#endif /* __KVM_X86_PMU_H */ diff --git a/kernel/arch/x86/kvm/pmu_amd.c b/kernel/arch/x86/kvm/pmu_amd.c new file mode 100644 index 000000000..39b91127e --- /dev/null +++ b/kernel/arch/x86/kvm/pmu_amd.c @@ -0,0 +1,205 @@ +/* + * KVM PMU support for AMD + * + * Copyright 2015, Red Hat, Inc. and/or its affiliates. + * + * Author: + * Wei Huang <wei@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Implementation is based on pmu_intel.c file + */ +#include <linux/types.h> +#include <linux/kvm_host.h> +#include <linux/perf_event.h> +#include "x86.h" +#include "cpuid.h" +#include "lapic.h" +#include "pmu.h" + +/* duplicated from amd_perfmon_event_map, K7 and above should work. 
*/ +static struct kvm_event_hw_type_mapping amd_event_mapping[] = { + [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES }, + [3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES }, + [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, +}; + +static unsigned amd_find_arch_event(struct kvm_pmu *pmu, + u8 event_select, + u8 unit_mask) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) + if (amd_event_mapping[i].eventsel == event_select + && amd_event_mapping[i].unit_mask == unit_mask) + break; + + if (i == ARRAY_SIZE(amd_event_mapping)) + return PERF_COUNT_HW_MAX; + + return amd_event_mapping[i].event_type; +} + +/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ +static unsigned amd_find_fixed_event(int idx) +{ + return PERF_COUNT_HW_MAX; +} + +/* check if a PMC is enabled by comparing it against global_ctrl bits. Because + * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). + */ +static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) +{ + return true; +} + +static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + pmc_idx, MSR_K7_EVNTSEL0); +} + +/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ +static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + idx &= ~(3u << 30); + + return (idx >= pmu->nr_arch_gp_counters); +} + +/* idx is the ECX register of RDPMC instruction */ +static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *counters; + + idx &= ~(3u << 30); + if (idx >= pmu->nr_arch_gp_counters) + return NULL; + counters = pmu->gp_counters; + + return &counters[idx]; +} + +static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int ret = false; + + ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) || + get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + + return ret; +} + +static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + /* MSR_K7_PERFCTRn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); + if (pmc) { + *data = pmc_read_counter(pmc); + return 0; + } + /* MSR_K7_EVNTSELn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + if (pmc) { + *data = pmc->eventsel; + return 0; + } + + return 1; +} + +static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 msr = msr_info->index; + u64 data = msr_info->data; + + /* MSR_K7_PERFCTRn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); + if (pmc) { + pmc->counter += data - pmc_read_counter(pmc); + return 0; + } + /* MSR_K7_EVNTSELn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + if (pmc) { + if (data == pmc->eventsel) + return 0; + if (!(data & pmu->reserved_bits)) { + reprogram_gp_counter(pmc, data); + return 0; + } + } + + return 1; +} + +static void amd_pmu_refresh(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; + pmu->reserved_bits = 
0xffffffff00200000ull; + /* not applicable to AMD; but clean them to prevent any fall out */ + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->version = 0; + pmu->global_status = 0; +} + +static void amd_pmu_init(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + for (i = 0; i < AMD64_NUM_COUNTERS ; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; + } +} + +static void amd_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + for (i = 0; i < AMD64_NUM_COUNTERS; i++) { + struct kvm_pmc *pmc = &pmu->gp_counters[i]; + + pmc_stop_counter(pmc); + pmc->counter = pmc->eventsel = 0; + } +} + +struct kvm_pmu_ops amd_pmu_ops = { + .find_arch_event = amd_find_arch_event, + .find_fixed_event = amd_find_fixed_event, + .pmc_is_enabled = amd_pmc_is_enabled, + .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, + .msr_idx_to_pmc = amd_msr_idx_to_pmc, + .is_valid_msr_idx = amd_is_valid_msr_idx, + .is_valid_msr = amd_is_valid_msr, + .get_msr = amd_pmu_get_msr, + .set_msr = amd_pmu_set_msr, + .refresh = amd_pmu_refresh, + .init = amd_pmu_init, + .reset = amd_pmu_reset, +}; diff --git a/kernel/arch/x86/kvm/pmu_intel.c b/kernel/arch/x86/kvm/pmu_intel.c new file mode 100644 index 000000000..ab38af4f4 --- /dev/null +++ b/kernel/arch/x86/kvm/pmu_intel.c @@ -0,0 +1,358 @@ +/* + * KVM PMU support for Intel CPUs + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity <avi@redhat.com> + * Gleb Natapov <gleb@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ +#include <linux/types.h> +#include <linux/kvm_host.h> +#include <linux/perf_event.h> +#include <asm/perf_event.h> +#include "x86.h" +#include "cpuid.h" +#include "lapic.h" +#include "pmu.h" + +static struct kvm_event_hw_type_mapping intel_arch_events[] = { + /* Index must match CPUID 0x0A.EBX bit vector */ + [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, + [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, + [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, + [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, +}; + +/* mapping between fixed pmc index and intel_arch_events array */ +static int fixed_pmc_events[] = {1, 0, 7}; + +static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) +{ + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + u8 new_ctrl = fixed_ctrl_field(data, i); + u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); + struct kvm_pmc *pmc; + + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + + if (old_ctrl == new_ctrl) + continue; + + reprogram_fixed_counter(pmc, new_ctrl, i); + } + + pmu->fixed_ctr_ctrl = data; +} + +/* function is called when global control register has been updated. 
*/ +static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +{ + int bit; + u64 diff = pmu->global_ctrl ^ data; + + pmu->global_ctrl = data; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + reprogram_counter(pmu, bit); +} + +static unsigned intel_find_arch_event(struct kvm_pmu *pmu, + u8 event_select, + u8 unit_mask) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) + if (intel_arch_events[i].eventsel == event_select + && intel_arch_events[i].unit_mask == unit_mask + && (pmu->available_event_types & (1 << i))) + break; + + if (i == ARRAY_SIZE(intel_arch_events)) + return PERF_COUNT_HW_MAX; + + return intel_arch_events[i].event_type; +} + +static unsigned intel_find_fixed_event(int idx) +{ + if (idx >= ARRAY_SIZE(fixed_pmc_events)) + return PERF_COUNT_HW_MAX; + + return intel_arch_events[fixed_pmc_events[idx]].event_type; +} + +/* check if a PMC is enabled by comparising it with globl_ctrl bits. */ +static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + if (pmc_idx < INTEL_PMC_IDX_FIXED) + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } +} + +/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ +static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + + idx &= ~(3u << 30); + + return (!fixed && idx >= pmu->nr_arch_gp_counters) || + (fixed && idx >= pmu->nr_arch_fixed_counters); +} + +static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, + unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + struct kvm_pmc *counters; + + idx &= ~(3u << 30); + if (!fixed && idx >= pmu->nr_arch_gp_counters) + return NULL; + if (fixed && idx >= pmu->nr_arch_fixed_counters) + return NULL; + counters = fixed ? 
pmu->fixed_counters : pmu->gp_counters; + + return &counters[idx]; +} + +static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int ret; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + ret = pmu->version > 1; + break; + default: + ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || + get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || + get_fixed_pmc(pmu, msr); + break; + } + + return ret; +} + +static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + *data = pmu->fixed_ctr_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_STATUS: + *data = pmu->global_status; + return 0; + case MSR_CORE_PERF_GLOBAL_CTRL: + *data = pmu->global_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *data = pmu->global_ovf_ctrl; + return 0; + default: + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, msr))) { + *data = pmc_read_counter(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { + *data = pmc->eventsel; + return 0; + } + } + + return 1; +} + +static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 msr = msr_info->index; + u64 data = msr_info->data; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + if (pmu->fixed_ctr_ctrl == data) + return 0; + if (!(data & 0xfffffffffffff444ull)) { + reprogram_fixed_counters(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + if (msr_info->host_initiated) { + pmu->global_status = data; + return 0; + } + break; /* RO MSR */ + case MSR_CORE_PERF_GLOBAL_CTRL: + if (pmu->global_ctrl == data) + return 0; + if (!(data & pmu->global_ctrl_mask)) { + global_ctrl_changed(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + pmu->global_ovf_ctrl = data; + return 0; + } + break; + default: + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, msr))) { + if (!msr_info->host_initiated) + data = (s64)(s32)data; + pmc->counter += data - pmc_read_counter(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { + if (data == pmc->eventsel) + return 0; + if (!(data & pmu->reserved_bits)) { + reprogram_gp_counter(pmc, data); + return 0; + } + } + } + + return 1; +} + +static void intel_pmu_refresh(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_cpuid_entry2 *entry; + union cpuid10_eax eax; + union cpuid10_edx edx; + + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry) + return; + eax.full = entry->eax; + edx.full = entry->edx; + + pmu->version = eax.split.version_id; + if (!pmu->version) + return; + + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, + INTEL_PMC_MAX_GENERIC); + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + pmu->available_event_types = ~entry->ebx & + ((1ull << eax.split.mask_length) - 1); + + if 
(pmu->version == 1) { + pmu->nr_arch_fixed_counters = 0; + } else { + pmu->nr_arch_fixed_counters = + min_t(int, edx.split.num_counters_fixed, + INTEL_PMC_MAX_FIXED); + pmu->counter_bitmask[KVM_PMC_FIXED] = + ((u64)1 << edx.split.bit_width_fixed) - 1; + } + + pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); + pmu->global_ctrl_mask = ~pmu->global_ctrl; + + entry = kvm_find_cpuid_entry(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) + pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; +} + +static void intel_pmu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; + } + + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { + pmu->fixed_counters[i].type = KVM_PMC_FIXED; + pmu->fixed_counters[i].vcpu = vcpu; + pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + } +} + +static void intel_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + struct kvm_pmc *pmc = &pmu->gp_counters[i]; + + pmc_stop_counter(pmc); + pmc->counter = pmc->eventsel = 0; + } + + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) + pmc_stop_counter(&pmu->fixed_counters[i]); + + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = + pmu->global_ovf_ctrl = 0; +} + +struct kvm_pmu_ops intel_pmu_ops = { + .find_arch_event = intel_find_arch_event, + .find_fixed_event = intel_find_fixed_event, + .pmc_is_enabled = intel_pmc_is_enabled, + .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, + .msr_idx_to_pmc = intel_msr_idx_to_pmc, + .is_valid_msr_idx = intel_is_valid_msr_idx, + .is_valid_msr = intel_is_valid_msr, + .get_msr = intel_pmu_get_msr, + .set_msr = intel_pmu_set_msr, + .refresh = intel_pmu_refresh, + .init = intel_pmu_init, + .reset = intel_pmu_reset, +}; diff --git a/kernel/arch/x86/kvm/svm.c b/kernel/arch/x86/kvm/svm.c index 4911bf191..899c40f82 100644 --- a/kernel/arch/x86/kvm/svm.c +++ b/kernel/arch/x86/kvm/svm.c @@ -21,6 +21,7 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" +#include "pmu.h" #include <linux/module.h> #include <linux/mod_devicetable.h> @@ -28,7 +29,7 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/sched.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/slab.h> #include <asm/perf_event.h> @@ -157,7 +158,8 @@ struct vcpu_svm { unsigned long int3_rip; u32 apf_reason; - u64 tsc_ratio; + /* cached guest cpuid flags for faster access */ + bool nrips_enabled : 1; }; static DEFINE_PER_CPU(u64, current_tsc_ratio); @@ -201,6 +203,7 @@ module_param(npt, int, S_IRUGO); static int nested = true; module_param(nested, int, S_IRUGO); +static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -209,7 +212,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); -static u64 __scale_tsc(u64 ratio, u64 tsc); enum { VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, @@ -512,7 +514,7 @@ static void 
skip_emulated_instruction(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (svm->vmcb->control.next_rip != 0) { - WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS)); + WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); svm->next_rip = svm->vmcb->control.next_rip; } @@ -889,20 +891,9 @@ static __init int svm_hardware_setup(void) kvm_enable_efer_bits(EFER_FFXSR); if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - u64 max; - kvm_has_tsc_control = true; - - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated needed because it will always - * be 1 on all machines and a value of 0 is used to disable - * tsc-scaling for the vcpu. - */ - max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); - - kvm_max_guest_tsc_khz = max; + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; } if (nested) { @@ -966,68 +957,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 __scale_tsc(u64 ratio, u64 tsc) -{ - u64 mult, frac, _tsc; - - mult = ratio >> 32; - frac = ratio & ((1ULL << 32) - 1); - - _tsc = tsc; - _tsc *= mult; - _tsc += (tsc >> 32) * frac; - _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; - - return _tsc; -} - -static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 _tsc = tsc; - - if (svm->tsc_ratio != TSC_RATIO_DEFAULT) - _tsc = __scale_tsc(svm->tsc_ratio, tsc); - - return _tsc; -} - -static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 ratio; - u64 khz; - - /* Guest TSC same frequency as host TSC? */ - if (!scale) { - svm->tsc_ratio = TSC_RATIO_DEFAULT; - return; - } - - /* TSC scaling supported? 
*/ - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - } else - WARN(1, "user requested TSC rate below hardware speed\n"); - return; - } - - khz = user_tsc_khz; - - /* TSC scaling required - calculate ratio */ - ratio = khz << 32; - do_div(ratio, tsc_khz); - - if (ratio == 0 || ratio & TSC_RATIO_RSVD) { - WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", - user_tsc_khz); - return; - } - svm->tsc_ratio = ratio; -} - static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1054,16 +983,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) +static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { struct vcpu_svm *svm = to_svm(vcpu); - if (host) { - if (svm->tsc_ratio != TSC_RATIO_DEFAULT) - WARN_ON(adjustment < 0); - adjustment = svm_scale_tsc(vcpu, (u64)adjustment); - } - svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; @@ -1075,15 +998,6 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) -{ - u64 tsc; - - tsc = svm_scale_tsc(vcpu, native_read_tsc()); - - return target_tsc - tsc; -} - static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -1105,6 +1019,8 @@ static void init_vmcb(struct vcpu_svm *svm) set_exception_intercept(svm, PF_VECTOR); set_exception_intercept(svm, UD_VECTOR); set_exception_intercept(svm, MC_VECTOR); + set_exception_intercept(svm, AC_VECTOR); + set_exception_intercept(svm, DB_VECTOR); set_intercept(svm, INTERCEPT_INTR); set_intercept(svm, INTERCEPT_NMI); @@ -1162,11 +1078,11 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; /* - * This is the guest-visible cr0 value. * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. + * It also updates the guest-visible cr0 value. */ - svm->vcpu.arch.cr0 = 0; - (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); + svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); + kvm_mmu_reset_context(&svm->vcpu); save->cr4 = X86_CR4_PAE; /* rdx = ?? 
*/ @@ -1178,7 +1094,7 @@ static void init_vmcb(struct vcpu_svm *svm) clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); - save->g_pat = 0x0007040600070406ULL; + save->g_pat = svm->vcpu.arch.pat; save->cr3 = 0; save->cr4 = 0; } @@ -1197,12 +1113,18 @@ static void init_vmcb(struct vcpu_svm *svm) enable_gif(svm); } -static void svm_vcpu_reset(struct kvm_vcpu *vcpu) +static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); u32 dummy; u32 eax = 1; + if (!init_event) { + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) + svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + } init_vmcb(svm); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); @@ -1224,8 +1146,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } - svm->tsc_ratio = TSC_RATIO_DEFAULT; - err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) goto free_svm; @@ -1261,11 +1181,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) - svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; - svm_init_osvw(&svm->vcpu); return &svm->vcpu; @@ -1316,10 +1231,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && - svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) { - __this_cpu_write(current_tsc_ratio, svm->tsc_ratio); - wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; + if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { + __this_cpu_write(current_tsc_ratio, tsc_ratio); + wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); + } } } @@ -1577,7 +1494,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * does not do it - this results in some delay at * reboot */ - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); @@ -1637,20 +1555,13 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_db_bp_intercept(struct kvm_vcpu *vcpu) +static void update_bp_intercept(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - clr_exception_intercept(svm, DB_VECTOR); clr_exception_intercept(svm, BP_VECTOR); - if (svm->nmi_singlestep) - set_exception_intercept(svm, DB_VECTOR); - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - set_exception_intercept(svm, DB_VECTOR); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) set_exception_intercept(svm, BP_VECTOR); } else @@ -1756,7 +1667,6 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_bp_intercept(&svm->vcpu); } if (svm->vcpu.guest_debug & @@ -1791,6 +1701,12 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } +static int ac_interception(struct vcpu_svm *svm) +{ + kvm_queue_exception_e(&svm->vcpu, 
AC_VECTOR, 0); + return 1; +} + static void svm_fpu_activate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1955,8 +1871,8 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; - ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte, - offset_in_page(cr3) + index * 8, 8); + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, + offset_in_page(cr3) + index * 8, 8); if (ret) return 0; return pdpte; @@ -2010,6 +1926,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu.shadow_root_level = get_npt_level(); + reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } @@ -2114,7 +2031,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) might_sleep(); - page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT); if (is_error_page(page)) goto error; @@ -2153,7 +2070,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) mask = (0xf >> (4 - size)) << start_bit; val = 0; - if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len)) + if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len)) return NESTED_EXIT_DONE; return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; @@ -2178,7 +2095,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) /* Offset is in 32 bit units but need in 8 bit units */ offset *= 4; - if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4)) return NESTED_EXIT_DONE; return (value & mask) ? 
NESTED_EXIT_DONE : NESTED_EXIT_HOST; @@ -2359,7 +2276,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; - nested_vmcb->control.next_rip = vmcb->control.next_rip; + + if (svm->nrips_enabled) + nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -2449,7 +2368,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) p = msrpm_offsets[i]; offset = svm->nested.vmcb_msrpm + (p * 4); - if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) return false; svm->nested.msrpm[p] = svm->msrpm[p] | value; @@ -3054,7 +2973,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ r = cr_interception(svm); - if (irqchip_in_kernel(svm->vcpu.kvm)) + if (lapic_in_kernel(&svm->vcpu)) return r; if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; @@ -3065,46 +2984,45 @@ static int cr8_write_interception(struct vcpu_svm *svm) static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); - return vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, host_tsc); + return vmcb->control.tsc_offset + host_tsc; } -static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) +static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); - switch (ecx) { + switch (msr_info->index) { case MSR_IA32_TSC: { - *data = svm->vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, native_read_tsc()); + msr_info->data = svm->vmcb->control.tsc_offset + + kvm_scale_tsc(vcpu, rdtsc()); break; } case MSR_STAR: - *data = svm->vmcb->save.star; + msr_info->data = svm->vmcb->save.star; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - *data = svm->vmcb->save.lstar; + msr_info->data = svm->vmcb->save.lstar; break; case MSR_CSTAR: - *data = svm->vmcb->save.cstar; + msr_info->data = svm->vmcb->save.cstar; break; case MSR_KERNEL_GS_BASE: - *data = svm->vmcb->save.kernel_gs_base; + msr_info->data = svm->vmcb->save.kernel_gs_base; break; case MSR_SYSCALL_MASK: - *data = svm->vmcb->save.sfmask; + msr_info->data = svm->vmcb->save.sfmask; break; #endif case MSR_IA32_SYSENTER_CS: - *data = svm->vmcb->save.sysenter_cs; + msr_info->data = svm->vmcb->save.sysenter_cs; break; case MSR_IA32_SYSENTER_EIP: - *data = svm->sysenter_eip; + msr_info->data = svm->sysenter_eip; break; case MSR_IA32_SYSENTER_ESP: - *data = svm->sysenter_esp; + msr_info->data = svm->sysenter_esp; break; /* * Nobody will change the following 5 values in the VMCB so we can @@ -3112,31 +3030,31 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) * implemented. 
*/ case MSR_IA32_DEBUGCTLMSR: - *data = svm->vmcb->save.dbgctl; + msr_info->data = svm->vmcb->save.dbgctl; break; case MSR_IA32_LASTBRANCHFROMIP: - *data = svm->vmcb->save.br_from; + msr_info->data = svm->vmcb->save.br_from; break; case MSR_IA32_LASTBRANCHTOIP: - *data = svm->vmcb->save.br_to; + msr_info->data = svm->vmcb->save.br_to; break; case MSR_IA32_LASTINTFROMIP: - *data = svm->vmcb->save.last_excp_from; + msr_info->data = svm->vmcb->save.last_excp_from; break; case MSR_IA32_LASTINTTOIP: - *data = svm->vmcb->save.last_excp_to; + msr_info->data = svm->vmcb->save.last_excp_to; break; case MSR_VM_HSAVE_PA: - *data = svm->nested.hsave_msr; + msr_info->data = svm->nested.hsave_msr; break; case MSR_VM_CR: - *data = svm->nested.vm_cr_msr; + msr_info->data = svm->nested.vm_cr_msr; break; case MSR_IA32_UCODE_REV: - *data = 0x01000065; + msr_info->data = 0x01000065; break; default: - return kvm_get_msr_common(vcpu, ecx, data); + return kvm_get_msr_common(vcpu, msr_info); } return 0; } @@ -3144,16 +3062,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) static int rdmsr_interception(struct vcpu_svm *svm) { u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); - u64 data; + struct msr_data msr_info; - if (svm_get_msr(&svm->vcpu, ecx, &data)) { + msr_info.index = ecx; + msr_info.host_initiated = false; + if (svm_get_msr(&svm->vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); } else { - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(ecx, msr_info.data); - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff); - kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32); + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, + msr_info.data & 0xffffffff); + kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, + msr_info.data >> 32); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; skip_emulated_instruction(&svm->vcpu); } @@ -3284,24 +3206,11 @@ static int msr_interception(struct vcpu_svm *svm) static int interrupt_window_interception(struct vcpu_svm *svm) { - struct kvm_run *kvm_run = svm->vcpu.run; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); ++svm->vcpu.stat.irq_window_exits; - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!irqchip_in_kernel(svm->vcpu.kvm) && - kvm_run->request_interrupt_window && - !kvm_cpu_has_interrupt(&svm->vcpu)) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } - return 1; } @@ -3361,6 +3270,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, + [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, @@ -3390,6 +3300,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, + [SVM_EXIT_RSM] = emulate_on_interception, }; static void dump_vmcb(struct kvm_vcpu *vcpu) @@ -3511,6 +3422,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; + trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if 
(npt_enabled) @@ -3648,12 +3561,12 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static int svm_vm_has_apicv(struct kvm *kvm) +static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu) { return 0; } -static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu) { return; } @@ -3743,7 +3656,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) */ svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_bp_intercept(vcpu); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -3982,8 +3894,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM); - if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_handle_nmi(&svm->vcpu); @@ -4075,6 +3985,11 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } +static bool svm_has_high_real_mode_segbase(void) +{ + return true; +} + static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; @@ -4082,6 +3997,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) static void svm_cpuid_update(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + + /* Update nrips enabled cache */ + svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); } static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -4350,6 +4269,7 @@ static struct kvm_x86_ops svm_x86_ops = { .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, + .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase, .vcpu_create = svm_create_vcpu, .vcpu_free = svm_free_vcpu, @@ -4359,7 +4279,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .update_db_bp_intercept = update_db_bp_intercept, + .update_bp_intercept = update_bp_intercept, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, @@ -4408,7 +4328,7 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, - .vm_has_apicv = svm_vm_has_apicv, + .cpu_uses_apicv = svm_cpu_uses_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, @@ -4431,11 +4351,9 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .set_tsc_khz = svm_set_tsc_khz, .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, - .adjust_tsc_offset = svm_adjust_tsc_offset, - .compute_tsc_offset = svm_compute_tsc_offset, + .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest, .read_l1_tsc = svm_read_l1_tsc, .set_tdp_cr3 = set_tdp_cr3, @@ -4444,6 +4362,8 @@ static struct kvm_x86_ops svm_x86_ops = { .handle_external_intr = svm_handle_external_intr, .sched_in = svm_sched_in, + + .pmu_ops = &amd_pmu_ops, }; static int __init svm_init(void) diff --git a/kernel/arch/x86/kvm/trace.h b/kernel/arch/x86/kvm/trace.h index 7c7bc8bef..ab9ae67a8 100644 --- a/kernel/arch/x86/kvm/trace.h +++ b/kernel/arch/x86/kvm/trace.h @@ -129,6 +129,24 @@ TRACE_EVENT(kvm_pio, ); /* + * Tracepoint for fast mmio. 
+ */ +TRACE_EVENT(kvm_fast_mmio, + TP_PROTO(u64 gpa), + TP_ARGS(gpa), + + TP_STRUCT__entry( + __field(u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = gpa; + ), + + TP_printk("fast mmio at gpa 0x%llx", __entry->gpa) +); + +/* * Tracepoint for cpuid. */ TRACE_EVENT(kvm_cpuid, @@ -250,7 +268,7 @@ TRACE_EVENT(kvm_inj_virq, #define kvm_trace_sym_exc \ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ - EXS(MF), EXS(MC) + EXS(MF), EXS(AC), EXS(MC) /* * Tracepoint for kvm interrupt injection: @@ -952,6 +970,61 @@ TRACE_EVENT(kvm_wait_lapic_expire, __entry->delta < 0 ? "early" : "late") ); +TRACE_EVENT(kvm_enter_smm, + TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering), + TP_ARGS(vcpu_id, smbase, entering), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( u64, smbase ) + __field( bool, entering ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->smbase = smbase; + __entry->entering = entering; + ), + + TP_printk("vcpu %u: %s SMM, smbase 0x%llx", + __entry->vcpu_id, + __entry->entering ? "entering" : "leaving", + __entry->smbase) +); + +/* + * Tracepoint for VT-d posted-interrupts. + */ +TRACE_EVENT(kvm_pi_irte_update, + TP_PROTO(unsigned int vcpu_id, unsigned int gsi, + unsigned int gvec, u64 pi_desc_addr, bool set), + TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, gsi ) + __field( unsigned int, gvec ) + __field( u64, pi_desc_addr ) + __field( bool, set ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->gsi = gsi; + __entry->gvec = gvec; + __entry->pi_desc_addr = pi_desc_addr; + __entry->set = set; + ), + + TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, " + "gvec: 0x%x, pi_desc_addr: 0x%llx", + __entry->set ? 
"enabled and being updated" : "disabled", + __entry->vcpu_id, + __entry->gsi, + __entry->gvec, + __entry->pi_desc_addr) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/kernel/arch/x86/kvm/vmx.c b/kernel/arch/x86/kvm/vmx.c index 2d73807f0..0958fa2b7 100644 --- a/kernel/arch/x86/kvm/vmx.c +++ b/kernel/arch/x86/kvm/vmx.c @@ -28,26 +28,28 @@ #include <linux/sched.h> #include <linux/moduleparam.h> #include <linux/mod_devicetable.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/slab.h> #include <linux/tboot.h> #include <linux/hrtimer.h> #include "kvm_cache_regs.h" #include "x86.h" +#include <asm/cpu.h> #include <asm/io.h> #include <asm/desc.h> #include <asm/vmx.h> #include <asm/virtext.h> #include <asm/mce.h> -#include <asm/i387.h> -#include <asm/xcr.h> +#include <asm/fpu/internal.h> #include <asm/perf_event.h> #include <asm/debugreg.h> #include <asm/kexec.h> #include <asm/apic.h> +#include <asm/irq_remapping.h> #include "trace.h" +#include "pmu.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) #define __ex_clear(x, reg) \ @@ -105,6 +107,8 @@ static u64 __read_mostly host_xss; static bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -424,6 +428,9 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + u16 vpid02; + u16 last_vpid; + u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; u32 nested_vmx_true_procbased_ctls_low; @@ -440,14 +447,33 @@ struct nested_vmx { u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; + u32 nested_vmx_vpid_caps; }; #define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ - u32 control; /* bit 0 of control is outstanding notification bit */ - u32 rsvd[7]; + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; } __aligned(64); static bool pi_test_and_set_on(struct pi_desc *pi_desc) @@ -467,6 +493,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + return clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + return set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -532,8 +582,6 @@ struct vcpu_vmx { s64 vnmi_blocked_time; u32 exit_reason; - bool rdtscp_enabled; - /* Posted interrupt descriptor */ struct pi_desc pi_desc; @@ -547,6 +595,8 @@ struct vcpu_vmx { /* Support for PML */ #define 
PML_ENTITY_NUM 512 struct page *pml_pg; + + u64 current_tsc_ratio; }; enum segment_cache_field { @@ -563,6 +613,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [number] = VMCS12_OFFSET(name) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ @@ -786,7 +841,7 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) { - struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); + struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT); if (is_error_page(page)) return NULL; @@ -809,7 +864,7 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); -static int vmx_vm_has_apicv(struct kvm *kvm); +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -831,6 +886,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct desc_ptr, host_gdt); +/* + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * can find which vCPU should be waken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; @@ -946,9 +1008,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void) return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } -static inline bool vm_need_tpr_shadow(struct kvm *kvm) +static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) { - return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); + return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); } static inline bool cpu_has_secondary_exec_ctrls(void) @@ -983,7 +1045,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) static inline bool cpu_has_vmx_posted_intr(void) { - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; + return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && + vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_vmx_apicv(void) @@ -1062,9 +1125,9 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) +static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { - return flexpriority_enabled && irqchip_in_kernel(kvm); + return flexpriority_enabled && lapic_in_kernel(vcpu); } static inline bool cpu_has_vmx_vpid(void) @@ -1113,6 +1176,12 @@ static inline bool cpu_has_vmx_pml(void) return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; } +static inline bool cpu_has_vmx_tsc_scaling(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_TSC_SCALING; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -1157,6 +1226,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); } +static inline bool 
nested_cpu_has_vpid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); +} + static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); @@ -1264,7 +1338,7 @@ static void vmcs_load(struct vmcs *vmcs) vmcs, phys_addr); } -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE /* * This bitmap is used to indicate whether the vmclear * operation is enabled on all cpus. All disabled by @@ -1302,7 +1376,7 @@ static void crash_vmclear_local_loaded_vmcss(void) #else static inline void crash_enable_local_vmclear(int cpu) { } static inline void crash_disable_local_vmclear(int cpu) { } -#endif /* CONFIG_KEXEC */ +#endif /* CONFIG_KEXEC_CORE */ static void __loaded_vmcs_clear(void *arg) { @@ -1337,13 +1411,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } -static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_single(int vpid) { - if (vmx->vpid == 0) + if (vpid == 0) return; if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); } static inline void vpid_sync_vcpu_global(void) @@ -1352,10 +1426,10 @@ static inline void vpid_sync_vcpu_global(void) __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); } -static inline void vpid_sync_context(struct vcpu_vmx *vmx) +static inline void vpid_sync_context(int vpid) { if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vmx); + vpid_sync_vcpu_single(vpid); else vpid_sync_vcpu_global(); } @@ -1567,7 +1641,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << NM_VECTOR) | (1u << DB_VECTOR); + (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) @@ -1674,6 +1748,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, return; } break; + case MSR_IA32_PEBS_ENABLE: + /* PEBS needs a quiescent period after being disabled (to write + * a record). Disabling PEBS through VMX MSR swapping doesn't + * provide that period, so a CPU could write host's record into + * guest's memory. + */ + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } for (i = 0; i < m->nr; ++i) @@ -1711,26 +1792,31 @@ static void reload_tss(void) static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) { - u64 guest_efer; - u64 ignore_bits; + u64 guest_efer = vmx->vcpu.arch.efer; + u64 ignore_bits = 0; - guest_efer = vmx->vcpu.arch.efer; + if (!enable_ept) { + /* + * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing + * host CPUID is more efficient than testing guest CPUID + * or CR4. Host SMEP is anyway a requirement for guest SMEP. + */ + if (boot_cpu_has(X86_FEATURE_SMEP)) + guest_efer |= EFER_NX; + else if (!(guest_efer & EFER_NX)) + ignore_bits |= EFER_NX; + } /* - * NX is emulated; LMA and LME handled by hardware; SCE meaningless - * outside long mode + * LMA and LME handled by hardware; SCE meaningless outside long mode. 
*/ - ignore_bits = EFER_NX | EFER_SCE; + ignore_bits |= EFER_SCE; #ifdef CONFIG_X86_64 ignore_bits |= EFER_LMA | EFER_LME; /* SCE is meaningful only in long mode on Intel */ if (guest_efer & EFER_LMA) ignore_bits &= ~(u64)EFER_SCE; #endif - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; clear_atomic_switch_msr(vmx, MSR_EFER); @@ -1741,16 +1827,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) */ if (cpu_has_load_ia32_efer || (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { - guest_efer = vmx->vcpu.arch.efer; if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; if (guest_efer != host_efer) add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); return false; - } + } else { + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; - return true; + vmx->guest_msrs[efer_offset].data = guest_efer; + vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + + return true; + } } static unsigned long segment_base(u16 selector) @@ -1883,7 +1974,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. */ - if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded) + if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded) stts(); load_gdt(this_cpu_ptr(&host_gdt)); } @@ -1895,6 +1986,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) preempt_enable(); } +static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + /* + * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there + * are two possible cases: + * 1. After running 'pre_block', context switch + * happened. For this case, 'sn' was set in + * vmx_vcpu_put(), so we need to clear it here. + * 2. After running 'pre_block', we were blocked, + * and woken up by some other guy. For this case, + * we don't need to do anything, 'pi_post_block' + * will do everything for us. However, we cannot + * check whether it is case #1 or case #2 here + * (maybe, not needed), so we also clear sn here, + * I think it is not a big deal. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { + if (vcpu->cpu != cpu) { + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + } + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); +} /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. 
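The vmx_vcpu_pi_load() hunk above rewrites the posted-interrupt descriptor by building a new 64-bit control word (clear SN, refresh the notification vector and destination) and retrying with a compare-and-swap until no concurrent writer interfered. The following standalone C sketch shows the same pattern in user space; it is not kernel code — the descriptor layout merely mirrors the pi_desc union earlier in this patch, the vector numbers and helper name are made up for illustration, and GCC/Clang __atomic builtins are assumed.

/* User-space sketch of the vmx_vcpu_pi_load() update pattern:
 * read the 64-bit control word, build the new value, and retry
 * with a compare-and-swap until it sticks.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct pi_desc_sketch {
	uint32_t pir[8];		/* posted interrupt requests */
	union {
		struct {
			uint16_t on : 1;	/* outstanding notification */
			uint16_t sn : 1;	/* suppress notification */
			uint16_t rsvd_1 : 14;
			uint8_t  nv;		/* notification vector */
			uint8_t  rsvd_2;
			uint32_t ndst;		/* notification destination */
		};
		uint64_t control;
	};
	uint32_t rsvd[6];
} __attribute__((aligned(64)));

#define WAKEUP_VECTOR 0xf1		/* hypothetical vector numbers */
#define NOTIFY_VECTOR 0xf2

static void pi_load_sketch(struct pi_desc_sketch *pi, uint32_t dest, bool x2apic)
{
	struct pi_desc_sketch old, new;

	do {
		old.control = new.control =
			__atomic_load_n(&pi->control, __ATOMIC_RELAXED);

		if (new.nv != WAKEUP_VECTOR) {
			/* xAPIC keeps the 8-bit APIC ID in bits 15:8,
			 * x2APIC uses the full 32-bit ID. */
			new.ndst = x2apic ? dest : (dest << 8) & 0xFF00;
			new.nv = NOTIFY_VECTOR;
		}
		new.sn = 0;	/* allow posting non-urgent interrupts again */
	} while (!__atomic_compare_exchange_n(&pi->control, &old.control,
					      new.control, false,
					      __ATOMIC_ACQ_REL, __ATOMIC_RELAXED));
}

int main(void)
{
	struct pi_desc_sketch pi = { 0 };

	pi.sn = 1;			/* as if the vCPU had been preempted */
	pi_load_sketch(&pi, 3, false);
	printf("sn=%u nv=0x%02x ndst=0x%08x\n",
	       (unsigned)pi.sn, (unsigned)pi.nv, (unsigned)pi.ndst);
	return 0;
}

Compiled with gcc or clang, the example prints the suppress bit cleared, the notification vector set, and the xAPIC-encoded destination (0x00000300 for APIC ID 3), which is the state the kernel loop establishes before the vCPU re-enters guest mode.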
@@ -1943,12 +2080,37 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + vmx->loaded_vmcs->cpu = cpu; } + + /* Setup TSC multiplier */ + if (kvm_has_tsc_control && + vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) { + vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio; + vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); + } + + vmx_vcpu_pi_load(vcpu, cpu); +} + +static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { + vmx_vcpu_pi_put(vcpu); + __vmx_load_host_state(to_vmx(vcpu)); if (!vmm_exclusive) { __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); @@ -2170,8 +2332,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_nested; - else if (irqchip_in_kernel(vcpu->kvm) && - apic_x2apic_mode(vcpu->arch.apic)) { + else if (vcpu->arch.apic_base & X2APIC_ENABLE) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else @@ -2208,7 +2369,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && vmx->rdtscp_enabled) + if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) move_msr_up(vmx, index, save_nmsrs++); /* * MSR_STAR is only needed on long mode guests, and only @@ -2231,15 +2392,16 @@ static void setup_msrs(struct vcpu_vmx *vmx) /* * reads and returns guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset -- 21.3 + * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset + * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3 */ -static u64 guest_read_tsc(void) +static u64 guest_read_tsc(struct kvm_vcpu *vcpu) { u64 host_tsc, tsc_offset; - rdtscll(host_tsc); + host_tsc = rdtsc(); tsc_offset = vmcs_read64(TSC_OFFSET); - return host_tsc + tsc_offset; + return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; } /* @@ -2256,22 +2418,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) return host_tsc + tsc_offset; } -/* - * Engage any workarounds for mis-matched TSC rates. Currently limited to - * software catchup for faster rates on slower CPUs. 
- */ -static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - if (!scale) - return; - - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - } else - WARN(1, "user requested TSC rate below hardware speed\n"); -} - static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) { return vmcs_read64(TSC_OFFSET); @@ -2303,7 +2449,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) +static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { u64 offset = vmcs_read64(TSC_OFFSET); @@ -2316,11 +2462,6 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho offset + adjustment); } -static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) -{ - return target_tsc - native_read_tsc(); -} - static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); @@ -2378,7 +2519,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (vmx_cpu_uses_apicv(&vmx->vcpu)) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_POSTED_INTR; @@ -2444,10 +2585,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | #endif CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | - CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | - CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | + CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | + CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | + CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the * hardware. For example, L1 can specify an MSR bitmap - and we @@ -2472,10 +2613,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_PCOMMIT; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -2494,6 +2637,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + if (enable_vpid) + vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + else + vmx->nested.nested_vmx_vpid_caps = 0; + if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -2609,7 +2758,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = vmx->nested.nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps | + ((u64)vmx->nested.nested_vmx_vpid_caps << 32); break; default: return 1; @@ -2623,76 +2773,69 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * Returns 0 on success, non-0 otherwise. 
* Assumes vcpu_load() was already called. */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - u64 data; struct shared_msr_entry *msr; - if (!pdata) { - printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); - return -EINVAL; - } - - switch (msr_index) { + switch (msr_info->index) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: - data = vmcs_readl(GUEST_FS_BASE); + msr_info->data = vmcs_readl(GUEST_FS_BASE); break; case MSR_GS_BASE: - data = vmcs_readl(GUEST_GS_BASE); + msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: vmx_load_host_state(to_vmx(vcpu)); - data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base; break; #endif case MSR_EFER: - return kvm_get_msr_common(vcpu, msr_index, pdata); + return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_TSC: - data = guest_read_tsc(); + msr_info->data = guest_read_tsc(vcpu); break; case MSR_IA32_SYSENTER_CS: - data = vmcs_read32(GUEST_SYSENTER_CS); + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP: - data = vmcs_readl(GUEST_SYSENTER_EIP); + msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP: - data = vmcs_readl(GUEST_SYSENTER_ESP); + msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: if (!vmx_mpx_supported()) return 1; - data = vmcs_read64(GUEST_BNDCFGS); + msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; case MSR_IA32_FEATURE_CONTROL: if (!nested_vmx_allowed(vcpu)) return 1; - data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; - return vmx_get_vmx_msr(vcpu, msr_index, pdata); + return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; - data = vcpu->arch.ia32_xss; + msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: - if (!to_vmx(vcpu)->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) return 1; /* Otherwise falls through */ default: - msr = find_msr_entry(to_vmx(vcpu), msr_index); + msr = find_msr_entry(to_vmx(vcpu), msr_info->index); if (msr) { - data = msr->data; + msr_info->data = msr->data; break; } - return kvm_get_msr_common(vcpu, msr_index, pdata); + return kvm_get_msr_common(vcpu, msr_info); } - *pdata = data; return 0; } @@ -2787,7 +2930,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; case MSR_TSC_AUX: - if (!vmx->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) return 1; /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) @@ -2882,6 +3025,8 @@ static int hardware_enable(void) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); /* * Now we can enable the vmclear operation in kdump @@ -3023,7 +3168,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_ENABLE_PML; + SECONDARY_EXEC_ENABLE_PML | + SECONDARY_EXEC_PCOMMIT | + SECONDARY_EXEC_TSC_SCALING; if (adjust_vmx_controls(min2, opt2, 
MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3158,7 +3305,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); + pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); @@ -3431,12 +3578,12 @@ static void enter_lmode(struct kvm_vcpu *vcpu) vmx_segment_cache_clear(to_vmx(vcpu)); guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); - if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { + if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { pr_debug_ratelimited("%s: tss fixup for long mode. \n", __func__); vmcs_write32(GUEST_TR_AR_BYTES, - (guest_tr_ar & ~AR_TYPE_MASK) - | AR_TYPE_BUSY_64_TSS); + (guest_tr_ar & ~VMX_AR_TYPE_MASK) + | VMX_AR_TYPE_BUSY_64_TSS); } vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); } @@ -3449,9 +3596,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(to_vmx(vcpu)); + vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -3459,6 +3606,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) } } +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -3652,20 +3804,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; - /* - * SMEP/SMAP is disabled if CPU is in non-paging mode - * in hardware. However KVM always uses paging mode to - * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP/SMAP needs to be - * manually disabled when guest switches to non-paging - * mode. - */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } } + if (!enable_unrestricted_guest && !is_paging(vcpu)) + /* + * SMEP/SMAP is disabled if CPU is in non-paging mode in + * hardware. However KVM always uses paging mode without + * unrestricted guest. + * To emulate this behavior, SMEP/SMAP needs to be manually + * disabled when guest switches to non-paging mode. 
+ */ + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); return 0; @@ -3727,7 +3880,7 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu) return 0; else { int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); - return AR_DPL(ar); + return VMX_AR_DPL(ar); } } @@ -3855,11 +4008,11 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu) if (cs.unusable) return false; - if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) + if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) return false; if (!cs.s) return false; - if (cs.type & AR_TYPE_WRITEABLE_MASK) { + if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { if (cs.dpl > cs_rpl) return false; } else { @@ -3909,7 +4062,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) return false; if (!var.present) return false; - if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { + if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { if (var.dpl < rpl) /* DPL < RPL */ return false; } @@ -4113,17 +4266,13 @@ static void seg_setup(int seg) static int alloc_apic_access_page(struct kvm *kvm) { struct page *page; - struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page_done) goto out; - kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); if (r) goto out; @@ -4148,44 +4297,38 @@ static int alloc_identity_pagetable(struct kvm *kvm) { /* Called with kvm->slots_lock held. 
*/ - struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; BUG_ON(kvm->arch.ept_identity_pagetable_done); - kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = - kvm->arch.ept_identity_map_addr; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + kvm->arch.ept_identity_map_addr, PAGE_SIZE); return r; } -static void allocate_vpid(struct vcpu_vmx *vmx) +static int allocate_vpid(void) { int vpid; - vmx->vpid = 0; if (!enable_vpid) - return; + return 0; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); - if (vpid < VMX_NR_VPIDS) { - vmx->vpid = vpid; + if (vpid < VMX_NR_VPIDS) __set_bit(vpid, vmx_vpid_bitmap); - } + else + vpid = 0; spin_unlock(&vmx_vpid_lock); + return vpid; } -static void free_vpid(struct vcpu_vmx *vmx) +static void free_vpid(int vpid) { - if (!enable_vpid) + if (!enable_vpid || vpid == 0) return; spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); + __clear_bit(vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); } @@ -4340,9 +4483,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) msr, MSR_TYPE_W); } -static int vmx_vm_has_apicv(struct kvm *kvm) +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) { - return enable_apicv && irqchip_in_kernel(kvm); + return enable_apicv && lapic_in_kernel(vcpu); } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) @@ -4386,6 +4529,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Currently, we don't support urgent interrupt, + * all interrupts are recognized as non-urgent + * interrupt, so we cannot post interrupts when + * 'SN' is set. + * + * If the vcpu is in guest mode, it means it is + * running instead of being scheduled out and + * waiting in the run queue, and that's the only + * case when 'SN' is set currently, warning if + * 'SN' is set. 
+ */ + WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); return true; @@ -4522,7 +4681,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; return pin_based_exec_ctrl; } @@ -4534,7 +4693,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + if (!cpu_need_tpr_shadow(&vmx->vcpu)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 exec_control |= CPU_BASED_CR8_STORE_EXITING | @@ -4551,7 +4710,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; @@ -4565,7 +4724,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; @@ -4575,8 +4734,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - /* PML is enabled/disabled in creating/destorying vcpu */ - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + + if (!enable_pml) + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + + /* Currently, we allow L1 guest to directly run pcommit instruction. 
*/ + exec_control &= ~SECONDARY_EXEC_PCOMMIT; return exec_control; } @@ -4621,12 +4784,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { + if (cpu_has_secondary_exec_ctrls()) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); - } - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { + if (vmx_cpu_uses_apicv(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -4667,16 +4829,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - u32 msr_low, msr_high; - u64 host_pat; - rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); - host_pat = msr_low | ((u64) msr_high << 32); - /* Write the default value follow host pat */ - vmcs_write64(GUEST_IA32_PAT, host_pat); - /* Keep arch.pat sync with GUEST_IA32_PAT */ - vmx->vcpu.arch.pat = host_pat; - } + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { u32 index = vmx_msr_index[i]; @@ -4708,22 +4862,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } -static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct msr_data apic_base_msr; + u64 cr0; vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - kvm_set_cr8(&vmx->vcpu, 0); - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(&vmx->vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); + kvm_set_cr8(vcpu, 0); + + if (!init_event) { + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; + apic_base_msr.host_initiated = true; + kvm_set_apic_base(vcpu, &apic_base_msr); + } vmx_segment_cache_clear(vmx); @@ -4747,9 +4906,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); + if (!init_event) { + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + } vmcs_writel(GUEST_RFLAGS, 0x02); kvm_rip_write(vcpu, 0xfff0); @@ -4764,37 +4926,35 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - /* Special registers */ - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - setup_msrs(vmx); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - if (cpu_has_vmx_tpr_shadow()) { + if (cpu_has_vmx_tpr_shadow() && !init_event) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vmx->vcpu.kvm)) + if (cpu_need_tpr_shadow(vcpu)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vmx->vcpu.arch.apic->regs)); + __pa(vcpu->arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); } kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (vmx_vm_has_apicv(vcpu->kvm)) + if 
(vmx_cpu_uses_apicv(vcpu)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ - vmx_set_cr4(&vmx->vcpu, 0); - vmx_set_efer(&vmx->vcpu, 0); - vmx_fpu_activate(&vmx->vcpu); - update_exception_bitmap(&vmx->vcpu); + cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + vmx_set_cr0(vcpu, cr0); /* enter rmode */ + vmx->vcpu.arch.cr0 = cr0; + vmx_set_cr4(vcpu, 0); + vmx_set_efer(vcpu, 0); + vmx_fpu_activate(vcpu); + update_exception_bitmap(vcpu); - vpid_sync_context(vmx); + vpid_sync_context(vmx->vpid); } /* @@ -4958,14 +5118,9 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { int ret; - struct kvm_userspace_memory_region tss_mem = { - .slot = TSS_PRIVATE_MEMSLOT, - .guest_phys_addr = addr, - .memory_size = PAGE_SIZE * 3, - .flags = 0, - }; - ret = kvm_set_memory_region(kvm, &tss_mem); + ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, + PAGE_SIZE * 3); if (ret) return ret; kvm->arch.tss_addr = addr; @@ -5127,6 +5282,9 @@ static int handle_exception(struct kvm_vcpu *vcpu) return handle_rmode_exception(vcpu, ex_no, error_code); switch (ex_no) { + case AC_VECTOR: + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); + return 1; case DB_VECTOR: dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & @@ -5319,7 +5477,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); kvm_complete_insn_gp(vcpu, err); - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return 1; if (cr8_prev <= cr8) return 1; @@ -5475,19 +5633,21 @@ static int handle_cpuid(struct kvm_vcpu *vcpu) static int handle_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data; + struct msr_data msr_info; - if (vmx_get_msr(vcpu, ecx, &data)) { + msr_info.index = ecx; + msr_info.host_initiated = false; + if (vmx_get_msr(vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(ecx, msr_info.data); /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; + vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; + vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; skip_emulated_instruction(vcpu); return 1; } @@ -5531,17 +5691,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); ++vcpu->stat.irq_window_exits; - - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!irqchip_in_kernel(vcpu->kvm) && - vcpu->run->request_interrupt_window && - !kvm_cpu_has_interrupt(vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } return 1; } @@ -5710,9 +5859,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) return 0; } - /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155); - /* * TODO: What about debug traps on tss switch? * Are we supposed to inject them and update dr6? 
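handle_rdmsr() in the hunk above hands the 64-bit MSR value back to the guest as EDX:EAX — the low 32 bits land in RAX and the high 32 bits in RDX, which is what the "data & -1u" / "(data >> 32) & -1u" pair computes. A minimal user-space illustration of that split, with a made-up value and no KVM interfaces involved:

/* Sketch of the EDX:EAX split performed by handle_rdmsr() above. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t data = 0x123456789abcdef0ULL;	/* pretend RDMSR result */

	/* RDMSR returns the low half in EAX and the high half in EDX,
	 * mirroring "data & -1u" and "(data >> 32) & -1u" in the patch. */
	uint32_t eax = (uint32_t)(data & 0xffffffffu);
	uint32_t edx = (uint32_t)((data >> 32) & 0xffffffffu);

	/* the guest reassembles the value as EDX:EAX */
	uint64_t reassembled = ((uint64_t)edx << 32) | eax;

	printf("eax=0x%08x edx=0x%08x reassembled=0x%016llx\n",
	       eax, edx, (unsigned long long)reassembled);
	return 0;
}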
@@ -5769,82 +5915,19 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } -static u64 ept_rsvd_mask(u64 spte, int level) -{ - int i; - u64 mask = 0; - - for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) - mask |= (1ULL << i); - - if (level == 4) - /* bits 7:3 reserved */ - mask |= 0xf8; - else if (spte & (1ULL << 7)) - /* - * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively, - * level == 1 if the hypervisor is using the ignored bit 7. - */ - mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE; - else if (level > 1) - /* bits 6:3 reserved */ - mask |= 0x78; - - return mask; -} - -static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, - int level) -{ - printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); - - /* 010b (write-only) */ - WARN_ON((spte & 0x7) == 0x2); - - /* 110b (write/execute) */ - WARN_ON((spte & 0x7) == 0x6); - - /* 100b (execute-only) and value not supported by logical processor */ - if (!cpu_has_vmx_ept_execute_only()) - WARN_ON((spte & 0x7) == 0x4); - - /* not 000b */ - if ((spte & 0x7)) { - u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); - - if (rsvd_bits != 0) { - printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", - __func__, rsvd_bits); - WARN_ON(1); - } - - /* bits 5:3 are _not_ reserved for large page or leaf page */ - if ((rsvd_bits & 0x38) == 0) { - u64 ept_mem_type = (spte & 0x38) >> 3; - - if (ept_mem_type == 2 || ept_mem_type == 3 || - ept_mem_type == 7) { - printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", - __func__, ept_mem_type); - WARN_ON(1); - } - } - } -} - static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { - u64 sptes[4]; - int nr_sptes, i, ret; + int ret; gpa_t gpa; gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { skip_emulated_instruction(vcpu); + trace_kvm_fast_mmio(gpa); return 1; } - ret = handle_mmio_page_fault_common(vcpu, gpa, true); + ret = handle_mmio_page_fault(vcpu, gpa, true); if (likely(ret == RET_MMIO_PF_EMULATE)) return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == EMULATE_DONE; @@ -5856,13 +5939,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) return 1; /* It is the real ept misconfig */ - printk(KERN_ERR "EPT: Misconfiguration.\n"); - printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); - - nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); - - for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) - ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); + WARN_ON(1); vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; @@ -6004,6 +6081,25 @@ static void update_ple_window_actual_max(void) ple_window_grow, INT_MIN); } +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 
+ */ +static void wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6122,6 +6218,12 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_apicv()) enable_apicv = 0; + if (cpu_has_vmx_tsc_scaling()) { + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_tsc_scaling_ratio_frac_bits = 48; + } + if (enable_apicv) kvm_x86_ops->update_cr8_intercept = NULL; else { @@ -6144,6 +6246,8 @@ static __init int hardware_setup(void) memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + if (enable_apicv) { for (msr = 0x800; msr <= 0x8ff; msr++) vmx_disable_intercept_msr_read_x2apic(msr); @@ -6188,6 +6292,8 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + kvm_set_posted_intr_wakeup_handler(wakeup_handler); + return alloc_kvm_area(); out8: @@ -6256,6 +6362,11 @@ static int handle_mwait(struct kvm_vcpu *vcpu) return handle_nop(vcpu); } +static int handle_monitor_trap(struct kvm_vcpu *vcpu) +{ + return 1; +} + static int handle_monitor(struct kvm_vcpu *vcpu) { printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); @@ -6418,8 +6529,12 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) */ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, - u32 vmx_instruction_info, gva_t *ret) + u32 vmx_instruction_info, bool wr, gva_t *ret) { + gva_t off; + bool exn; + struct kvm_segment s; + /* * According to Vol. 3B, "Information for VM Exits Due to Instruction * Execution", on an exit, vmx_instruction_info holds most of the @@ -6444,22 +6559,63 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ - *ret = vmx_get_segment_base(vcpu, seg_reg); + off = exit_qualification; /* holds the displacement */ if (base_is_valid) - *ret += kvm_register_read(vcpu, base_reg); + off += kvm_register_read(vcpu, base_reg); if (index_is_valid) - *ret += kvm_register_read(vcpu, index_reg)<<scaling; - *ret += exit_qualification; /* holds the displacement */ + off += kvm_register_read(vcpu, index_reg)<<scaling; + vmx_get_segment(vcpu, &s, seg_reg); + *ret = s.base + off; if (addr_size == 1) /* 32 bit */ *ret &= 0xffffffff; - /* - * TODO: throw #GP (and return 1) in various cases that the VM* - * instructions require it - e.g., offset beyond segment limit, - * unusable or unreadable/unwritable segment, non-canonical 64-bit - * address, and so on. Currently these are not checked. - */ + /* Checks for #GP/#SS exceptions. */ + exn = false; + if (is_protmode(vcpu)) { + /* Protected mode: apply checks for segment validity in the + * following order: + * - segment type check (#GP(0) may be thrown) + * - usability check (#GP(0)/#SS(0)) + * - limit check (#GP(0)/#SS(0)) + */ + if (wr) + /* #GP(0) if the destination operand is located in a + * read-only data segment or any code segment. 
+ */ + exn = ((s.type & 0xa) == 0 || (s.type & 8)); + else + /* #GP(0) if the source operand is located in an + * execute-only code segment + */ + exn = ((s.type & 0xa) == 8); + } + if (exn) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + if (is_long_mode(vcpu)) { + /* Long mode: #GP(0)/#SS(0) if the memory address is in a + * non-canonical form. This is an only check for long mode. + */ + exn = is_noncanonical_address(*ret); + } else if (is_protmode(vcpu)) { + /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. + */ + exn = (s.unusable != 0); + /* Protected mode: #GP(0)/#SS(0) if the memory + * operand is outside the segment limit. + */ + exn = exn || (off + sizeof(u64) > s.limit); + } + if (exn) { + kvm_queue_exception_e(vcpu, + seg_reg == VCPU_SREG_SS ? + SS_VECTOR : GP_VECTOR, + 0); + return 1; + } + return 0; } @@ -6481,7 +6637,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, int maxphyaddr = cpuid_maxphyaddr(vcpu); if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) + vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, @@ -6669,7 +6825,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { - u32 exec_control; if (vmx->nested.current_vmptr == -1ull) return; @@ -6682,9 +6837,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); } vmx->nested.posted_intr_nv = -1; @@ -6704,6 +6858,7 @@ static void free_nested(struct vcpu_vmx *vmx) return; vmx->nested.vmxon = false; + free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); @@ -7009,7 +7164,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) field_value); } else { if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &gva)) + vmx_instruction_info, true, &gva)) return 1; /* _system ok, as nested_vmx_check_permission verified cpl=0 */ kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, @@ -7046,7 +7201,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) (((vmx_instruction_info) >> 3) & 0xf)); else { if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &gva)) + vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &field_value, (is_64_bit_mode(vcpu) ? 
8 : 4), &e)) { @@ -7080,7 +7235,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; - u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7112,9 +7266,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; if (enable_shadow_vmcs) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->nested.current_shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; @@ -7138,7 +7291,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &vmcs_gva)) + vmx_instruction_info, true, &vmcs_gva)) return 1; /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, @@ -7194,7 +7347,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) * operand is read even if it isn't needed (e.g., for type==global) */ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmx_instruction_info, &gva)) + vmx_instruction_info, false, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, sizeof(operand), &e)) { @@ -7220,7 +7373,58 @@ static int handle_invept(struct kvm_vcpu *vcpu) static int handle_invvpid(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info; + unsigned long type, types; + gva_t gva; + struct x86_exception e; + int vpid; + + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_VPID) || + !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* according to the intel vmx instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, + sizeof(u32), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_VPID_EXTENT_ALL_CONTEXT: + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + nested_vmx_succeed(vcpu); + break; + default: + /* Trap single context invalidation invvpid calls */ + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); return 1; } @@ -7249,6 +7453,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } +static int handle_pcommit(struct kvm_vcpu *vcpu) +{ + /* we never catch pcommit instruct for L1 guest. */ + WARN_ON(1); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -7292,12 +7503,14 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, + [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, [EXIT_REASON_INVVPID] = handle_invvpid, [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_PCOMMIT] = handle_pcommit, }; static const int kvm_vmx_max_exit_handlers = @@ -7333,7 +7546,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, bitmap += (port & 0x7fff) / 8; if (last_bitmap != bitmap) - if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) + if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) return true; if (b & (1 << (port & 7))) return true; @@ -7377,7 +7590,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, /* Then read the msr_index'th bit from this bitmap: */ if (msr_index < 1024*8) { unsigned char b; - if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) + if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) return true; return 1 & (b >> (msr_index & 7)); } else @@ -7552,6 +7765,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return true; case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); + case EXIT_REASON_MONITOR_TRAP_FLAG: + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); case EXIT_REASON_MONITOR_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); case EXIT_REASON_PAUSE_INSTRUCTION: @@ -7597,6 +7812,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. 
*/ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + case EXIT_REASON_PCOMMIT: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); default: return true; } @@ -7608,10 +7825,9 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static int vmx_enable_pml(struct vcpu_vmx *vmx) +static int vmx_create_pml_buffer(struct vcpu_vmx *vmx) { struct page *pml_pg; - u32 exec_control; pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!pml_pg) @@ -7622,29 +7838,20 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - return 0; } -static void vmx_disable_pml(struct vcpu_vmx *vmx) +static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { - u32 exec_control; - - ASSERT(vmx->pml_pg); - __free_page(vmx->pml_pg); - vmx->pml_pg = NULL; - - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + if (vmx->pml_pg) { + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + } } -static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) +static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vmx->vcpu.kvm; + struct vcpu_vmx *vmx = to_vmx(vcpu); u64 *pml_buf; u16 pml_idx; @@ -7666,7 +7873,7 @@ static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) gpa = pml_buf[pml_idx]; WARN_ON(gpa & (PAGE_SIZE - 1)); - mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); } /* reset PML index */ @@ -7691,6 +7898,161 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) kvm_vcpu_kick(vcpu); } +static void vmx_dump_sel(char *name, uint32_t sel) +{ + pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(sel), + vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), + vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), + vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); +} + +static void vmx_dump_dtsel(char *name, uint32_t limit) +{ + pr_err("%s limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(limit), + vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); +} + +static void dump_vmcs(void) +{ + u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); + u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); + u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + u32 secondary_exec_control = 0; + unsigned long cr4 = vmcs_readl(GUEST_CR4); + u64 efer = vmcs_readl(GUEST_IA32_EFER); + int i, n; + + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + pr_err("*** Guest State ***\n"); + pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), + vmcs_readl(CR0_GUEST_HOST_MASK)); + pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); + pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && + (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) + { + pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n", + vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1)); + pr_err("PDPTR2 = 0x%016lx PDPTR3 = 
0x%016lx\n", + vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3)); + } + pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", + vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); + pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", + vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(GUEST_SYSENTER_ESP), + vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); + vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); + vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); + vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); + vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); + vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); + vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); + vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); + vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); + vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); + vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); + if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || + (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) + pr_err("EFER = 0x%016llx PAT = 0x%016lx\n", + efer, vmcs_readl(GUEST_IA32_PAT)); + pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n", + vmcs_readl(GUEST_IA32_DEBUGCTL), + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016lx\n", + vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL)); + if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) + pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS)); + pr_err("Interruptibility = %08x ActivityState = %08x\n", + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), + vmcs_read32(GUEST_ACTIVITY_STATE)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + pr_err("InterruptStatus = %04x\n", + vmcs_read16(GUEST_INTR_STATUS)); + + pr_err("*** Host State ***\n"); + pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", + vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); + pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", + vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), + vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), + vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), + vmcs_read16(HOST_TR_SELECTOR)); + pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", + vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), + vmcs_readl(HOST_TR_BASE)); + pr_err("GDTBase=%016lx IDTBase=%016lx\n", + vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); + pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", + vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), + vmcs_readl(HOST_CR4)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(HOST_IA32_SYSENTER_ESP), + vmcs_read32(HOST_IA32_SYSENTER_CS), + vmcs_readl(HOST_IA32_SYSENTER_EIP)); + if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) + pr_err("EFER = 0x%016lx PAT = 0x%016lx\n", + vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016lx\n", + vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL)); + + pr_err("*** Control State ***\n"); + pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", + pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); + pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", + vmcs_read32(EXCEPTION_BITMAP), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); + pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + 
vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), + vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); + pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); + pr_err(" reason=%08x qualification=%016lx\n", + vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); + pr_err("IDTVectoring: info=%08x errcode=%08x\n", + vmcs_read32(IDT_VECTORING_INFO_FIELD), + vmcs_read32(IDT_VECTORING_ERROR_CODE)); + pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET)); + if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) + pr_err("TSC Multiplier = 0x%016lx\n", + vmcs_readl(TSC_MULTIPLIER)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) + pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) + pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) + pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER)); + n = vmcs_read32(CR3_TARGET_COUNT); + for (i = 0; i + 1 < n; i += 4) + pr_err("CR3 target%u=%016lx target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), + i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); + if (i < n) + pr_err("CR3 target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); + if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) + pr_err("PLE Gap=%08x Window=%08x\n", + vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); + if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) + pr_err("Virtual processor ID = 0x%04x\n", + vmcs_read16(VIRTUAL_PROCESSOR_ID)); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7701,6 +8063,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; + trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); + /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before @@ -7709,7 +8073,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) * flushed already. 
*/ if (enable_pml) - vmx_flush_pml_buffer(vmx); + vmx_flush_pml_buffer(vcpu); /* If guest state is invalid, start emulating */ if (vmx->emulation_required) @@ -7723,6 +8087,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; @@ -7810,10 +8175,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) * apicv */ if (!cpu_has_vmx_virtualize_x2apic_mode() || - !vmx_vm_has_apicv(vcpu->kvm)) + !vmx_cpu_uses_apicv(vcpu)) return; - if (!vm_need_tpr_shadow(vcpu->kvm)) + if (!cpu_need_tpr_shadow(vcpu)) return; sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); @@ -7915,9 +8280,10 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu) { - if (!vmx_vm_has_apicv(vcpu->kvm)) + u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap; + if (!vmx_cpu_uses_apicv(vcpu)) return; vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); @@ -7996,6 +8362,11 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_has_high_real_mode_segbase(void) +{ + return enable_unrestricted_guest || emulate_invalid_guest_state; +} + static bool vmx_mpx_supported(void) { return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && @@ -8320,7 +8691,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); /* * the KVM_REQ_EVENT optimization bit is only on for one entry, and if @@ -8358,8 +8728,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (enable_pml) - vmx_disable_pml(vmx); - free_vpid(vmx); + vmx_destroy_pml_buffer(vmx); + free_vpid(vmx->vpid); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); free_nested(vmx); @@ -8378,7 +8748,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); - allocate_vpid(vmx); + vmx->vpid = allocate_vpid(); err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -8411,7 +8781,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) put_cpu(); if (err) goto free_vmcs; - if (vm_need_virtualize_apic_accesses(kvm)) { + if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; @@ -8426,8 +8796,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } - if (nested) + if (nested) { nested_vmx_setup_ctls_msrs(vmx); + vmx->nested.vpid02 = allocate_vpid(); + } vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -8440,7 +8812,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * for the guest, etc. 
*/ if (enable_pml) { - err = vmx_enable_pml(vmx); + err = vmx_create_pml_buffer(vmx); if (err) goto free_vmcs; } @@ -8448,13 +8820,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: + free_vpid(vmx->nested.vpid02); free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: - free_vpid(vmx); + free_vpid(vmx->vpid); kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } @@ -8480,7 +8853,8 @@ static int get_ept_level(void) static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { - u64 ret; + u8 cache; + u64 ipat = 0; /* For VT-d and EPT combination * 1. MMIO: always map as UC @@ -8493,16 +8867,30 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ - if (is_mmio) - ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) - ret = kvm_get_guest_memory_type(vcpu, gfn) << - VMX_EPT_MT_EPTE_SHIFT; - else - ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) - | VMX_EPT_IPAT_BIT; + if (is_mmio) { + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } - return ret; + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + ipat = VMX_EPT_IPAT_BIT; + cache = MTRR_TYPE_WRBACK; + goto exit; + } + + if (kvm_read_cr0(vcpu) & X86_CR0_CD) { + ipat = VMX_EPT_IPAT_BIT; + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + cache = MTRR_TYPE_WRBACK; + else + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } + + cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); + +exit: + return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } static int vmx_get_lpage_level(void) @@ -8514,49 +8902,68 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } +static void vmcs_set_secondary_exec_control(u32 new_ctl) +{ + /* + * These bits in the secondary execution controls field + * are dynamic, the others are mostly based on the hypervisor + * architecture and the guest's CPUID. Do not touch the + * dynamic bits. 
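The rewritten vmx_get_mt_mask above replaces the old nested ternaries with an explicit decision ladder: MMIO is always UC, guests without non-coherent DMA get WB with IPAT set, CR0.CD forces UC unless the CD/NW quirk is enabled, and only the remaining case consults the guest MTRRs. A compact, self-contained restatement of that ladder; the constant values below match the usual x86/VMX encodings but should be treated as assumptions here.

#include <stdint.h>
#include <stdbool.h>

#define MTRR_TYPE_UNCACHABLE   0
#define MTRR_TYPE_WRBACK       6
#define VMX_EPT_MT_EPTE_SHIFT  3           /* memory-type bits 5:3 of an EPT PTE */
#define VMX_EPT_IPAT_BIT       (1ull << 6) /* "ignore PAT" bit                   */

/* Simplified model of the new vmx_get_mt_mask() decision ladder. */
static uint64_t ept_memtype(bool is_mmio, bool noncoherent_dma,
			    bool cr0_cd, bool cd_nw_quirk,
			    uint8_t guest_mtrr_type)
{
	uint64_t ipat = 0;
	uint8_t cache;

	if (is_mmio) {
		cache = MTRR_TYPE_UNCACHABLE;           /* MMIO: always UC */
	} else if (!noncoherent_dma) {
		ipat = VMX_EPT_IPAT_BIT;                /* no snooping concern: force WB */
		cache = MTRR_TYPE_WRBACK;
	} else if (cr0_cd) {
		ipat = VMX_EPT_IPAT_BIT;                /* guest disabled caching */
		cache = cd_nw_quirk ? MTRR_TYPE_WRBACK : MTRR_TYPE_UNCACHABLE;
	} else {
		cache = guest_mtrr_type;                /* honour the guest's MTRRs */
	}

	return ((uint64_t)cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
}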
+ */ + u32 mask = + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + + u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + (new_ctl & ~mask) | (cur_ctl & mask)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; + u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); - vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - if (exec_control & SECONDARY_EXEC_RDTSCP) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu); + if (!rdtscp_enabled) + secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; + + if (nested) { + if (rdtscp_enabled) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_RDTSCP; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDTSCP; } - if (nested && !vmx->rdtscp_enabled) - vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; } /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && - best && (best->ebx & bit(X86_FEATURE_INVPCID)) && - guest_cpuid_has_pcid(vcpu)) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } else { - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || + !guest_cpuid_has_pcid(vcpu))) { + secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID; + if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } + + if (cpu_has_secondary_exec_ctrls()) + vmcs_set_secondary_exec_control(secondary_exec_ctl); + + if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { + if (guest_cpuid_has_pcommit(vcpu)) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_PCOMMIT; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_PCOMMIT; + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -8924,7 +9331,7 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { /* x2APIC MSR accesses are not allowed */ - if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) + if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) return -EINVAL; if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ e->index == MSR_IA32_UCODE_REV) @@ -8966,8 +9373,8 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr.host_initiated = false; for (i = 0; i < count; i++) { - if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), - &e, sizeof(e))) { + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), + &e, sizeof(e))) { pr_warn_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); @@ -8999,9 +9406,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) struct vmx_msr_entry e; for (i = 0; i < count; i++) { - if (kvm_read_guest(vcpu->kvm, - gpa + i * 
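vmcs_set_secondary_exec_control above leaves the "dynamic" bits (shadow VMCS, x2APIC virtualization, APIC-access virtualization) at whatever value they currently hold in the VMCS and only overwrites the rest. The update is a plain two-mask merge; here is a minimal model of that read-modify-write, with made-up bit positions standing in for the real SECONDARY_EXEC_* constants.

#include <stdint.h>
#include <stdio.h>

/* Placeholder bit positions; the real SECONDARY_EXEC_* values differ. */
#define EXEC_SHADOW_VMCS        (1u << 14)
#define EXEC_VIRT_X2APIC        (1u << 4)
#define EXEC_VIRT_APIC_ACCESS   (1u << 0)

static uint32_t merge_exec_control(uint32_t cur_ctl, uint32_t new_ctl)
{
	/* Bits toggled at runtime; leave them exactly as they are now. */
	const uint32_t dynamic = EXEC_SHADOW_VMCS | EXEC_VIRT_X2APIC |
				 EXEC_VIRT_APIC_ACCESS;

	return (new_ctl & ~dynamic) | (cur_ctl & dynamic);
}

int main(void)
{
	uint32_t cur = EXEC_VIRT_X2APIC | (1u << 20);   /* x2APIC on, one static bit on */
	uint32_t new = (1u << 3);                       /* recomputed static bits only  */

	/* x2APIC virtualization survives even though "new" does not mention it. */
	printf("%#x\n", merge_exec_control(cur, new));  /* prints 0x18 */
	return 0;
}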
sizeof(e), - &e, 2 * sizeof(u32))) { + struct msr_data msr_info; + if (kvm_vcpu_read_guest(vcpu, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { pr_warn_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); @@ -9013,19 +9421,21 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); return -EINVAL; } - if (kvm_get_msr(vcpu, e.index, &e.value)) { + msr_info.host_initiated = false; + msr_info.index = e.index; + if (kvm_get_msr(vcpu, &msr_info)) { pr_warn_ratelimited( "%s cannot read MSR (%u, 0x%x)\n", __func__, i, e.index); return -EINVAL; } - if (kvm_write_guest(vcpu->kvm, - gpa + i * sizeof(e) + - offsetof(struct vmx_msr_entry, value), - &e.value, sizeof(e.value))) { + if (kvm_vcpu_write_guest(vcpu, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &msr_info.data, sizeof(msr_info.data))) { pr_warn_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", - __func__, i, e.index, e.value); + __func__, i, e.index, msr_info.data); return -EINVAL; } } @@ -9161,13 +9571,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx_secondary_exec_control(vmx); - if (!vmx->rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; + /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_PCOMMIT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -9186,7 +9596,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && - (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { + cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_vcpu_reload_apic_access_page(vcpu); @@ -9296,12 +9706,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (enable_vpid) { /* - * Trivially support vpid by letting L2s share their parent - * L1's vpid. TODO: move to a more elaborate solution, giving - * each L2 its own vpid and exposing the vpid feature to L1. + * There is no direct mapping between vpid02 and vpid12, the + * vpid02 is per-vCPU for L0 and reused while the value of + * vpid12 is changed w/ one invvpid during nested vmentry. + * The vpid12 is allocated by L1 for L2, so it will not + * influence global bitmap(for vpid01 and vpid02 allocation) + * even if spawn a lot of nested vCPUs. */ - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + } + } else { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx_flush_tlb(vcpu); + } + } if (nested_cpu_has_ept(vmcs12)) { @@ -10141,6 +10563,201 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +/* + * This routine does the following things for vCPU which is going + * to be blocked if VT-d PI is enabled. 
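For nested guests the prepare_vmcs02 hunk above gives L0 a single per-vCPU vpid02 and only flushes the TLB when L1 switches vmcs12's virtual_processor_id, tracked in nested.last_vpid; there is deliberately no 1:1 mapping between vpid12 values and host VPIDs. A small sketch of that "flush only when the guest-visible tag changes" bookkeeping (names are illustrative, not the kernel's).

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct nested_state {
	uint16_t vpid02;      /* host VPID reserved for "the L2 currently running" */
	uint16_t last_vpid;   /* last vmcs12.virtual_processor_id we saw           */
};

/* Returns true when the hardware TLB entries tagged with vpid02 must be flushed. */
static bool nested_vmentry_vpid(struct nested_state *n, uint16_t vpid12)
{
	if (vpid12 != n->last_vpid) {
		n->last_vpid = vpid12;
		return true;          /* L1 switched L2s: vpid02 translations are stale */
	}
	return false;                 /* same L2 as before: keep the cached TLB */
}

int main(void)
{
	struct nested_state n = { .vpid02 = 1, .last_vpid = 0 };

	printf("%d\n", nested_vmentry_vpid(&n, 5));   /* 1: first entry flushes    */
	printf("%d\n", nested_vmentry_vpid(&n, 5));   /* 0: same vpid12, no flush  */
	printf("%d\n", nested_vmentry_vpid(&n, 7));   /* 1: L1 changed the tag     */
	return 0;
}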
+ * - Store the vCPU to the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it, in + * this case, return 1, otherwise, return 0. + * + */ +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + vcpu->pre_pcpu = vcpu->cpu; + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + + do { + old.control = new.control = pi_desc->control; + + /* + * We should not block the vCPU if + * an interrupt is posted for it. + */ + if (pi_test_on(pi_desc) == 1) { + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + + return 1; + } + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since vCPU can be preempted during this process, + * vcpu->cpu could be different with pre_pcpu, we + * need to set pre_pcpu as the destination of wakeup + * notification event, then we can find the right vCPU + * to wakeup in wakeup handler if interrupts happen + * when the vCPU is in blocked state. 
+ */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + return 0; +} + +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + unsigned long flags; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + if(vcpu->pre_pcpu != -1) { + spin_lock_irqsave( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + } +} + +/* + * vmx_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. 
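Both vmx_pre_block and vmx_post_block update the posted-interrupt descriptor with a cmpxchg retry loop so the NDST/NV rewrite cannot race with a device posting an interrupt, which sets ON in the same 64-bit control word. The same lock-free read-modify-write pattern, reduced to standalone C11 atomics; the bit offsets and vector number below are assumptions chosen for illustration, not the exact hardware descriptor layout.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define PI_ON           (1ull << 0)    /* "outstanding notification": interrupt posted */
#define PI_NV_SHIFT     16             /* notification vector byte                     */
#define PI_NDST_SHIFT   32             /* notification destination (APIC ID)           */
#define WAKEUP_VECTOR   0xf1           /* assumed vector number                        */

struct pi_desc {
	_Atomic uint64_t control;      /* packs ON, SN, NV and NDST in one word */
};

/* Rewrite NV/NDST atomically; bail out if an interrupt was posted meanwhile. */
static bool pi_switch_to_wakeup(struct pi_desc *pi, uint64_t dest_apic_id)
{
	uint64_t old, new;

	do {
		old = atomic_load(&pi->control);
		if (old & PI_ON)
			return false;  /* pending interrupt: the vCPU cannot block */

		new = old;
		new &= ~(0xffull << PI_NV_SHIFT);
		new |= (uint64_t)WAKEUP_VECTOR << PI_NV_SHIFT;   /* NV <- wakeup vector      */
		new &= ~(0xffffffffull << PI_NDST_SHIFT);
		new |= dest_apic_id << PI_NDST_SHIFT;            /* NDST <- pre_pcpu APIC ID */
	} while (!atomic_compare_exchange_weak(&pi->control, &old, new));

	return true;
}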
+ */ + + kvm_set_msi_irq(e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) + continue; + + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else { + /* suppress notification event before unposting */ + pi_set_sn(vcpu_to_pi_desc(vcpu)); + ret = irq_set_vcpu_affinity(host_irq, NULL); + pi_clear_sn(vcpu_to_pi_desc(vcpu)); + } + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -10150,6 +10767,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .cpu_has_accelerated_tpr = report_flexpriority, + .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, @@ -10159,7 +10777,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .update_db_bp_intercept = update_exception_bitmap, + .update_bp_intercept = update_exception_bitmap, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, @@ -10209,7 +10827,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .vm_has_apicv = vmx_vm_has_apicv, + .cpu_uses_apicv = vmx_cpu_uses_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, @@ -10233,11 +10851,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .set_tsc_khz = vmx_set_tsc_khz, .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, - .adjust_tsc_offset = vmx_adjust_tsc_offset, - .compute_tsc_offset = vmx_compute_tsc_offset, + .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, .read_l1_tsc = vmx_read_l1_tsc, .set_tdp_cr3 = vmx_set_cr3, @@ -10255,6 +10871,13 @@ static struct kvm_x86_ops vmx_x86_ops = { .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + + .pre_block = vmx_pre_block, + .post_block = vmx_post_block, + + .pmu_ops = &intel_pmu_ops, + + .update_pi_irte = vmx_update_pi_irte, }; static int __init vmx_init(void) @@ -10264,7 +10887,7 @@ static int __init vmx_init(void) if (r) return r; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); #endif @@ -10274,7 +10897,7 @@ static int __init vmx_init(void) static void __exit vmx_exit(void) { -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); #endif diff --git a/kernel/arch/x86/kvm/x86.c b/kernel/arch/x86/kvm/x86.c index 37d79a026..27419ba5a 100644 --- a/kernel/arch/x86/kvm/x86.c +++ b/kernel/arch/x86/kvm/x86.c @@ -28,6 +28,8 @@ #include "x86.h" #include "cpuid.h" #include "assigned-dev.h" +#include "pmu.h" +#include "hyperv.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -49,6 +51,8 @@ #include 
<linux/pci.h> #include <linux/timekeeper_internal.h> #include <linux/pvclock_gtod.h> +#include <linux/kvm_irqfd.h> +#include <linux/irqbypass.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -57,13 +61,12 @@ #include <asm/debugreg.h> #include <asm/msr.h> #include <asm/desc.h> -#include <asm/mtrr.h> #include <asm/mce.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> /* Ugh! */ -#include <asm/xcr.h> +#include <linux/kernel_stat.h> +#include <asm/fpu/internal.h> /* Ugh! */ #include <asm/pvclock.h> #include <asm/div64.h> +#include <asm/irq_remapping.h> #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 @@ -90,29 +93,37 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); -struct kvm_x86_ops *kvm_x86_ops; +struct kvm_x86_ops *kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); -static bool ignore_msrs = 0; +static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); unsigned int min_timer_period_us = 500; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); -bool kvm_has_tsc_control; +static bool __read_mostly kvmclock_periodic_sync = true; +module_param(kvmclock_periodic_sync, bool, S_IRUGO); + +bool __read_mostly kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 kvm_max_guest_tsc_khz; +u32 __read_mostly kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); +u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; +EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); +u64 __read_mostly kvm_max_tsc_scaling_ratio; +EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); +static u64 __read_mostly kvm_default_tsc_scaling_ratio; /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ -static u32 tsc_tolerance_ppm = 250; +static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* lapic timer advance (tscdeadline mode only) in nanoseconds */ -unsigned int lapic_timer_advance_ns = 0; +unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); -static bool backwards_tsc_observed = false; +static bool __read_mostly backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -146,6 +157,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "nmi_window", VCPU_STAT(nmi_window_exits) }, { "halt_exits", VCPU_STAT(halt_exits) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, + { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, @@ -219,11 +231,9 @@ static void shared_msr_update(unsigned slot, u32 msr) void kvm_define_shared_msr(unsigned slot, u32 msr) { BUG_ON(slot >= KVM_NR_SHARED_MSRS); + shared_msrs_global.msrs[slot] = msr; if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot] = msr; - /* we need ensured the shared_msr_global have been updated */ - smp_wmb(); } EXPORT_SYMBOL_GPL(kvm_define_shared_msr); @@ -475,7 +485,7 @@ EXPORT_SYMBOL_GPL(kvm_require_dr); /* * This function will be used to read from the physical memory of the currently - * running guest. The difference to kvm_read_guest_page is that this function + * running guest. 
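The x86.c hunk above converts most module-scope knobs to __read_mostly and adds kvmclock_periodic_sync as a boolean module parameter readable at runtime. As a reminder of the pattern, a minimal hypothetical module showing the same __read_mostly plus module_param(..., S_IRUGO) combination; this is a toy module for illustration, not KVM code.

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/stat.h>

/* Mostly read after load: keep it out of frequently written cache lines. */
static bool __read_mostly periodic_sync = true;
module_param(periodic_sync, bool, S_IRUGO);   /* visible in sysfs, read-only */
MODULE_PARM_DESC(periodic_sync, "toy equivalent of kvmclock_periodic_sync");

static int __init toy_init(void)
{
	pr_info("toy: periodic_sync=%d\n", periodic_sync);
	return 0;
}

static void __exit toy_exit(void)
{
}

module_init(toy_init);
module_exit(toy_exit);
MODULE_LICENSE("GPL");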
The difference to kvm_vcpu_read_guest_page is that this function * can read from guest physical or from the guest's guest physical memory. */ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, @@ -493,7 +503,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, real_gfn = gpa_to_gfn(real_gfn); - return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); + return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); @@ -524,7 +534,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if (is_present_gpte(pdpte[i]) && - (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { + (pdpte[i] & + vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) { ret = 0; goto out; } @@ -572,8 +583,7 @@ out: int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | - X86_CR0_CD | X86_CR0_NW; + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -619,6 +629,12 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); + + if (((cr0 ^ old_cr0) & X86_CR0_CD) && + kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr0); @@ -657,9 +673,9 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) return 1; - if (!(xcr0 & XSTATE_FP)) + if (!(xcr0 & XFEATURE_MASK_FP)) return 1; - if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) + if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE)) return 1; /* @@ -667,23 +683,24 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) * saving. However, xcr0 bit 0 is always set, even if the * emulated CPU does not support XSAVE (see fx_init). 
*/ - valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP; + valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) return 1; - if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR))) + if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) != + (!(xcr0 & XFEATURE_MASK_BNDCSR))) return 1; - if (xcr0 & XSTATE_AVX512) { - if (!(xcr0 & XSTATE_YMM)) + if (xcr0 & XFEATURE_MASK_AVX512) { + if (!(xcr0 & XFEATURE_MASK_YMM)) return 1; - if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512) + if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } kvm_put_guest_xcr0(vcpu); vcpu->arch.xcr0 = xcr0; - if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK) + if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) kvm_update_cpuid(vcpu); return 0; } @@ -782,7 +799,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) return 1; - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; @@ -792,7 +809,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return kvm_lapic_get_cr8(vcpu); else return vcpu->arch.cr8; @@ -908,7 +925,7 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu) u64 data; int err; - err = kvm_pmu_read_pmc(vcpu, ecx, &data); + err = kvm_pmu_rdpmc(vcpu, ecx, &data); if (err) return err; kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); @@ -923,36 +940,45 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * * This list is modified at module load time to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in the beginning of the list. + * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs + * may depend on host virtualization features rather than host cpu features. 
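__kvm_set_xcr above now validates the guest's XCR0 with the XFEATURE_MASK_* names: FP must always be set, YMM requires SSE, BNDREGS and BNDCSR must be enabled together, and the three AVX-512 components require YMM and must be enabled as a group. A standalone restatement of those rules; the bit positions follow the usual XSAVE layout but are written out here as assumptions.

#include <stdbool.h>
#include <stdint.h>

#define XF_FP        (1ull << 0)
#define XF_SSE       (1ull << 1)
#define XF_YMM       (1ull << 2)
#define XF_BNDREGS   (1ull << 3)
#define XF_BNDCSR    (1ull << 4)
#define XF_OPMASK    (1ull << 5)
#define XF_ZMM_HI256 (1ull << 6)
#define XF_HI16_ZMM  (1ull << 7)
#define XF_AVX512    (XF_OPMASK | XF_ZMM_HI256 | XF_HI16_ZMM)

/* Mirror of the consistency checks applied before accepting a new XCR0. */
static bool xcr0_valid(uint64_t xcr0, uint64_t guest_supported)
{
	if (!(xcr0 & XF_FP))
		return false;                   /* x87 state can never be disabled */
	if ((xcr0 & XF_YMM) && !(xcr0 & XF_SSE))
		return false;                   /* AVX depends on SSE state        */
	if (xcr0 & ~(guest_supported | XF_FP))
		return false;                   /* bits CPUID never advertised     */
	if (!(xcr0 & XF_BNDREGS) != !(xcr0 & XF_BNDCSR))
		return false;                   /* MPX states come as a pair       */
	if (xcr0 & XF_AVX512) {
		if (!(xcr0 & XF_YMM))
			return false;           /* AVX-512 builds on AVX           */
		if ((xcr0 & XF_AVX512) != XF_AVX512)
			return false;           /* all three components or none    */
	}
	return true;
}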
*/ -#define KVM_SAVE_MSRS_BEGIN 12 static u32 msrs_to_save[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, }; static unsigned num_msrs_to_save; -static const u32 emulated_msrs[] = { +static u32 emulated_msrs[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, + MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, + MSR_IA32_SMBASE, }; +static unsigned num_emulated_msrs; + bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) @@ -1046,6 +1072,21 @@ EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention */ +static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + struct msr_data msr; + int r; + + msr.index = index; + msr.host_initiated = true; + r = kvm_get_msr(vcpu, &msr); + if (r) + return r; + + *data = msr.data; + return 0; +} + static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { struct msr_data msr; @@ -1193,11 +1234,6 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, __func__, base_khz, scaled_khz, shift, *pmultiplier); } -static inline u64 get_kernel_ns(void) -{ - return ktime_get_boot_ns(); -} - #ifdef CONFIG_X86_64 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif @@ -1218,14 +1254,53 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm) return v; } -static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) +static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) +{ + u64 ratio; + + /* Guest TSC same frequency as host TSC? */ + if (!scale) { + vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + return 0; + } + + /* TSC scaling supported? 
*/ + if (!kvm_has_tsc_control) { + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + return 0; + } else { + WARN(1, "user requested TSC rate below hardware speed\n"); + return -1; + } + } + + /* TSC scaling required - calculate ratio */ + ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + user_tsc_khz, tsc_khz); + + if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); + return -1; + } + + vcpu->arch.tsc_scaling_ratio = ratio; + return 0; +} + +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { u32 thresh_lo, thresh_hi; int use_scaling = 0; /* tsc_khz can be zero if TSC calibration fails */ - if (this_tsc_khz == 0) - return; + if (this_tsc_khz == 0) { + /* set tsc_scaling_ratio to a safe value */ + vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + return -1; + } /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, @@ -1245,7 +1320,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } - kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); + return set_tsc_khz(vcpu, this_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) @@ -1291,6 +1366,48 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } +/* + * Multiply tsc by a fixed point number represented by ratio. + * + * The most significant 64-N bits (mult) of ratio represent the + * integral part of the fixed point number; the remaining N bits + * (frac) represent the fractional part, ie. ratio represents a fixed + * point number (mult + frac * 2^(-N)). + * + * N equals to kvm_tsc_scaling_ratio_frac_bits. 
+ */ +static inline u64 __scale_tsc(u64 ratio, u64 tsc) +{ + return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); +} + +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +{ + u64 _tsc = tsc; + u64 ratio = vcpu->arch.tsc_scaling_ratio; + + if (ratio != kvm_default_tsc_scaling_ratio) + _tsc = __scale_tsc(ratio, tsc); + + return _tsc; +} +EXPORT_SYMBOL_GPL(kvm_scale_tsc); + +static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + u64 tsc; + + tsc = kvm_scale_tsc(vcpu, rdtsc()); + + return target_tsc - tsc; +} + +u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) +{ + return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc)); +} +EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1302,7 +1419,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); + offset = kvm_compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -1359,7 +1476,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; - offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); + offset = kvm_compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; @@ -1416,24 +1533,26 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) EXPORT_SYMBOL_GPL(kvm_write_tsc); -#ifdef CONFIG_X86_64 +static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, + s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); +} -static cycle_t read_tsc(void) +static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - cycle_t ret; - u64 last; + if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + WARN_ON(adjustment < 0); + adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); + kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); +} - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. 
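kvm_scale_tsc above treats tsc_scaling_ratio as an unsigned fixed-point number with kvm_tsc_scaling_ratio_frac_bits fractional bits, so scaling is a 64x64 to 128-bit multiply followed by a right shift, and set_tsc_khz builds the ratio as (2^frac_bits * user_khz) / host_khz. A self-contained model using a 128-bit intermediate; 48 fractional bits is assumed here, the real width is vendor-dependent.

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS 48   /* assumed; VMX and SVM expose different widths */

/* ratio = guest_khz / host_khz in FRAC_BITS fixed point */
static uint64_t make_ratio(uint64_t guest_khz, uint64_t host_khz)
{
	return (uint64_t)(((unsigned __int128)guest_khz << FRAC_BITS) / host_khz);
}

/* scaled = (tsc * ratio) >> FRAC_BITS, i.e. what mul_u64_u64_shr() does */
static uint64_t scale_tsc(uint64_t tsc, uint64_t ratio)
{
	return (uint64_t)(((unsigned __int128)tsc * ratio) >> FRAC_BITS);
}

int main(void)
{
	/* e.g. a 3.0 GHz host presenting a 2.5 GHz TSC to the guest */
	uint64_t ratio = make_ratio(2500000, 3000000);

	printf("host cycles 3000000000 -> guest cycles %llu\n",
	       (unsigned long long)scale_tsc(3000000000ull, ratio));
	return 0;
}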
- */ - rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); +#ifdef CONFIG_X86_64 - last = pvclock_gtod_data.clock.cycle_last; +static cycle_t read_tsc(void) +{ + cycle_t ret = (cycle_t)rdtsc_ordered(); + u64 last = pvclock_gtod_data.clock.cycle_last; if (likely(ret >= last)) return ret; @@ -1589,7 +1708,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags, this_tsc_khz; + unsigned long flags, this_tsc_khz, tgt_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -1622,11 +1741,11 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 1; } if (!use_master_clock) { - host_tsc = native_read_tsc(); + host_tsc = rdtsc(); kernel_ns = get_kernel_ns(); } - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); + tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); /* * We may have to catch up the TSC to match elapsed wall clock @@ -1652,7 +1771,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 0; if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, + tgt_tsc_khz = kvm_has_tsc_control ? + vcpu->virtual_tsc_khz : this_tsc_khz; + kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz, &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); vcpu->hw_tsc_khz = this_tsc_khz; @@ -1768,127 +1889,14 @@ static void kvmclock_sync_fn(struct work_struct *work) kvmclock_sync_work); struct kvm *kvm = container_of(ka, struct kvm, arch); + if (!kvmclock_periodic_sync) + return; + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); } -static bool msr_mtrr_valid(unsigned msr) -{ - switch (msr) { - case 0x200 ... 
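The pvclock read path above drops the old rdtsc_barrier()/vget_cycles() pair in favour of rdtsc_ordered() and keeps the existing clamp against clock.cycle_last, so a TSC read that lands behind the last update of the gtod copy can never make the derived clock appear to run backwards. The clamp on its own, shown with plain integers and with the surrounding seqcount/gtod bookkeeping omitted:

#include <stdint.h>

/*
 * "now" is the freshly read, serialized TSC value; "cycle_last" is the value
 * sampled when the pvclock/gtod copy was last updated. Returning cycle_last
 * for stragglers keeps the clock monotonic across CPUs.
 */
static uint64_t clamp_cycles(uint64_t now, uint64_t cycle_last)
{
	return now >= cycle_last ? now : cycle_last;
}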
0x200 + 2 * KVM_NR_VAR_MTRR - 1: - case MSR_MTRRfix64K_00000: - case MSR_MTRRfix16K_80000: - case MSR_MTRRfix16K_A0000: - case MSR_MTRRfix4K_C0000: - case MSR_MTRRfix4K_C8000: - case MSR_MTRRfix4K_D0000: - case MSR_MTRRfix4K_D8000: - case MSR_MTRRfix4K_E0000: - case MSR_MTRRfix4K_E8000: - case MSR_MTRRfix4K_F0000: - case MSR_MTRRfix4K_F8000: - case MSR_MTRRdefType: - case MSR_IA32_CR_PAT: - return true; - case 0x2f8: - return true; - } - return false; -} - -static bool valid_pat_type(unsigned t) -{ - return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ -} - -static bool valid_mtrr_type(unsigned t) -{ - return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ -} - -bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - int i; - u64 mask; - - if (!msr_mtrr_valid(msr)) - return false; - - if (msr == MSR_IA32_CR_PAT) { - for (i = 0; i < 8; i++) - if (!valid_pat_type((data >> (i * 8)) & 0xff)) - return false; - return true; - } else if (msr == MSR_MTRRdefType) { - if (data & ~0xcff) - return false; - return valid_mtrr_type(data & 0xff); - } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { - for (i = 0; i < 8 ; i++) - if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) - return false; - return true; - } - - /* variable MTRRs */ - WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); - - mask = (~0ULL) << cpuid_maxphyaddr(vcpu); - if ((msr & 1) == 0) { - /* MTRR base */ - if (!valid_mtrr_type(data & 0xff)) - return false; - mask |= 0xf00; - } else - /* MTRR mask */ - mask |= 0x7ff; - if (data & mask) { - kvm_inject_gp(vcpu, 0); - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(kvm_mtrr_valid); - -static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - - if (!kvm_mtrr_valid(vcpu, msr, data)) - return 1; - - if (msr == MSR_MTRRdefType) { - vcpu->arch.mtrr_state.def_type = data; - vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; - } else if (msr == MSR_MTRRfix64K_00000) - p[0] = data; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - p[1 + msr - MSR_MTRRfix16K_80000] = data; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - p[3 + msr - MSR_MTRRfix4K_C0000] = data; - else if (msr == MSR_IA32_CR_PAT) - vcpu->arch.pat = data; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - u64 *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - if (!is_mtrr_mask) - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; - else - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; - *pt = data; - } - - kvm_mmu_reset_context(vcpu); - return 0; -} - static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 mcg_cap = vcpu->arch.mcg_cap; @@ -1947,7 +1955,7 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) r = PTR_ERR(page); goto out; } - if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) + if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) goto out_free; r = 0; out_free: @@ -1956,123 +1964,6 @@ out: return r; } -static bool kvm_hv_hypercall_enabled(struct kvm *kvm) -{ - return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; -} - -static bool kvm_hv_msr_partition_wide(u32 msr) -{ - bool r = false; - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - case HV_X64_MSR_HYPERCALL: - case HV_X64_MSR_REFERENCE_TSC: - case HV_X64_MSR_TIME_REF_COUNT: - r = true; - break; - } - - return r; -} - -static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - struct kvm *kvm 
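The large block removed here (msr_mtrr_valid, kvm_mtrr_valid, set_msr_mtrr) handled the MTRR MSR layout directly in x86.c; the callers now go through kvm_mtrr_set_msr/kvm_mtrr_get_msr instead. The encoding that code relied on for the variable-range MSRs starting at 0x200 is still worth keeping in mind: even offsets are PHYSBASE registers, odd offsets are PHYSMASK registers. A two-line decode matching the arithmetic in the removed code (the macro name is illustrative):

#include <stdint.h>

#define MTRR_PHYSBASE0 0x200   /* variable-range MTRRs: base/mask pairs from here */

/* Split an MSR number into (variable-range index, base-or-mask selector). */
static void decode_var_mtrr(uint32_t msr, int *idx, int *is_mask)
{
	*idx = (msr - MTRR_PHYSBASE0) / 2;       /* pair number        */
	*is_mask = (msr - MTRR_PHYSBASE0) & 1;   /* 0 = base, 1 = mask */
}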
= vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - kvm->arch.hv_guest_os_id = data; - /* setting guest os id to zero disables hypercall page */ - if (!kvm->arch.hv_guest_os_id) - kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; - break; - case HV_X64_MSR_HYPERCALL: { - u64 gfn; - unsigned long addr; - u8 instructions[4]; - - /* if guest os id is not set hypercall should remain disabled */ - if (!kvm->arch.hv_guest_os_id) - break; - if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { - kvm->arch.hv_hypercall = data; - break; - } - gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return 1; - kvm_x86_ops->patch_hypercall(vcpu, instructions); - ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (__copy_to_user((void __user *)addr, instructions, 4)) - return 1; - kvm->arch.hv_hypercall = data; - mark_page_dirty(kvm, gfn); - break; - } - case HV_X64_MSR_REFERENCE_TSC: { - u64 gfn; - HV_REFERENCE_TSC_PAGE tsc_ref; - memset(&tsc_ref, 0, sizeof(tsc_ref)); - kvm->arch.hv_tsc_page = data; - if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - break; - gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, - &tsc_ref, sizeof(tsc_ref))) - return 1; - mark_page_dirty(kvm, gfn); - break; - } - default: - vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); - return 1; - } - return 0; -} - -static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - switch (msr) { - case HV_X64_MSR_APIC_ASSIST_PAGE: { - u64 gfn; - unsigned long addr; - - if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { - vcpu->arch.hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0)) - return 1; - break; - } - gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(vcpu->kvm, gfn); - if (kvm_is_error_hva(addr)) - return 1; - if (__clear_user((void __user *)addr, PAGE_SIZE)) - return 1; - vcpu->arch.hv_vapic = data; - mark_page_dirty(vcpu->kvm, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) - return 1; - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); - default: - vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); - return 1; - } - - return 0; -} - static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; @@ -2117,6 +2008,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu) static void record_steal_time(struct kvm_vcpu *vcpu) { + accumulate_steal_time(vcpu); + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -2180,7 +2073,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) __func__, data); break; case 0x200 ... 0x2ff: - return set_msr_mtrr(vcpu, msr, data); + return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0x3ff: @@ -2200,6 +2093,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; + case MSR_IA32_SMBASE: + if (!msr_info->host_initiated) + return 1; + vcpu->arch.smbase = data; + break; case MSR_KVM_WALL_CLOCK_NEW: case MSR_KVM_WALL_CLOCK: vcpu->kvm->arch.wall_clock = data; @@ -2262,12 +2160,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & KVM_MSR_ENABLED)) break; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - - preempt_disable(); - accumulate_steal_time(vcpu); - preempt_enable(); - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; @@ -2281,37 +2173,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr, data); - /* Performance counters are not protected by a CPUID bit, - * so we should check all of them in the generic path for the sake of - * cross vendor migration. - * Writing a zero into the event select MSRs disables them, - * which we perfectly emulate ;-). Any other value should be at least - * reported, some guests depend on them. - */ - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - if (data != 0) - vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); - break; - /* at least RHEL 4 unconditionally writes to the perfctr registers, - * so we ignore writes to make it happy. - */ - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: - vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); - break; - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - pr = true; - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - if (kvm_pmu_msr(vcpu, msr)) + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: + case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: + pr = true; /* fall through */ + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: + case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) @@ -2329,15 +2196,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { - int r; - mutex_lock(&vcpu->kvm->lock); - r = set_msr_hyperv_pw(vcpu, msr, data); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return set_msr_hyperv(vcpu, msr, data); - break; + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_set_msr_common(vcpu, msr, data, + msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. @@ -2357,7 +2219,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); - if (kvm_pmu_msr(vcpu, msr)) + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", @@ -2379,48 +2241,12 @@ EXPORT_SYMBOL_GPL(kvm_set_msr_common); * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. 
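kvm_set_msr_common above collapses the per-register perfctr cases into case ranges that fall through to the PMU code, and gates MSR_IA32_SMBASE on host_initiated so only userspace (for example during migration) may write it. A sketch of that dispatch shape, case ranges plus an origin check, using invented register numbers:

#include <stdbool.h>
#include <stdint.h>

/* Invented register numbers purely for the dispatch example. */
#define MSR_PERFCTR_FIRST 0xc1
#define MSR_PERFCTR_LAST  0xc4
#define MSR_SMBASE        0x9e

struct msr_write {
	uint32_t index;
	uint64_t data;
	bool host_initiated;     /* true when written via the userspace ioctl path */
};

static int set_msr(struct msr_write *m, uint64_t *smbase, uint64_t *pmu_regs)
{
	switch (m->index) {
	case MSR_PERFCTR_FIRST ... MSR_PERFCTR_LAST:    /* GCC case-range extension */
		pmu_regs[m->index - MSR_PERFCTR_FIRST] = m->data;
		return 0;
	case MSR_SMBASE:
		if (!m->host_initiated)
			return 1;                        /* guests may not move SMBASE */
		*smbase = m->data;
		return 0;
	default:
		return 1;
	}
}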
*/ -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); + return kvm_x86_ops->get_msr(vcpu, msr); } EXPORT_SYMBOL_GPL(kvm_get_msr); -static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - - if (!msr_mtrr_valid(msr)) - return 1; - - if (msr == MSR_MTRRdefType) - *pdata = vcpu->arch.mtrr_state.def_type + - (vcpu->arch.mtrr_state.enabled << 10); - else if (msr == MSR_MTRRfix64K_00000) - *pdata = p[0]; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; - else if (msr == MSR_IA32_CR_PAT) - *pdata = vcpu->arch.pat; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - u64 *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - if (!is_mtrr_mask) - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; - else - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; - *pdata = *pt; - } - - return 0; -} - static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -2456,73 +2282,9 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - struct kvm *kvm = vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - data = kvm->arch.hv_guest_os_id; - break; - case HV_X64_MSR_HYPERCALL: - data = kvm->arch.hv_hypercall; - break; - case HV_X64_MSR_TIME_REF_COUNT: { - data = - div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); - break; - } - case HV_X64_MSR_REFERENCE_TSC: - data = kvm->arch.hv_tsc_page; - break; - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - - *pdata = data; - return 0; -} - -static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - - switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) { - if (v == vcpu) { - data = r; - break; - } - } - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); - case HV_X64_MSR_APIC_ASSIST_PAGE: - data = vcpu->arch.hv_vapic; - break; - default: - vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - *pdata = data; - return 0; -} - -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - u64 data; - - switch (msr) { + switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: @@ -2531,40 +2293,32 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: case MSR_K8_SYSCFG: + case MSR_K8_TSEG_ADDR: + case MSR_K8_TSEG_MASK: case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: case 
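The read side is converted the same way as the write side: kvm_get_msr and kvm_get_msr_common now take a struct msr_data (index in, data out, plus host_initiated) instead of a bare (index, u64 *) pair, and do_get_msr adapts the old msr_io() calling convention onto it. The shape of that adapter, restated outside the kernel; the struct and function names mirror the patch but the body is only a model.

#include <stdbool.h>
#include <stdint.h>

struct msr_data {
	bool     host_initiated;
	uint32_t index;
	uint64_t data;
};

/* New-style accessor: everything travels in one descriptor. */
static int get_msr(struct msr_data *msr)
{
	/* ... dispatch on msr->index, fill msr->data ... */
	msr->data = 0;
	return 0;
}

/* Old-style callers still pass (index, &value); wrap them like do_get_msr(). */
static int do_get_msr(uint32_t index, uint64_t *value)
{
	struct msr_data msr = {
		.index = index,
		.host_initiated = true,   /* msr_io() acts on behalf of userspace */
	};
	int r = get_msr(&msr);

	if (r)
		return r;
	*value = msr.data;
	return 0;
}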
MSR_AMD64_BU_CFG2: - data = 0; + msr_info->data = 0; break; - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_get_msr(vcpu, msr, pdata); - data = 0; + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: + case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); + msr_info->data = 0; break; case MSR_IA32_UCODE_REV: - data = 0x100000000ULL; + msr_info->data = 0x100000000ULL; break; case MSR_MTRRcap: - data = 0x500 | KVM_NR_VAR_MTRR; - break; case 0x200 ... 0x2ff: - return get_msr_mtrr(vcpu, msr, pdata); + return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ - data = 3; + msr_info->data = 3; break; /* * MSR_EBC_FREQUENCY_ID @@ -2578,48 +2332,53 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * multiplying by zero otherwise. */ case MSR_EBC_FREQUENCY_ID: - data = 1 << 24; + msr_info->data = 1 << 24; break; case MSR_IA32_APICBASE: - data = kvm_get_apic_base(vcpu); + msr_info->data = kvm_get_apic_base(vcpu); break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: - return kvm_x2apic_msr_read(vcpu, msr, pdata); + return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_TSCDEADLINE: - data = kvm_get_lapic_tscdeadline_msr(vcpu); + msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; case MSR_IA32_TSC_ADJUST: - data = (u64)vcpu->arch.ia32_tsc_adjust_msr; + msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; break; case MSR_IA32_MISC_ENABLE: - data = vcpu->arch.ia32_misc_enable_msr; + msr_info->data = vcpu->arch.ia32_misc_enable_msr; + break; + case MSR_IA32_SMBASE: + if (!msr_info->host_initiated) + return 1; + msr_info->data = vcpu->arch.smbase; break; case MSR_IA32_PERF_STATUS: /* TSC increment by tick */ - data = 1000ULL; + msr_info->data = 1000ULL; /* CPU multiplier */ - data |= (((uint64_t)4ULL) << 40); + msr_info->data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: - data = vcpu->arch.efer; + msr_info->data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: case MSR_KVM_WALL_CLOCK_NEW: - data = vcpu->kvm->arch.wall_clock; + msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: case MSR_KVM_SYSTEM_TIME_NEW: - data = vcpu->arch.time; + msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: - data = vcpu->arch.apf.msr_val; + msr_info->data = vcpu->arch.apf.msr_val; break; case MSR_KVM_STEAL_TIME: - data = vcpu->arch.st.msr_val; + msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: - data = vcpu->arch.pv_eoi.msr_val; + msr_info->data = vcpu->arch.pv_eoi.msr_val; break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: @@ -2627,7 +2386,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return get_msr_mce(vcpu, msr, pdata); + return get_msr_mce(vcpu, msr_info->index, &msr_info->data); case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -2638,17 +2397,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * type 6, model 8 and higher from exploding due to * the rdmsr failing. */ - data = 0x20000000; + msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... 
HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { - int r; - mutex_lock(&vcpu->kvm->lock); - r = get_msr_hyperv_pw(vcpu, msr, pdata); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return get_msr_hyperv(vcpu, msr, pdata); + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + return kvm_hv_get_msr_common(vcpu, + msr_info->index, &msr_info->data); break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current @@ -2661,31 +2416,30 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * L2 cache control register 3: 64GB range, 256KB size, * enabled, latency 0x1, configured */ - data = 0xbe702111; + msr_info->data = 0xbe702111; break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) return 1; - data = vcpu->arch.osvw.length; + msr_info->data = vcpu->arch.osvw.length; break; case MSR_AMD64_OSVW_STATUS: if (!guest_cpuid_has_osvw(vcpu)) return 1; - data = vcpu->arch.osvw.status; + msr_info->data = vcpu->arch.osvw.status; break; default: - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_get_msr(vcpu, msr, pdata); + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); return 1; } else { - vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); - data = 0; + vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); + msr_info->data = 0; } break; } - *pdata = data; return 0; } EXPORT_SYMBOL_GPL(kvm_get_msr_common); @@ -2798,12 +2552,27 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: + case KVM_CAP_ENABLE_CAP_VM: + case KVM_CAP_DISABLE_QUIRKS: + case KVM_CAP_SET_BOOT_CPU_ID: + case KVM_CAP_SPLIT_IRQCHIP: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: #endif r = 1; break; + case KVM_CAP_X86_SMM: + /* SMBASE is usually relocated above 1M on modern chipsets, + * and SMM handlers might indeed rely on 4G segment limits, + * so do not report SMM to be available if real mode is + * emulated via vm86 mode. Still, do not go to great lengths + * to avoid userspace's usage of the feature, because it is a + * fringe case that is not enabled except via specific settings + * of the module parameters. + */ + r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); + break; case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; @@ -2860,7 +2629,7 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) goto out; n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) goto out; r = -E2BIG; @@ -2872,7 +2641,7 @@ long kvm_arch_dev_ioctl(struct file *filp, goto out; if (copy_to_user(user_msr_list->indices + num_msrs_to_save, &emulated_msrs, - ARRAY_SIZE(emulated_msrs) * sizeof(u32))) + num_emulated_msrs * sizeof(u32))) goto out; r = 0; break; @@ -2946,11 +2715,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { s64 tsc_delta = !vcpu->arch.last_host_tsc ? 
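The new KVM_CAP_X86_SMM capability above is deliberately reported as absent when real mode has to be emulated via vm86, so userspace is expected to probe it rather than assume SMM support. A minimal userspace probe with the standard /dev/kvm ioctl; it needs kernel headers new enough to define KVM_CAP_X86_SMM, and error handling is reduced to the essentials.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);

	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	/* KVM_CHECK_EXTENSION returns 0 (absent) or a positive value (present). */
	int smm = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_X86_SMM);

	printf("KVM_CAP_X86_SMM: %s\n", smm > 0 ? "available" : "not reported");
	close(kvm);
	return 0;
}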
0 : - native_read_tsc() - vcpu->arch.last_host_tsc; + rdtsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { - u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, + u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; @@ -2966,7 +2735,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->cpu = cpu; } - accumulate_steal_time(vcpu); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } @@ -2974,7 +2742,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_host_tsc = native_read_tsc(); + vcpu->arch.last_host_tsc = rdtsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, @@ -2995,17 +2763,50 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, return 0; } +static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) +{ + return (!lapic_in_kernel(vcpu) || + kvm_apic_accept_pic_intr(vcpu)); +} + +/* + * if userspace requested an interrupt window, check that the + * interrupt window is open. + * + * No need to exit to userspace if we already have an interrupt queued. + */ +static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) +{ + return kvm_arch_interrupt_allowed(vcpu) && + !kvm_cpu_has_interrupt(vcpu) && + !kvm_event_needs_reinjection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); +} + static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { if (irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) + + if (!irqchip_in_kernel(vcpu->kvm)) { + kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + + /* + * With in-kernel LAPIC, we only use this to inject EXTINT, so + * fail for in-kernel 8259. 
+ */ + if (pic_in_kernel(vcpu->kvm)) return -ENXIO; - kvm_queue_interrupt(vcpu, irq->irq, false); - kvm_make_request(KVM_REQ_EVENT, vcpu); + if (vcpu->arch.pending_external_vector != -1) + return -EEXIST; + vcpu->arch.pending_external_vector = irq->irq; + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -3016,6 +2817,13 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) return 0; } +static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_SMI, vcpu); + + return 0; +} + static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -3121,8 +2929,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->sipi_vector = 0; /* never valid when reporting to user space */ + events->smi.smm = is_smm(vcpu); + events->smi.pending = vcpu->arch.smi_pending; + events->smi.smm_inside_nmi = + !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); + events->smi.latched_init = kvm_lapic_latched_init(vcpu); + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SHADOW); + | KVM_VCPUEVENT_VALID_SHADOW + | KVM_VCPUEVENT_VALID_SMM); memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -3131,7 +2946,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, { if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR - | KVM_VCPUEVENT_VALID_SHADOW)) + | KVM_VCPUEVENT_VALID_SHADOW + | KVM_VCPUEVENT_VALID_SMM)) return -EINVAL; process_nmi(vcpu); @@ -3156,6 +2972,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_vcpu_has_lapic(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; + if (events->flags & KVM_VCPUEVENT_VALID_SMM) { + if (events->smi.smm) + vcpu->arch.hflags |= HF_SMM_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_MASK; + vcpu->arch.smi_pending = events->smi.pending; + if (events->smi.smm_inside_nmi) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; + if (kvm_vcpu_has_lapic(vcpu)) { + if (events->smi.latched_init) + set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + else + clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + } + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -3194,8 +3028,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { - struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; - u64 xstate_bv = xsave->xsave_hdr.xstate_bv; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; + u64 xstate_bv = xsave->header.xfeatures; u64 valid; /* @@ -3211,7 +3045,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) * Copy each region from the possibly compacted offset to the * non-compacted offset. */ - valid = xstate_bv & ~XSTATE_FPSSE; + valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { u64 feature = valid & -valid; int index = fls64(feature) - 1; @@ -3230,7 +3064,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) { - struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); u64 valid; @@ -3241,15 +3075,15 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) memcpy(xsave, src, XSAVE_HDR_OFFSET); /* Set XSTATE_BV and possibly XCOMP_BV. 
*/ - xsave->xsave_hdr.xstate_bv = xstate_bv; + xsave->header.xfeatures = xstate_bv; if (cpu_has_xsaves) - xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; + xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* * Copy each region from the non-compacted offset to the * possibly compacted offset. */ - valid = xstate_bv & ~XSTATE_FPSSE; + valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { u64 feature = valid & -valid; int index = fls64(feature) - 1; @@ -3260,8 +3094,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) cpuid_count(XSTATE_CPUID, index, &size, &offset, &ecx, &edx); memcpy(dest, src + offset, size); - } else - WARN_ON_ONCE(1); + } valid -= feature; } @@ -3275,10 +3108,10 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, fill_xsave((u8 *) guest_xsave->region, vcpu); } else { memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state->fxsave, - sizeof(struct i387_fxsave_struct)); + &vcpu->arch.guest_fpu.state.fxsave, + sizeof(struct fxregs_state)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XSTATE_FPSSE; + XFEATURE_MASK_FPSSE; } } @@ -3298,10 +3131,10 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { - if (xstate_bv & ~XSTATE_FPSSE) + if (xstate_bv & ~XFEATURE_MASK_FPSSE) return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state->fxsave, - guest_xsave->region, sizeof(struct i387_fxsave_struct)); + memcpy(&vcpu->arch.guest_fpu.state.fxsave, + guest_xsave->region, sizeof(struct fxregs_state)); } return 0; } @@ -3415,6 +3248,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_nmi(vcpu); break; } + case KVM_SMI: { + r = kvm_vcpu_ioctl_smi(vcpu); + break; + } case KVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; @@ -3454,7 +3291,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_MSRS: - r = msr_io(vcpu, argp, kvm_get_msr, 1); + r = msr_io(vcpu, argp, do_get_msr, 1); break; case KVM_SET_MSRS: r = msr_io(vcpu, argp, do_set_msr, 0); @@ -3478,7 +3315,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vapic_addr va; r = -EINVAL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) goto out; r = -EFAULT; if (copy_from_user(&va, argp, sizeof va)) @@ -3605,9 +3442,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (user_tsc_khz == 0) user_tsc_khz = tsc_khz; - kvm_set_tsc_khz(vcpu, user_tsc_khz); + if (!kvm_set_tsc_khz(vcpu, user_tsc_khz)) + r = 0; - r = 0; goto out; } case KVM_GET_TSC_KHZ: { @@ -3727,41 +3564,38 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - + int i; mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); - kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); + for (i = 0; i < 3; i++) + kvm_pit_load_count(kvm, i, ps->channels[i].count, 0); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); 
memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); memset(&ps->reserved, 0, sizeof(ps->reserved)); - return r; + return 0; } static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0, start = 0; + int start = 0; + int i; u32 prev_legacy, cur_legacy; mutex_lock(&kvm->arch.vpit->pit_state.lock); prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; @@ -3771,9 +3605,11 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, sizeof(kvm->arch.vpit->pit_state.channels)); kvm->arch.vpit->pit_state.flags = ps->flags; - kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); + for (i = 0; i < 3; i++) + kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count, + start && i == 0); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_reinject(struct kvm *kvm, @@ -3845,6 +3681,48 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, return 0; } +static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_DISABLE_QUIRKS: + kvm->arch.disabled_quirks = cap->args[0]; + r = 0; + break; + case KVM_CAP_SPLIT_IRQCHIP: { + mutex_lock(&kvm->lock); + r = -EINVAL; + if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) + goto split_irqchip_unlock; + r = -EEXIST; + if (irqchip_in_kernel(kvm)) + goto split_irqchip_unlock; + if (atomic_read(&kvm->online_vcpus)) + goto split_irqchip_unlock; + r = kvm_setup_empty_irq_routing(kvm); + if (r) + goto split_irqchip_unlock; + /* Pairs with irqchip_in_kernel. */ + smp_wmb(); + kvm->arch.irqchip_split = true; + kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; + r = 0; +split_irqchip_unlock: + mutex_unlock(&kvm->lock); + break; + } + default: + r = -EINVAL; + break; + } + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3897,30 +3775,25 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_ioapic_init(kvm); if (r) { mutex_lock(&kvm->slots_lock); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_master); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_slave); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_eclr); + kvm_destroy_pic(vpic); mutex_unlock(&kvm->slots_lock); - kfree(vpic); goto create_irqchip_unlock; } } else goto create_irqchip_unlock; - smp_wmb(); - kvm->arch.vpic = vpic; - smp_wmb(); r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(kvm); + kvm_destroy_pic(vpic); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); + goto create_irqchip_unlock; } + /* Write kvm->irq_routing before kvm->arch.vpic. 
*/ + smp_wmb(); + kvm->arch.vpic = vpic; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -3956,7 +3829,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -3980,7 +3853,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) @@ -4047,6 +3920,15 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_reinject(kvm, &control); break; } + case KVM_SET_BOOT_CPU_ID: + r = 0; + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus) != 0) + r = -EBUSY; + else + kvm->arch.bsp_vcpu_id = arg; + mutex_unlock(&kvm->lock); + break; case KVM_XEN_HVM_CONFIG: { r = -EFAULT; if (copy_from_user(&kvm->arch.xen_hvm_config, argp, @@ -4097,7 +3979,15 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vm_ioctl_enable_cap(kvm, &cap); + break; + } default: r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); } @@ -4110,23 +4000,23 @@ static void kvm_init_msr_list(void) u32 dummy[2]; unsigned i, j; - /* skip the first msrs in the list. KVM-specific */ - for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; /* * Even MSRs that are valid in the host may not be exposed - * to the guests in some cases. We could work around this - * in VMX with the generic MSR save/load machinery, but it - * is not really worthwhile since it will really only - * happen with nested virtualization. + * to the guests in some cases. 
*/ switch (msrs_to_save[i]) { case MSR_IA32_BNDCFGS: if (!kvm_x86_ops->mpx_supported()) continue; break; + case MSR_TSC_AUX: + if (!kvm_x86_ops->rdtscp_supported()) + continue; + break; default: break; } @@ -4136,6 +4026,22 @@ static void kvm_init_msr_list(void) j++; } num_msrs_to_save = j; + + for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { + switch (emulated_msrs[i]) { + case MSR_IA32_SMBASE: + if (!kvm_x86_ops->cpu_has_high_real_mode_segbase()) + continue; + break; + default: + break; + } + + if (j < i) + emulated_msrs[j] = emulated_msrs[i]; + j++; + } + num_emulated_msrs = j; } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -4253,8 +4159,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data, - offset, toread); + ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, + offset, toread); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4287,8 +4193,8 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, offset = addr & (PAGE_SIZE-1); if (WARN_ON(offset + bytes > PAGE_SIZE)) bytes = (unsigned)PAGE_SIZE - offset; - ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val, - offset, bytes); + ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val, + offset, bytes); if (unlikely(ret < 0)) return X86EMUL_IO_NEEDED; @@ -4315,6 +4221,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } +static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); + + return r < 0 ? 
X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; +} + int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -4334,7 +4249,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); + ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4387,7 +4302,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { int ret; - ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes); if (ret < 0) return 0; kvm_mmu_pte_write(vcpu, gpa, val, bytes); @@ -4421,7 +4336,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - return !kvm_read_guest(vcpu->kvm, gpa, val, bytes); + return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes); } static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4619,7 +4534,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) goto emul_write; - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); if (is_error_page(page)) goto emul_write; @@ -4647,7 +4562,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; - mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -4946,7 +4861,17 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + struct msr_data msr; + int r; + + msr.index = msr_index; + msr.host_initiated = false; + r = kvm_get_msr(emul_to_vcpu(ctxt), &msr); + if (r) + return r; + + *pdata = msr.data; + return 0; } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, @@ -4960,16 +4885,30 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } +static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + return vcpu->arch.smbase; +} + +static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + vcpu->arch.smbase = smbase; +} + static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc); + return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata) { - return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata); + return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata); } static void emulator_halt(struct x86_emulate_ctxt *ctxt) @@ -5026,6 +4965,7 @@ static const struct x86_emulate_ops emulate_ops = { .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, + .read_phys = kvm_read_guest_phys_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, @@ -5045,6 +4985,8 @@ static const struct x86_emulate_ops emulate_ops = { .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = 
emulator_set_dr, + .get_smbase = emulator_get_smbase, + .set_smbase = emulator_set_smbase, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .check_pmc = emulator_check_pmc, @@ -5106,7 +5048,10 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - ctxt->guest_mode = is_guest_mode(vcpu); + BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); + BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); + BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); + ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; @@ -5275,6 +5220,34 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); +static void kvm_smm_changed(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.hflags & HF_SMM_MASK)) { + /* This is a good place to trace that we are exiting SMM. */ + trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); + + if (unlikely(vcpu->arch.smi_pending)) { + kvm_make_request(KVM_REQ_SMI, vcpu); + vcpu->arch.smi_pending = 0; + } else { + /* Process a latched INIT, if any. */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + } + } + + kvm_mmu_reset_context(vcpu); +} + +static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) +{ + unsigned changed = vcpu->arch.hflags ^ emul_flags; + + vcpu->arch.hflags = emul_flags; + + if (changed & HF_SMM_MASK) + kvm_smm_changed(vcpu); +} + static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { @@ -5474,6 +5447,8 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + if (vcpu->arch.hflags != ctxt->emul_flags) + kvm_set_hflags(vcpu, ctxt->emul_flags); kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) kvm_vcpu_check_singlestep(vcpu, rflags, &r); @@ -5870,7 +5845,7 @@ void kvm_arch_exit(void) int kvm_vcpu_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { + if (lapic_in_kernel(vcpu)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; return 1; } else { @@ -5887,66 +5862,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); -int kvm_hv_hypercall(struct kvm_vcpu *vcpu) -{ - u64 param, ingpa, outgpa, ret; - uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; - bool fast, longmode; - - /* - * hypercall generates UD from non zero cpl and real mode - * per HYPER-V spec - */ - if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - longmode = is_64_bit_mode(vcpu); - - if (!longmode) { - param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); - ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); - outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); - } -#ifdef CONFIG_X86_64 - else { - param = kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); - } -#endif - - code = param & 0xffff; - fast = (param >> 16) & 0x1; - rep_cnt = (param >> 32) & 0xfff; - rep_idx = (param >> 48) & 0xfff; - 
- trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); - - switch (code) { - case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: - kvm_vcpu_on_spin(vcpu); - break; - default: - res = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - } - - ret = res | (((u64)rep_done & 0xfff) << 32); - if (longmode) { - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - } else { - kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); - } - - return 1; -} - /* * kvm_pv_kick_cpu_op: Kick a vcpu. * @@ -5959,6 +5874,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) lapic_irq.shorthand = 0; lapic_irq.dest_mode = 0; lapic_irq.dest_id = apicid; + lapic_irq.msi_redir_hint = false; lapic_irq.delivery_mode = APIC_DM_REMRD; kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); @@ -6028,17 +5944,10 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); } -/* - * Check if userspace requested an interrupt window, and that the - * interrupt window is open. - * - * No need to exit to userspace if we already have an interrupt queued. - */ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { - return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && - vcpu->run->request_interrupt_window && - kvm_arch_interrupt_allowed(vcpu)); + return vcpu->run->request_interrupt_window && + likely(!pic_in_kernel(vcpu->kvm)); } static void post_kvm_run_save(struct kvm_vcpu *vcpu) @@ -6046,15 +5955,12 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else - kvm_run->ready_for_interrupt_injection = - kvm_arch_interrupt_allowed(vcpu) && - !kvm_cpu_has_interrupt(vcpu) && - !kvm_event_needs_reinjection(vcpu); + kvm_run->ready_for_interrupt_injection = + pic_in_kernel(vcpu->kvm) || + kvm_vcpu_ready_for_interrupt_injection(vcpu); } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -6169,20 +6075,254 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } -static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) +#define put_smstate(type, buf, offset, val) \ + *(type *)((buf) + (offset) - 0x7e00) = val + +static u32 process_smi_get_segment_flags(struct kvm_segment *seg) { - u64 eoi_exit_bitmap[4]; - u32 tmr[8]; + u32 flags = 0; + flags |= seg->g << 23; + flags |= seg->db << 22; + flags |= seg->l << 21; + flags |= seg->avl << 20; + flags |= seg->present << 15; + flags |= seg->dpl << 13; + flags |= seg->s << 12; + flags |= seg->type << 8; + return flags; +} +static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) +{ + struct kvm_segment seg; + int offset; + + kvm_get_segment(vcpu, &seg, n); + put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector); + + if (n < 3) + offset = 0x7f84 + n * 12; + else + offset = 0x7f2c + (n - 3) * 12; + + put_smstate(u32, buf, offset + 8, seg.base); + put_smstate(u32, buf, offset + 4, seg.limit); + put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg)); +} + +#ifdef CONFIG_X86_64 +static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) +{ + struct kvm_segment seg; + int offset; + u16 flags; + + kvm_get_segment(vcpu, &seg, 
n); + offset = 0x7e00 + n * 16; + + flags = process_smi_get_segment_flags(&seg) >> 8; + put_smstate(u16, buf, offset, seg.selector); + put_smstate(u16, buf, offset + 2, flags); + put_smstate(u32, buf, offset + 4, seg.limit); + put_smstate(u64, buf, offset + 8, seg.base); +} +#endif + +static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) +{ + struct desc_ptr dt; + struct kvm_segment seg; + unsigned long val; + int i; + + put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); + put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); + put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); + put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); + + for (i = 0; i < 8; i++) + put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i)); + + kvm_get_dr(vcpu, 6, &val); + put_smstate(u32, buf, 0x7fcc, (u32)val); + kvm_get_dr(vcpu, 7, &val); + put_smstate(u32, buf, 0x7fc8, (u32)val); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); + put_smstate(u32, buf, 0x7fc4, seg.selector); + put_smstate(u32, buf, 0x7f64, seg.base); + put_smstate(u32, buf, 0x7f60, seg.limit); + put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg)); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); + put_smstate(u32, buf, 0x7fc0, seg.selector); + put_smstate(u32, buf, 0x7f80, seg.base); + put_smstate(u32, buf, 0x7f7c, seg.limit); + put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg)); + + kvm_x86_ops->get_gdt(vcpu, &dt); + put_smstate(u32, buf, 0x7f74, dt.address); + put_smstate(u32, buf, 0x7f70, dt.size); + + kvm_x86_ops->get_idt(vcpu, &dt); + put_smstate(u32, buf, 0x7f58, dt.address); + put_smstate(u32, buf, 0x7f54, dt.size); + + for (i = 0; i < 6; i++) + process_smi_save_seg_32(vcpu, buf, i); + + put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); + + /* revision id */ + put_smstate(u32, buf, 0x7efc, 0x00020000); + put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); +} + +static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) +{ +#ifdef CONFIG_X86_64 + struct desc_ptr dt; + struct kvm_segment seg; + unsigned long val; + int i; + + for (i = 0; i < 16; i++) + put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i)); + + put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu)); + put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); + + kvm_get_dr(vcpu, 6, &val); + put_smstate(u64, buf, 0x7f68, val); + kvm_get_dr(vcpu, 7, &val); + put_smstate(u64, buf, 0x7f60, val); + + put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); + put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); + put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); + + put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase); + + /* revision id */ + put_smstate(u32, buf, 0x7efc, 0x00020064); + + put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); + put_smstate(u16, buf, 0x7e90, seg.selector); + put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u32, buf, 0x7e94, seg.limit); + put_smstate(u64, buf, 0x7e98, seg.base); + + kvm_x86_ops->get_idt(vcpu, &dt); + put_smstate(u32, buf, 0x7e84, dt.size); + put_smstate(u64, buf, 0x7e88, dt.address); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); + put_smstate(u16, buf, 0x7e70, seg.selector); + put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u32, buf, 0x7e74, seg.limit); + put_smstate(u64, buf, 0x7e78, seg.base); + + kvm_x86_ops->get_gdt(vcpu, &dt); + put_smstate(u32, buf, 0x7e64, dt.size); + put_smstate(u64, buf, 0x7e68, dt.address); + + for (i = 0; i < 6; i++) + 
process_smi_save_seg_64(vcpu, buf, i); +#else + WARN_ON_ONCE(1); +#endif +} + +static void process_smi(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ds; + struct desc_ptr dt; + char buf[512]; + u32 cr0; + + if (is_smm(vcpu)) { + vcpu->arch.smi_pending = true; + return; + } + + trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); + vcpu->arch.hflags |= HF_SMM_MASK; + memset(buf, 0, 512); + if (guest_cpuid_has_longmode(vcpu)) + process_smi_save_state_64(vcpu, buf); + else + process_smi_save_state_32(vcpu, buf); + + kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); + + if (kvm_x86_ops->get_nmi_mask(vcpu)) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; + else + kvm_x86_ops->set_nmi_mask(vcpu, true); + + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0x8000); + + cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); + kvm_x86_ops->set_cr0(vcpu, cr0); + vcpu->arch.cr0 = cr0; + + kvm_x86_ops->set_cr4(vcpu, 0); + + /* Undocumented: IDT limit is set to zero on entry to SMM. */ + dt.address = dt.size = 0; + kvm_x86_ops->set_idt(vcpu, &dt); + + __kvm_set_dr(vcpu, 7, DR7_FIXED_1); + + cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; + cs.base = vcpu->arch.smbase; + + ds.selector = 0; + ds.base = 0; + + cs.limit = ds.limit = 0xffffffff; + cs.type = ds.type = 0x3; + cs.dpl = ds.dpl = 0; + cs.db = ds.db = 0; + cs.s = ds.s = 1; + cs.l = ds.l = 0; + cs.g = ds.g = 1; + cs.avl = ds.avl = 0; + cs.present = ds.present = 1; + cs.unusable = ds.unusable = 0; + cs.padding = ds.padding = 0; + + kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); + kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); + + if (guest_cpuid_has_longmode(vcpu)) + kvm_x86_ops->set_efer(vcpu, 0); + + kvm_update_cpuid(vcpu); + kvm_mmu_reset_context(vcpu); +} + +static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) +{ if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - memset(eoi_exit_bitmap, 0, 32); - memset(tmr, 0, 32); + memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); - kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); - kvm_apic_update_tmr(vcpu, tmr); + if (irqchip_split(vcpu->kvm)) + kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap); + else { + kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + } + kvm_x86_ops->load_eoi_exitmap(vcpu); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) @@ -6195,7 +6335,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { struct page *page = NULL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) return; if (!kvm_x86_ops->set_apic_access_page_addr) @@ -6233,8 +6373,10 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && - vcpu->run->request_interrupt_window; + bool req_int_win = + dm_request_for_irq_injection(vcpu) && + kvm_cpu_accept_dm_intr(vcpu); + bool req_immediate_exit = false; if (vcpu->requests) { @@ -6277,16 +6419,55 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); + if (kvm_check_request(KVM_REQ_SMI, vcpu)) + process_smi(vcpu); if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); if (kvm_check_request(KVM_REQ_PMU, 
vcpu)) - kvm_handle_pmu_event(vcpu); + kvm_pmu_handle_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) - kvm_deliver_pmi(vcpu); + kvm_pmu_deliver_pmi(vcpu); + if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { + BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); + if (test_bit(vcpu->arch.pending_ioapic_eoi, + (void *) vcpu->arch.eoi_exit_bitmap)) { + vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; + vcpu->run->eoi.vector = + vcpu->arch.pending_ioapic_eoi; + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) kvm_vcpu_reload_apic_access_page(vcpu); + if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; + r = 0; + goto out; + } + if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; + r = 0; + goto out; + } + } + + /* + * KVM_REQ_EVENT is not set when posted interrupts are set by + * VT-d hardware, so we have to update RVI unconditionally. + */ + if (kvm_lapic_enabled(vcpu)) { + /* + * Update architecture specific hints for APIC + * virtual interrupt delivery. + */ + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -6305,13 +6486,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { - /* - * Update architecture specific hints for APIC - * virtual interrupt delivery. - */ - if (kvm_x86_ops->hwapic_irr_update) - kvm_x86_ops->hwapic_irr_update(vcpu, - kvm_lapic_find_highest_irr(vcpu)); update_cr8_intercept(vcpu); kvm_lapic_sync_to_vapic(vcpu); } @@ -6354,7 +6528,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) smp_send_reschedule(vcpu->cpu); - kvm_guest_enter(); + trace_kvm_entry(vcpu->vcpu_id); + wait_lapic_expire(vcpu); + __kvm_guest_enter(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -6366,8 +6542,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - trace_kvm_entry(vcpu->vcpu_id); - wait_lapic_expire(vcpu); kvm_x86_ops->run(vcpu); /* @@ -6377,12 +6551,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * KVM_DEBUGREG_WONT_EXIT again. 
*/ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { - int i; - WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); kvm_x86_ops->sync_dirty_debug_regs(vcpu); - for (i = 0; i < KVM_NR_DB_REGS; i++) - vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + kvm_update_dr0123(vcpu); + kvm_update_dr6(vcpu); + kvm_update_dr7(vcpu); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* @@ -6395,8 +6569,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, - native_read_tsc()); + vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -6447,10 +6620,15 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu)) { + if (!kvm_arch_vcpu_runnable(vcpu) && + (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_x86_ops->post_block) + kvm_x86_ops->post_block(vcpu); + if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } @@ -6473,6 +6651,12 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) return 1; } +static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted); +} + static int vcpu_run(struct kvm_vcpu *vcpu) { int r; @@ -6481,11 +6665,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu) vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); for (;;) { - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) + if (kvm_vcpu_running(vcpu)) { r = vcpu_enter_guest(vcpu); - else + } else { r = vcpu_block(kvm, vcpu); + } + if (r <= 0) break; @@ -6493,9 +6678,10 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); - if (dm_request_for_irq_injection(vcpu)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + if (dm_request_for_irq_injection(vcpu) && + kvm_vcpu_ready_for_interrupt_injection(vcpu)) { + r = 0; + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.request_irq_exits; break; } @@ -6604,11 +6790,11 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct fpu *fpu = ¤t->thread.fpu; int r; sigset_t sigsaved; - if (!tsk_used_math(current) && init_fpu(current)) - return -ENOMEM; + fpu__activate_curr(fpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -6622,7 +6808,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) { + if (!lapic_in_kernel(vcpu)) { if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { r = -EINVAL; goto out; @@ -6946,7 +7132,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops->update_db_bp_intercept(vcpu); + kvm_x86_ops->update_bp_intercept(vcpu); r = 0; @@ -6978,8 +7164,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; + struct fxregs_state *fxsave = + &vcpu->arch.guest_fpu.state.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -6995,8 +7181,8 @@ int 
kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; + struct fxregs_state *fxsave = + &vcpu->arch.guest_fpu.state.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -7010,33 +7196,19 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } -int fx_init(struct kvm_vcpu *vcpu) +static void fx_init(struct kvm_vcpu *vcpu) { - int err; - - err = fpu_alloc(&vcpu->arch.guest_fpu); - if (err) - return err; - - fpu_finit(&vcpu->arch.guest_fpu); + fpstate_init(&vcpu->arch.guest_fpu.state); if (cpu_has_xsaves) - vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv = + vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* * Ensure guest xcr0 is valid for loading */ - vcpu->arch.xcr0 = XSTATE_FP; + vcpu->arch.xcr0 = XFEATURE_MASK_FP; vcpu->arch.cr0 |= X86_CR0_ET; - - return 0; -} -EXPORT_SYMBOL_GPL(fx_init); - -static void fx_free(struct kvm_vcpu *vcpu) -{ - fpu_free(&vcpu->arch.guest_fpu); } void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) @@ -7052,7 +7224,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; __kernel_fpu_begin(); - fpu_restore_checking(&vcpu->arch.guest_fpu); + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state); trace_kvm_fpu(1); } @@ -7060,16 +7232,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { kvm_put_guest_xcr0(vcpu); - if (!vcpu->guest_fpu_loaded) + if (!vcpu->guest_fpu_loaded) { + vcpu->fpu_counter = 0; return; + } vcpu->guest_fpu_loaded = 0; - fpu_save_init(&vcpu->arch.guest_fpu); + copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - if (!vcpu->arch.eager_fpu) - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); - + /* + * If using eager FPU mode, or if the guest is a frequent user + * of the FPU, just leave the FPU active for next time. + * Every 255 times fpu_counter rolls over to 0; a guest that uses + * the FPU in bursts will revert to loading it on demand. + */ + if (!vcpu->arch.eager_fpu) { + if (++vcpu->fpu_counter < 5) + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + } trace_kvm_fpu(0); } @@ -7078,7 +7259,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvmclock_reset(vcpu); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -7094,11 +7274,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu = kvm_x86_ops->vcpu_create(kvm, id); - /* - * Activate fpu unconditionally in case the guest needs eager FPU. It will be - * deactivated soon if it doesn't. 
- */ - kvm_x86_ops->fpu_activate(vcpu); return vcpu; } @@ -7106,14 +7281,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int r; - vcpu->arch.mtrr_state.have_fixed = 1; + kvm_vcpu_mtrr_init(vcpu); r = vcpu_load(vcpu); if (r) return r; - kvm_vcpu_reset(vcpu); + kvm_vcpu_reset(vcpu, false); kvm_mmu_setup(vcpu); vcpu_put(vcpu); - return r; } @@ -7130,6 +7304,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + if (!kvmclock_periodic_sync) + return; + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); } @@ -7144,12 +7321,13 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); - fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } -void kvm_vcpu_reset(struct kvm_vcpu *vcpu) +void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + vcpu->arch.hflags = 0; + atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; @@ -7175,13 +7353,16 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - kvm_pmu_reset(vcpu); + if (!init_event) { + kvm_pmu_reset(vcpu); + vcpu->arch.smbase = 0x30000; + } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; - kvm_x86_ops->vcpu_reset(vcpu); + kvm_x86_ops->vcpu_reset(vcpu, init_event); } void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) @@ -7210,7 +7391,7 @@ int kvm_arch_hardware_enable(void) if (ret != 0) return ret; - local_tsc = native_read_tsc(); + local_tsc = rdtsc(); stable = !check_tsc_unstable(); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { @@ -7300,6 +7481,20 @@ int kvm_arch_hardware_setup(void) if (r != 0) return r; + if (kvm_has_tsc_control) { + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated needed because it will always + * be 1 on all machines. 
+ */ + u64 max = min(0x7fffffffULL, + __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); + kvm_max_guest_tsc_khz = max; + + kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + } + kvm_init_msr_list(); return 0; } @@ -7314,9 +7509,20 @@ void kvm_arch_check_processor_compat(void *rtn) kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); + +bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; +} + bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { - return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); + return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu); } struct static_key kvm_no_apic_vcpu __read_mostly; @@ -7370,9 +7576,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; } - r = fx_init(vcpu); - if (r) - goto fail_free_wbinvd_dirty_mask; + fx_init(vcpu); vcpu->arch.ia32_tsc_adjust_msr = 0x0; vcpu->arch.pv_time_enabled = false; @@ -7382,12 +7586,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); + vcpu->arch.pending_external_vector = -1; + return 0; -fail_free_wbinvd_dirty_mask: - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + fail_free_mce_banks: kfree(vcpu->arch.mce_banks); fail_free_lapic: @@ -7411,7 +7618,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) static_key_slow_dec(&kvm_no_apic_vcpu); } @@ -7489,6 +7696,72 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) +{ + int i, r; + unsigned long hva; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *slot, old; + + /* Called with kvm->slots_lock held. */ + if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) + return -EINVAL; + + slot = id_to_memslot(slots, id); + if (size) { + if (WARN_ON(slot->npages)) + return -EEXIST; + + /* + * MAP_SHARED to prevent internal slot pages from being moved + * by fork()/COW. 
+ */ + hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0); + if (IS_ERR((void *)hva)) + return PTR_ERR((void *)hva); + } else { + if (!slot->npages) + return 0; + + hva = 0; + } + + old = *slot; + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + struct kvm_userspace_memory_region m; + + m.slot = id | (i << 16); + m.flags = 0; + m.guest_phys_addr = gpa; + m.userspace_addr = hva; + m.memory_size = size; + r = __kvm_set_memory_region(kvm, &m); + if (r < 0) + return r; + } + + if (!size) { + r = vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); + WARN_ON(r < 0); + } + + return 0; +} +EXPORT_SYMBOL_GPL(__x86_set_memory_region); + +int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) +{ + int r; + + mutex_lock(&kvm->slots_lock); + r = __x86_set_memory_region(kvm, id, gpa, size); + mutex_unlock(&kvm->slots_lock); + + return r; +} +EXPORT_SYMBOL_GPL(x86_set_memory_region); + void kvm_arch_destroy_vm(struct kvm *kvm) { if (current->mm == kvm->mm) { @@ -7497,16 +7770,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) * unless the the memory map has changed due to process exit * or fd copying. */ - struct kvm_userspace_memory_region mem; - memset(&mem, 0, sizeof(mem)); - mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); - - mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); - - mem.slot = TSS_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); + x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); + x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); } kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); @@ -7595,41 +7861,20 @@ out_free: return -ENOMEM; } -void kvm_arch_memslots_updated(struct kvm *kvm) +void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) { /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value. */ - kvm_mmu_invalidate_mmio_sptes(kvm); + kvm_mmu_invalidate_mmio_sptes(kvm, slots); } int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { - /* - * Only private memory slots need to be mapped here since - * KVM_SET_MEMORY_REGION ioctl is no longer supported. - */ - if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) { - unsigned long userspace_addr; - - /* - * MAP_SHARED to prevent internal slot pages from being moved - * by fork()/COW. 
- */ - userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, 0); - - if (IS_ERR((void *)userspace_addr)) - return PTR_ERR((void *)userspace_addr); - - memslot->userspace_addr = userspace_addr; - } - return 0; } @@ -7684,33 +7929,19 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { - struct kvm_memory_slot *new; int nr_mmu_pages = 0; - if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { - int ret; - - ret = vm_munmap(old->userspace_addr, - old->npages * PAGE_SIZE); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } - if (!kvm->arch.n_requested_mmu_pages) nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - /* It's OK to get 'new' slot here as it has already been installed */ - new = id_to_memslot(kvm->memslots, mem->slot); - /* * Dirty logging tracks sptes in 4k granularity, meaning that large * sptes have to be split. If live migration is successful, the guest @@ -7735,9 +7966,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * been zapped so no dirty logging staff is needed for old slot. For * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the * new and it's also covered when dealing with the new slot. + * + * FIXME: const-ify all uses of struct kvm_memory_slot. */ if (change != KVM_MR_DELETE) - kvm_mmu_slot_apply_flags(kvm, new); + kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -7751,19 +7984,36 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, kvm_mmu_invalidate_zap_all_pages(kvm); } +static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +{ + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; + + if (kvm_apic_has_events(vcpu)) + return true; + + if (vcpu->arch.pv.pv_unhalted) + return true; + + if (atomic_read(&vcpu->arch.nmi_queued)) + return true; + + if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + return true; + + if (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_has_interrupt(vcpu)) + return true; + + return false; +} + int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) kvm_x86_ops->check_nested_events(vcpu, false); - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) - || !list_empty_careful(&vcpu->async_pf.done) - || kvm_apic_has_events(vcpu) - || vcpu->arch.pv.pv_unhalted - || atomic_read(&vcpu->arch.nmi_queued) || - (kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)); + return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) @@ -7959,6 +8209,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) kvm_x86_ops->interrupt_allowed(vcpu); } +void kvm_arch_start_assignment(struct kvm *kvm) +{ + atomic_inc(&kvm->arch.assigned_device_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); + +void kvm_arch_end_assignment(struct kvm *kvm) +{ + atomic_dec(&kvm->arch.assigned_device_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); + +bool kvm_arch_has_assigned_device(struct kvm *kvm) +{ + return 
atomic_read(&kvm->arch.assigned_device_count); +} +EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); + void kvm_arch_register_noncoherent_dma(struct kvm *kvm) { atomic_inc(&kvm->arch.noncoherent_dma_count); @@ -7977,7 +8245,59 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (kvm_x86_ops->update_pi_irte) { + irqfd->producer = prod; + return kvm_x86_ops->update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); + } + + return -EINVAL; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + int ret; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (!kvm_x86_ops->update_pi_irte) { + WARN_ON(irqfd->producer != NULL); + return; + } + + WARN_ON(irqfd->producer != prod); + irqfd->producer = NULL; + + /* + * When producer of consumer is unregistered, we change back to + * remapped mode, so we can re-use the current implementation + * when the irq is masked/disabed or the consumer side (KVM + * int this case doesn't want to receive the interrupts. + */ + ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + if (ret) + printk(KERN_INFO "irq bypass consumer (token %p) unregistration" + " fails: %d\n", irqfd->consumer.token, ret); +} + +int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + if (!kvm_x86_ops->update_pi_irte) + return -EINVAL; + + return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); +} + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); @@ -7992,3 +8312,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); diff --git a/kernel/arch/x86/kvm/x86.h b/kernel/arch/x86/kvm/x86.h index f5fef1868..f2afa5fe4 100644 --- a/kernel/arch/x86/kvm/x86.h +++ b/kernel/arch/x86/kvm/x86.h @@ -4,6 +4,8 @@ #include <linux/kvm_host.h> #include "kvm_cache_regs.h" +#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL + static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { vcpu->arch.exception.pending = false; @@ -145,6 +147,16 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, return kvm_register_write(vcpu, reg, val); } +static inline u64 get_kernel_ns(void) +{ + return ktime_get_boot_ns(); +} + +static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) +{ + return !(kvm->arch.disabled_quirks & quirk); +} + void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); void kvm_set_pending_timer(struct kvm_vcpu *vcpu); @@ -160,11 +172,17 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); +u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); - -#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM 
\ - | XSTATE_BNDREGS | XSTATE_BNDCSR \ - | XSTATE_AVX512) +int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, + int page_num); + +#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ + | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); diff --git a/kernel/arch/x86/lguest/boot.c b/kernel/arch/x86/lguest/boot.c index 8f9a133cc..a43b2eafc 100644 --- a/kernel/arch/x86/lguest/boot.c +++ b/kernel/arch/x86/lguest/boot.c @@ -70,7 +70,7 @@ #include <asm/e820.h> #include <asm/mce.h> #include <asm/io.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <asm/stackprotector.h> #include <asm/reboot.h> /* for struct machine_ops */ #include <asm/kvm_para.h> @@ -90,7 +90,7 @@ struct lguest_data lguest_data = { .noirq_iret = (u32)lguest_noirq_iret, .kernel_address = PAGE_OFFSET, .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = SYSCALL_VECTOR, + .syscall_vec = IA32_SYSCALL_VECTOR, }; /*G:037 @@ -835,16 +835,46 @@ static struct irq_chip lguest_irq_controller = { .irq_unmask = enable_lguest_irq, }; +/* + * Interrupt descriptors are allocated as-needed, but low-numbered ones are + * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it + * tells us the irq is already used: other errors (ie. ENOMEM) we take + * seriously. + */ +static int lguest_setup_irq(unsigned int irq) +{ + struct irq_desc *desc; + int err; + + /* Returns -ve error or vector number. */ + err = irq_alloc_desc_at(irq, 0); + if (err < 0 && err != -EEXIST) + return err; + + /* + * Tell the Linux infrastructure that the interrupt is + * controlled by our level-based lguest interrupt controller. + */ + irq_set_chip_and_handler_name(irq, &lguest_irq_controller, + handle_level_irq, "level"); + + /* Some systems map "vectors" to interrupts weirdly. Not us! */ + desc = irq_to_desc(irq); + __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); + return 0; +} + static int lguest_enable_irq(struct pci_dev *dev) { + int err; u8 line = 0; /* We literally use the PCI interrupt line as the irq number. */ pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - irq_set_chip_and_handler_name(line, &lguest_irq_controller, - handle_level_irq, "level"); - dev->irq = line; - return 0; + err = lguest_setup_irq(line); + if (!err) + dev->irq = line; + return err; } /* We don't do hotplug PCI, so this shouldn't be called. */ @@ -855,18 +885,14 @@ static void lguest_disable_irq(struct pci_dev *dev) /* * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware - * interrupt (except 128, which is used for system calls), and then tells the - * Linux infrastructure that each interrupt is controlled by our level-based - * lguest interrupt controller. + * interrupt (except 128, which is used for system calls). */ static void __init lguest_init_IRQ(void) { unsigned int i; for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { - /* Some systems map "vectors" to interrupts weirdly. Not us! 
*/ - __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); - if (i != SYSCALL_VECTOR) + if (i != IA32_SYSCALL_VECTOR) set_intr_gate(i, irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR)); } @@ -879,26 +905,6 @@ static void __init lguest_init_IRQ(void) } /* - * Interrupt descriptors are allocated as-needed, but low-numbered ones are - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it - * tells us the irq is already used: other errors (ie. ENOMEM) we take - * seriously. - */ -int lguest_setup_irq(unsigned int irq) -{ - int err; - - /* Returns -ve error or vector number. */ - err = irq_alloc_desc_at(irq, 0); - if (err < 0 && err != -EEXIST) - return err; - - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, - handle_level_irq, "level"); - return 0; -} - -/* * Time. * * It would be far better for everyone if the Guest had its own clock, but @@ -985,23 +991,11 @@ static int lguest_clockevent_set_next_event(unsigned long delta, return 0; } -static void lguest_clockevent_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int lguest_clockevent_shutdown(struct clock_event_device *evt) { - switch (mode) { - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - /* A 0 argument shuts the clock down. */ - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); - break; - case CLOCK_EVT_MODE_ONESHOT: - /* This is what we expect. */ - break; - case CLOCK_EVT_MODE_PERIODIC: - BUG(); - case CLOCK_EVT_MODE_RESUME: - break; - } + /* A 0 argument shuts the clock down. */ + hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); + return 0; } /* This describes our primitive timer chip. */ @@ -1009,7 +1003,7 @@ static struct clock_event_device lguest_clockevent = { .name = "lguest", .features = CLOCK_EVT_FEAT_ONESHOT, .set_next_event = lguest_clockevent_set_next_event, - .set_mode = lguest_clockevent_set_mode, + .set_state_shutdown = lguest_clockevent_shutdown, .rating = INT_MAX, .mult = 1, .shift = 0, @@ -1021,7 +1015,7 @@ static struct clock_event_device lguest_clockevent = { * This is the Guest timer interrupt handler (hardware interrupt 0). We just * call the clockevent infrastructure and it does whatever needs doing. */ -static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) +static void lguest_time_irq(struct irq_desc *desc) { unsigned long flags; @@ -1040,7 +1034,8 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ - lguest_setup_irq(0); + if (lguest_setup_irq(0) != 0) + panic("Could not set up timer irq"); irq_set_handler(0, lguest_time_irq); clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); @@ -1419,6 +1414,7 @@ __init void lguest_init(void) pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; + pv_info.features = 0; /* * We set up all the lguest overrides for sensitive operations. 
These diff --git a/kernel/arch/x86/lib/Makefile b/kernel/arch/x86/lib/Makefile index 1530afb07..f2587888d 100644 --- a/kernel/arch/x86/lib/Makefile +++ b/kernel/arch/x86/lib/Makefile @@ -17,7 +17,6 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o misc.o cmdline.o -lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o @@ -40,6 +39,6 @@ else lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o - lib-y += copy_user_64.o copy_user_nocache_64.o + lib-y += copy_user_64.o lib-y += cmpxchg16b_emu.o endif diff --git a/kernel/arch/x86/lib/atomic64_386_32.S b/kernel/arch/x86/lib/atomic64_386_32.S index 00933d5e9..9b0ca8fe8 100644 --- a/kernel/arch/x86/lib/atomic64_386_32.S +++ b/kernel/arch/x86/lib/atomic64_386_32.S @@ -11,26 +11,23 @@ #include <linux/linkage.h> #include <asm/alternative-asm.h> -#include <asm/dwarf2.h> /* if you want SMP support, implement these with real spinlocks */ .macro LOCK reg - pushfl_cfi + pushfl cli .endm .macro UNLOCK reg - popfl_cfi + popfl .endm #define BEGIN(op) \ .macro endp; \ - CFI_ENDPROC; \ ENDPROC(atomic64_##op##_386); \ .purgem endp; \ .endm; \ ENTRY(atomic64_##op##_386); \ - CFI_STARTPROC; \ LOCK v; #define ENDP endp diff --git a/kernel/arch/x86/lib/atomic64_cx8_32.S b/kernel/arch/x86/lib/atomic64_cx8_32.S index 082a85167..db3ae8544 100644 --- a/kernel/arch/x86/lib/atomic64_cx8_32.S +++ b/kernel/arch/x86/lib/atomic64_cx8_32.S @@ -11,7 +11,6 @@ #include <linux/linkage.h> #include <asm/alternative-asm.h> -#include <asm/dwarf2.h> .macro read64 reg movl %ebx, %eax @@ -22,16 +21,11 @@ .endm ENTRY(atomic64_read_cx8) - CFI_STARTPROC - read64 %ecx ret - CFI_ENDPROC ENDPROC(atomic64_read_cx8) ENTRY(atomic64_set_cx8) - CFI_STARTPROC - 1: /* we don't need LOCK_PREFIX since aligned 64-bit writes * are atomic on 586 and newer */ @@ -39,28 +33,23 @@ ENTRY(atomic64_set_cx8) jne 1b ret - CFI_ENDPROC ENDPROC(atomic64_set_cx8) ENTRY(atomic64_xchg_cx8) - CFI_STARTPROC - 1: LOCK_PREFIX cmpxchg8b (%esi) jne 1b ret - CFI_ENDPROC ENDPROC(atomic64_xchg_cx8) .macro addsub_return func ins insc ENTRY(atomic64_\func\()_return_cx8) - CFI_STARTPROC - pushl_cfi_reg ebp - pushl_cfi_reg ebx - pushl_cfi_reg esi - pushl_cfi_reg edi + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi movl %eax, %esi movl %edx, %edi @@ -79,12 +68,11 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg edi - popl_cfi_reg esi - popl_cfi_reg ebx - popl_cfi_reg ebp + popl %edi + popl %esi + popl %ebx + popl %ebp ret - CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) .endm @@ -93,8 +81,7 @@ addsub_return sub sub sbb .macro incdec_return func ins insc ENTRY(atomic64_\func\()_return_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -109,9 +96,8 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) .endm @@ -119,8 +105,7 @@ incdec_return inc add adc incdec_return dec sub sbb ENTRY(atomic64_dec_if_positive_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -136,18 +121,16 @@ ENTRY(atomic64_dec_if_positive_cx8) 2: movl %ebx, %eax movl %ecx, %edx - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_dec_if_positive_cx8) ENTRY(atomic64_add_unless_cx8) - CFI_STARTPROC - pushl_cfi_reg ebp - pushl_cfi_reg 
ebx + pushl %ebp + pushl %ebx /* these just push these two parameters on the stack */ - pushl_cfi_reg edi - pushl_cfi_reg ecx + pushl %edi + pushl %ecx movl %eax, %ebp movl %edx, %edi @@ -168,21 +151,18 @@ ENTRY(atomic64_add_unless_cx8) movl $1, %eax 3: addl $8, %esp - CFI_ADJUST_CFA_OFFSET -8 - popl_cfi_reg ebx - popl_cfi_reg ebp + popl %ebx + popl %ebp ret 4: cmpl %edx, 4(%esp) jne 2b xorl %eax, %eax jmp 3b - CFI_ENDPROC ENDPROC(atomic64_add_unless_cx8) ENTRY(atomic64_inc_not_zero_cx8) - CFI_STARTPROC - pushl_cfi_reg ebx + pushl %ebx read64 %esi 1: @@ -199,7 +179,6 @@ ENTRY(atomic64_inc_not_zero_cx8) movl $1, %eax 3: - popl_cfi_reg ebx + popl %ebx ret - CFI_ENDPROC ENDPROC(atomic64_inc_not_zero_cx8) diff --git a/kernel/arch/x86/lib/checksum_32.S b/kernel/arch/x86/lib/checksum_32.S index 9bc944a91..c1e623209 100644 --- a/kernel/arch/x86/lib/checksum_32.S +++ b/kernel/arch/x86/lib/checksum_32.S @@ -26,7 +26,6 @@ */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/errno.h> #include <asm/asm.h> @@ -50,9 +49,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) * alignment for the unrolled loop. */ ENTRY(csum_partial) - CFI_STARTPROC - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %esi + pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: unsigned char *buff @@ -129,10 +127,9 @@ ENTRY(csum_partial) jz 8f roll $8, %eax 8: - popl_cfi_reg ebx - popl_cfi_reg esi + popl %ebx + popl %esi ret - CFI_ENDPROC ENDPROC(csum_partial) #else @@ -140,9 +137,8 @@ ENDPROC(csum_partial) /* Version for PentiumII/PPro */ ENTRY(csum_partial) - CFI_STARTPROC - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %esi + pushl %ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: const unsigned char *buf @@ -249,10 +245,9 @@ ENTRY(csum_partial) jz 90f roll $8, %eax 90: - popl_cfi_reg ebx - popl_cfi_reg esi + popl %ebx + popl %esi ret - CFI_ENDPROC ENDPROC(csum_partial) #endif @@ -287,12 +282,10 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, #define FP 12 ENTRY(csum_partial_copy_generic) - CFI_STARTPROC subl $4,%esp - CFI_ADJUST_CFA_OFFSET 4 - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg ebx + pushl %edi + pushl %esi + pushl %ebx movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len movl ARGBASE+4(%esp),%esi # src @@ -401,12 +394,11 @@ DST( movb %cl, (%edi) ) .previous - popl_cfi_reg ebx - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi %ecx # equivalent to addl $4,%esp + popl %ebx + popl %esi + popl %edi + popl %ecx # equivalent to addl $4,%esp ret - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) #else @@ -426,10 +418,9 @@ ENDPROC(csum_partial_copy_generic) #define ARGBASE 12 ENTRY(csum_partial_copy_generic) - CFI_STARTPROC - pushl_cfi_reg ebx - pushl_cfi_reg edi - pushl_cfi_reg esi + pushl %ebx + pushl %edi + pushl %esi movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst movl ARGBASE+12(%esp),%ecx #len @@ -489,11 +480,10 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi_reg ebx + popl %esi + popl %edi + popl %ebx ret - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) #undef ROUND diff --git a/kernel/arch/x86/lib/clear_page_64.S b/kernel/arch/x86/lib/clear_page_64.S index e67e579c9..a2fe51b00 100644 --- a/kernel/arch/x86/lib/clear_page_64.S +++ b/kernel/arch/x86/lib/clear_page_64.S @@ -1,5 +1,4 @@ #include 
<linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/cpufeature.h> #include <asm/alternative-asm.h> @@ -15,7 +14,6 @@ * %rdi - page */ ENTRY(clear_page) - CFI_STARTPROC ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ "jmp clear_page_c_e", X86_FEATURE_ERMS @@ -24,11 +22,9 @@ ENTRY(clear_page) xorl %eax,%eax rep stosq ret - CFI_ENDPROC ENDPROC(clear_page) ENTRY(clear_page_orig) - CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx @@ -48,14 +44,11 @@ ENTRY(clear_page_orig) jnz .Lloop nop ret - CFI_ENDPROC ENDPROC(clear_page_orig) ENTRY(clear_page_c_e) - CFI_STARTPROC movl $4096,%ecx xorl %eax,%eax rep stosb ret - CFI_ENDPROC ENDPROC(clear_page_c_e) diff --git a/kernel/arch/x86/lib/cmpxchg16b_emu.S b/kernel/arch/x86/lib/cmpxchg16b_emu.S index 40a172541..9b330242e 100644 --- a/kernel/arch/x86/lib/cmpxchg16b_emu.S +++ b/kernel/arch/x86/lib/cmpxchg16b_emu.S @@ -6,7 +6,6 @@ * */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/percpu.h> .text @@ -21,7 +20,6 @@ * %al : Operation successful */ ENTRY(this_cpu_cmpxchg16b_emu) -CFI_STARTPROC # # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not @@ -32,7 +30,7 @@ CFI_STARTPROC # *atomic* on a single cpu (as provided by the this_cpu_xx class of # macros). # - pushfq_cfi + pushfq cli cmpq PER_CPU_VAR((%rsi)), %rax @@ -43,17 +41,13 @@ CFI_STARTPROC movq %rbx, PER_CPU_VAR((%rsi)) movq %rcx, PER_CPU_VAR(8(%rsi)) - CFI_REMEMBER_STATE - popfq_cfi + popfq mov $1, %al ret - CFI_RESTORE_STATE .Lnot_same: - popfq_cfi + popfq xor %al,%al ret -CFI_ENDPROC - ENDPROC(this_cpu_cmpxchg16b_emu) diff --git a/kernel/arch/x86/lib/cmpxchg8b_emu.S b/kernel/arch/x86/lib/cmpxchg8b_emu.S index b4807fce5..ad5349778 100644 --- a/kernel/arch/x86/lib/cmpxchg8b_emu.S +++ b/kernel/arch/x86/lib/cmpxchg8b_emu.S @@ -7,7 +7,6 @@ */ #include <linux/linkage.h> -#include <asm/dwarf2.h> .text @@ -20,14 +19,13 @@ * %ecx : high 32 bits of new value */ ENTRY(cmpxchg8b_emu) -CFI_STARTPROC # # Emulate 'cmpxchg8b (%esi)' on UP except we don't # set the whole ZF thing (caller will just compare # eax:edx with the expected value) # - pushfl_cfi + pushfl cli cmpl (%esi), %eax @@ -38,18 +36,15 @@ CFI_STARTPROC movl %ebx, (%esi) movl %ecx, 4(%esi) - CFI_REMEMBER_STATE - popfl_cfi + popfl ret - CFI_RESTORE_STATE .Lnot_same: movl (%esi), %eax .Lhalf_same: movl 4(%esi), %edx - popfl_cfi + popfl ret -CFI_ENDPROC ENDPROC(cmpxchg8b_emu) diff --git a/kernel/arch/x86/lib/copy_page_64.S b/kernel/arch/x86/lib/copy_page_64.S index 8239dbcbf..009f98216 100644 --- a/kernel/arch/x86/lib/copy_page_64.S +++ b/kernel/arch/x86/lib/copy_page_64.S @@ -1,7 +1,6 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/cpufeature.h> #include <asm/alternative-asm.h> @@ -13,22 +12,16 @@ */ ALIGN ENTRY(copy_page) - CFI_STARTPROC ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret - CFI_ENDPROC ENDPROC(copy_page) ENTRY(copy_page_regs) - CFI_STARTPROC subq $2*8, %rsp - CFI_ADJUST_CFA_OFFSET 2*8 movq %rbx, (%rsp) - CFI_REL_OFFSET rbx, 0 movq %r12, 1*8(%rsp) - CFI_REL_OFFSET r12, 1*8 movl $(4096/64)-5, %ecx .p2align 4 @@ -87,11 +80,7 @@ ENTRY(copy_page_regs) jnz .Loop2 movq (%rsp), %rbx - CFI_RESTORE rbx movq 1*8(%rsp), %r12 - CFI_RESTORE r12 addq $2*8, %rsp - CFI_ADJUST_CFA_OFFSET -2*8 ret - CFI_ENDPROC ENDPROC(copy_page_regs) diff --git a/kernel/arch/x86/lib/copy_user_64.S b/kernel/arch/x86/lib/copy_user_64.S index fa997dfae..27f89c79a 100644 --- 
a/kernel/arch/x86/lib/copy_user_64.S +++ b/kernel/arch/x86/lib/copy_user_64.S @@ -7,7 +7,6 @@ */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/current.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -16,33 +15,8 @@ #include <asm/asm.h> #include <asm/smap.h> - .macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE(100b,103b) - _ASM_EXTABLE(101b,103b) - .endm - /* Standard copy_to_user with segment limit checking */ ENTRY(_copy_to_user) - CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx addq %rdx,%rcx @@ -54,12 +28,10 @@ ENTRY(_copy_to_user) X86_FEATURE_REP_GOOD, \ "jmp copy_user_enhanced_fast_string", \ X86_FEATURE_ERMS - CFI_ENDPROC ENDPROC(_copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(_copy_from_user) - CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx addq %rdx,%rcx @@ -71,14 +43,12 @@ ENTRY(_copy_from_user) X86_FEATURE_REP_GOOD, \ "jmp copy_user_enhanced_fast_string", \ X86_FEATURE_ERMS - CFI_ENDPROC ENDPROC(_copy_from_user) .section .fixup,"ax" /* must zero dest */ ENTRY(bad_from_user) bad_from_user: - CFI_STARTPROC movl %edx,%ecx xorl %eax,%eax rep @@ -86,7 +56,6 @@ bad_from_user: bad_to_user: movl %edx,%eax ret - CFI_ENDPROC ENDPROC(bad_from_user) .previous @@ -104,7 +73,6 @@ ENDPROC(bad_from_user) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_unrolled) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 20f /* less then 8 bytes, go to byte copy loop */ @@ -186,7 +154,6 @@ ENTRY(copy_user_generic_unrolled) _ASM_EXTABLE(19b,40b) _ASM_EXTABLE(21b,50b) _ASM_EXTABLE(22b,50b) - CFI_ENDPROC ENDPROC(copy_user_generic_unrolled) /* Some CPUs run faster using the string copy instructions. @@ -208,7 +175,6 @@ ENDPROC(copy_user_generic_unrolled) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_string) - CFI_STARTPROC ASM_STAC cmpl $8,%edx jb 2f /* less than 8 bytes, go to byte copy loop */ @@ -233,7 +199,6 @@ ENTRY(copy_user_generic_string) _ASM_EXTABLE(1b,11b) _ASM_EXTABLE(3b,12b) - CFI_ENDPROC ENDPROC(copy_user_generic_string) /* @@ -249,7 +214,6 @@ ENDPROC(copy_user_generic_string) * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_enhanced_fast_string) - CFI_STARTPROC ASM_STAC movl %edx,%ecx 1: rep @@ -264,5 +228,154 @@ ENTRY(copy_user_enhanced_fast_string) .previous _ASM_EXTABLE(1b,12b) - CFI_ENDPROC ENDPROC(copy_user_enhanced_fast_string) + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * This will force destination out of cache for more performance. + * + * Note: Cached memory copy is used when destination or size is not + * naturally aligned. That is: + * - Require 8-byte alignment when size is 8 bytes or larger. + * - Require 4-byte alignment when size is 4 bytes. 
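The comment above spells out when __copy_user_nocache may fall back to cached copies; the body that follows carves the length into 64-byte, 8-byte, 4-byte and byte chunks with shrl/andl pairs. A small userspace sketch of that arithmetic, assuming the same staging (the fault paths are simplified away):

#include <stdio.h>
#include <stdint.h>

static void split_copy(uintptr_t dst, size_t len)
{
        size_t head = 0, blk64, q8, d4, tail;

        if (len >= 8 && (dst & 7))          /* "cache" copy bytes until dst is 8-byte aligned */
                head = 8 - (dst & 7);

        len  -= head;
        blk64 = len >> 6;  len &= 63;       /* 4x8-byte non-temporal loops */
        q8    = len >> 3;  len &= 7;        /* single 8-byte movnti copies */
        d4    = (len >= 4 && !((dst + head) & 3)) ? 1 : 0;  /* one 4-byte movnti */
        tail  = len - 4 * d4;               /* remaining bytes, cached movb */

        printf("head=%zu 64B=%zu 8B=%zu 4B=%zu tail=%zu\n",
               head, blk64, q8, d4, tail);
}

int main(void)
{
        split_copy(0x1003, 200);   /* misaligned start, large copy */
        split_copy(0x2000, 4);     /* aligned 4-byte copy stays nocache */
        return 0;
}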
+ */ +ENTRY(__copy_user_nocache) + ASM_STAC + + /* If size is less than 8 bytes, go to 4-byte copy */ + cmpl $8,%edx + jb .L_4b_nocache_copy_entry + + /* If destination is not 8-byte aligned, "cache" copy to align it */ + ALIGN_DESTINATION + + /* Set 4x8-byte copy count and remainder */ + movl %edx,%ecx + andl $63,%edx + shrl $6,%ecx + jz .L_8b_nocache_copy_entry /* jump if count is 0 */ + + /* Perform 4x8-byte nocache loop-copy */ +.L_4x8b_nocache_copy_loop: +1: movq (%rsi),%r8 +2: movq 1*8(%rsi),%r9 +3: movq 2*8(%rsi),%r10 +4: movq 3*8(%rsi),%r11 +5: movnti %r8,(%rdi) +6: movnti %r9,1*8(%rdi) +7: movnti %r10,2*8(%rdi) +8: movnti %r11,3*8(%rdi) +9: movq 4*8(%rsi),%r8 +10: movq 5*8(%rsi),%r9 +11: movq 6*8(%rsi),%r10 +12: movq 7*8(%rsi),%r11 +13: movnti %r8,4*8(%rdi) +14: movnti %r9,5*8(%rdi) +15: movnti %r10,6*8(%rdi) +16: movnti %r11,7*8(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + decl %ecx + jnz .L_4x8b_nocache_copy_loop + + /* Set 8-byte copy count and remainder */ +.L_8b_nocache_copy_entry: + movl %edx,%ecx + andl $7,%edx + shrl $3,%ecx + jz .L_4b_nocache_copy_entry /* jump if count is 0 */ + + /* Perform 8-byte nocache loop-copy */ +.L_8b_nocache_copy_loop: +20: movq (%rsi),%r8 +21: movnti %r8,(%rdi) + leaq 8(%rsi),%rsi + leaq 8(%rdi),%rdi + decl %ecx + jnz .L_8b_nocache_copy_loop + + /* If no byte left, we're done */ +.L_4b_nocache_copy_entry: + andl %edx,%edx + jz .L_finish_copy + + /* If destination is not 4-byte aligned, go to byte copy: */ + movl %edi,%ecx + andl $3,%ecx + jnz .L_1b_cache_copy_entry + + /* Set 4-byte copy count (1 or 0) and remainder */ + movl %edx,%ecx + andl $3,%edx + shrl $2,%ecx + jz .L_1b_cache_copy_entry /* jump if count is 0 */ + + /* Perform 4-byte nocache copy: */ +30: movl (%rsi),%r8d +31: movnti %r8d,(%rdi) + leaq 4(%rsi),%rsi + leaq 4(%rdi),%rdi + + /* If no bytes left, we're done: */ + andl %edx,%edx + jz .L_finish_copy + + /* Perform byte "cache" loop-copy for the remainder */ +.L_1b_cache_copy_entry: + movl %edx,%ecx +.L_1b_cache_copy_loop: +40: movb (%rsi),%al +41: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_1b_cache_copy_loop + + /* Finished copying; fence the prior stores */ +.L_finish_copy: + xorl %eax,%eax + ASM_CLAC + sfence + ret + + .section .fixup,"ax" +.L_fixup_4x8b_copy: + shll $6,%ecx + addl %ecx,%edx + jmp .L_fixup_handle_tail +.L_fixup_8b_copy: + lea (%rdx,%rcx,8),%rdx + jmp .L_fixup_handle_tail +.L_fixup_4b_copy: + lea (%rdx,%rcx,4),%rdx + jmp .L_fixup_handle_tail +.L_fixup_1b_copy: + movl %ecx,%edx +.L_fixup_handle_tail: + sfence + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE(1b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(2b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(3b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(4b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(5b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(6b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(7b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(8b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(9b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(10b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(11b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(12b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(13b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(14b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(15b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(16b,.L_fixup_4x8b_copy) + _ASM_EXTABLE(20b,.L_fixup_8b_copy) + _ASM_EXTABLE(21b,.L_fixup_8b_copy) + _ASM_EXTABLE(30b,.L_fixup_4b_copy) + _ASM_EXTABLE(31b,.L_fixup_4b_copy) + _ASM_EXTABLE(40b,.L_fixup_1b_copy) + _ASM_EXTABLE(41b,.L_fixup_1b_copy) +ENDPROC(__copy_user_nocache) diff --git a/kernel/arch/x86/lib/copy_user_nocache_64.S 
b/kernel/arch/x86/lib/copy_user_nocache_64.S deleted file mode 100644 index 6a4f43c2d..000000000 --- a/kernel/arch/x86/lib/copy_user_nocache_64.S +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com> - * Copyright 2002 Andi Kleen, SuSE Labs. - * Subject to the GNU Public License v2. - * - * Functions to copy from and to user space. - */ - -#include <linux/linkage.h> -#include <asm/dwarf2.h> - -#define FIX_ALIGNMENT 1 - -#include <asm/current.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/asm.h> -#include <asm/smap.h> - - .macro ALIGN_DESTINATION -#ifdef FIX_ALIGNMENT - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE(100b,103b) - _ASM_EXTABLE(101b,103b) -#endif - .endm - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination/source out of cache for more performance. - */ -ENTRY(__copy_user_nocache) - CFI_STARTPROC - ASM_STAC - cmpl $8,%edx - jb 20f /* less then 8 bytes, go to byte copy loop */ - ALIGN_DESTINATION - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz 17f -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movnti %r8,(%rdi) -6: movnti %r9,1*8(%rdi) -7: movnti %r10,2*8(%rdi) -8: movnti %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movnti %r8,4*8(%rdi) -14: movnti %r9,5*8(%rdi) -15: movnti %r10,6*8(%rdi) -16: movnti %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz 1b -17: movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz 20f -18: movq (%rsi),%r8 -19: movnti %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz 18b -20: andl %edx,%edx - jz 23f - movl %edx,%ecx -21: movb (%rsi),%al -22: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 21b -23: xorl %eax,%eax - ASM_CLAC - sfence - ret - - .section .fixup,"ax" -30: shll $6,%ecx - addl %ecx,%edx - jmp 60f -40: lea (%rdx,%rcx,8),%rdx - jmp 60f -50: movl %ecx,%edx -60: sfence - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE(1b,30b) - _ASM_EXTABLE(2b,30b) - _ASM_EXTABLE(3b,30b) - _ASM_EXTABLE(4b,30b) - _ASM_EXTABLE(5b,30b) - _ASM_EXTABLE(6b,30b) - _ASM_EXTABLE(7b,30b) - _ASM_EXTABLE(8b,30b) - _ASM_EXTABLE(9b,30b) - _ASM_EXTABLE(10b,30b) - _ASM_EXTABLE(11b,30b) - _ASM_EXTABLE(12b,30b) - _ASM_EXTABLE(13b,30b) - _ASM_EXTABLE(14b,30b) - _ASM_EXTABLE(15b,30b) - _ASM_EXTABLE(16b,30b) - _ASM_EXTABLE(18b,40b) - _ASM_EXTABLE(19b,40b) - _ASM_EXTABLE(21b,50b) - _ASM_EXTABLE(22b,50b) - CFI_ENDPROC -ENDPROC(__copy_user_nocache) diff --git a/kernel/arch/x86/lib/csum-copy_64.S b/kernel/arch/x86/lib/csum-copy_64.S index 973418296..7e48807b2 100644 --- a/kernel/arch/x86/lib/csum-copy_64.S +++ b/kernel/arch/x86/lib/csum-copy_64.S @@ -6,7 +6,6 @@ * for more details. No warranty for anything given at all. 
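Both the new .L_fixup_* labels and the numeric fixups in the file deleted above answer the same question after a fault: how many bytes were not copied. The loop counter still holds the number of whole blocks left and %edx the sub-block remainder, so each fixup just rescales the count and adds. A compact sketch of that calculation (the comments refer back to the fixup labels):

#include <stdio.h>

static unsigned int tail_after_fault(unsigned int ecx, unsigned int edx,
                                     unsigned int block_size)
{
        switch (block_size) {
        case 64: return (ecx << 6) + edx;   /* .L_fixup_4x8b_copy: shll $6,%ecx */
        case 8:  return edx + ecx * 8;      /* .L_fixup_8b_copy: lea (%rdx,%rcx,8) */
        case 4:  return edx + ecx * 4;      /* .L_fixup_4b_copy */
        default: return ecx;                /* .L_fixup_1b_copy: movl %ecx,%edx */
        }
}

int main(void)
{
        /* Fault with 3 of the 64-byte blocks left and 17 trailing bytes:
         * copy_user_handle_tail() is told 3*64 + 17 = 209 bytes remain. */
        printf("%u\n", tail_after_fault(3, 17, 64));
        return 0;
}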
*/ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/errno.h> #include <asm/asm.h> @@ -47,23 +46,16 @@ ENTRY(csum_partial_copy_generic) - CFI_STARTPROC cmpl $3*64, %edx jle .Lignore .Lignore: subq $7*8, %rsp - CFI_ADJUST_CFA_OFFSET 7*8 movq %rbx, 2*8(%rsp) - CFI_REL_OFFSET rbx, 2*8 movq %r12, 3*8(%rsp) - CFI_REL_OFFSET r12, 3*8 movq %r14, 4*8(%rsp) - CFI_REL_OFFSET r14, 4*8 movq %r13, 5*8(%rsp) - CFI_REL_OFFSET r13, 5*8 movq %rbp, 6*8(%rsp) - CFI_REL_OFFSET rbp, 6*8 movq %r8, (%rsp) movq %r9, 1*8(%rsp) @@ -206,22 +198,14 @@ ENTRY(csum_partial_copy_generic) addl %ebx, %eax adcl %r9d, %eax /* carry */ - CFI_REMEMBER_STATE .Lende: movq 2*8(%rsp), %rbx - CFI_RESTORE rbx movq 3*8(%rsp), %r12 - CFI_RESTORE r12 movq 4*8(%rsp), %r14 - CFI_RESTORE r14 movq 5*8(%rsp), %r13 - CFI_RESTORE r13 movq 6*8(%rsp), %rbp - CFI_RESTORE rbp addq $7*8, %rsp - CFI_ADJUST_CFA_OFFSET -7*8 ret - CFI_RESTORE_STATE /* Exception handlers. Very simple, zeroing is done in the wrappers */ .Lbad_source: @@ -237,5 +221,4 @@ ENTRY(csum_partial_copy_generic) jz .Lende movl $-EFAULT, (%rax) jmp .Lende - CFI_ENDPROC ENDPROC(csum_partial_copy_generic) diff --git a/kernel/arch/x86/lib/delay.c b/kernel/arch/x86/lib/delay.c index 39d6a3db0..e912b2f6d 100644 --- a/kernel/arch/x86/lib/delay.c +++ b/kernel/arch/x86/lib/delay.c @@ -20,6 +20,7 @@ #include <asm/processor.h> #include <asm/delay.h> #include <asm/timer.h> +#include <asm/mwait.h> #ifdef CONFIG_SMP # include <asm/smp.h> @@ -49,16 +50,14 @@ static void delay_loop(unsigned long loops) /* TSC based delay: */ static void delay_tsc(unsigned long __loops) { - u32 bclock, now, loops = __loops; + u64 bclock, now, loops = __loops; int cpu; preempt_disable(); cpu = smp_processor_id(); - rdtsc_barrier(); - rdtscl(bclock); + bclock = rdtsc_ordered(); for (;;) { - rdtsc_barrier(); - rdtscl(now); + now = rdtsc_ordered(); if ((now - bclock) >= loops) break; @@ -79,14 +78,51 @@ static void delay_tsc(unsigned long __loops) if (unlikely(cpu != smp_processor_id())) { loops -= (now - bclock); cpu = smp_processor_id(); - rdtsc_barrier(); - rdtscl(bclock); + bclock = rdtsc_ordered(); } } preempt_enable(); } /* + * On some AMD platforms, MWAITX has a configurable 32-bit timer, that + * counts with TSC frequency. The input value is the loop of the + * counter, it will exit when the timer expires. + */ +static void delay_mwaitx(unsigned long __loops) +{ + u64 start, end, delay, loops = __loops; + + start = rdtsc_ordered(); + + for (;;) { + delay = min_t(u64, MWAITX_MAX_LOOPS, loops); + + /* + * Use cpu_tss as a cacheline-aligned, seldomly + * accessed per-cpu variable as the monitor target. + */ + __monitorx(this_cpu_ptr(&cpu_tss), 0, 0); + + /* + * AMD, like Intel, supports the EAX hint and EAX=0xf + * means, do not enter any deep C-state and we use it + * here in delay() to minimize wakeup latency. 
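delay_mwaitx() above has to respect a 32-bit hardware timer, so a long delay is issued in MWAITX_MAX_LOOPS-sized pieces and the cycles that actually elapsed are subtracted each round. A userspace sketch of the same chunking loop; read_tsc() and wait_cycles() are fake stand-ins for rdtsc_ordered() and __monitorx()/__mwaitx():

#include <stdint.h>
#include <stdio.h>

#define MWAITX_MAX_LOOPS 0xffffffffULL   /* 32-bit hardware timer limit */

/* Pretend the CPU slept for exactly the requested number of TSC cycles. */
static uint64_t fake_tsc;
static void wait_cycles(uint64_t n) { fake_tsc += n; }
static uint64_t read_tsc(void)      { return fake_tsc; }

static void delay_cycles(uint64_t loops)
{
        uint64_t start = read_tsc(), end, delay;

        for (;;) {
                delay = loops < MWAITX_MAX_LOOPS ? loops : MWAITX_MAX_LOOPS;
                wait_cycles(delay);     /* __mwaitx(..., delay, timer enable) */
                end = read_tsc();

                if (loops <= end - start)
                        break;
                loops -= end - start;
                start = end;
        }
}

int main(void)
{
        delay_cycles(10ULL << 32);      /* needs several 32-bit chunks */
        printf("slept %llu cycles\n", (unsigned long long)fake_tsc);
        return 0;
}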
+ */ + __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); + + end = rdtsc_ordered(); + + if (loops <= end - start) + break; + + loops -= end - start; + + start = end; + } +} + +/* * Since we calibrate only once at boot, this * function should be set once at boot and not changed */ @@ -94,13 +130,19 @@ static void (*delay_fn)(unsigned long) = delay_loop; void use_tsc_delay(void) { - delay_fn = delay_tsc; + if (delay_fn == delay_loop) + delay_fn = delay_tsc; +} + +void use_mwaitx_delay(void) +{ + delay_fn = delay_mwaitx; } int read_current_timer(unsigned long *timer_val) { if (delay_fn == delay_tsc) { - rdtscll(*timer_val); + *timer_val = rdtsc(); return 0; } return -1; diff --git a/kernel/arch/x86/lib/getuser.S b/kernel/arch/x86/lib/getuser.S index a45123596..46668cda4 100644 --- a/kernel/arch/x86/lib/getuser.S +++ b/kernel/arch/x86/lib/getuser.S @@ -26,7 +26,6 @@ */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/page_types.h> #include <asm/errno.h> #include <asm/asm-offsets.h> @@ -36,7 +35,6 @@ .text ENTRY(__get_user_1) - CFI_STARTPROC GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user @@ -45,11 +43,9 @@ ENTRY(__get_user_1) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_1) ENTRY(__get_user_2) - CFI_STARTPROC add $1,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) @@ -60,11 +56,9 @@ ENTRY(__get_user_2) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_2) ENTRY(__get_user_4) - CFI_STARTPROC add $3,%_ASM_AX jc bad_get_user GET_THREAD_INFO(%_ASM_DX) @@ -75,11 +69,9 @@ ENTRY(__get_user_4) xor %eax,%eax ASM_CLAC ret - CFI_ENDPROC ENDPROC(__get_user_4) ENTRY(__get_user_8) - CFI_STARTPROC #ifdef CONFIG_X86_64 add $7,%_ASM_AX jc bad_get_user @@ -104,28 +96,23 @@ ENTRY(__get_user_8) ASM_CLAC ret #endif - CFI_ENDPROC ENDPROC(__get_user_8) bad_get_user: - CFI_STARTPROC xor %edx,%edx mov $(-EFAULT),%_ASM_AX ASM_CLAC ret - CFI_ENDPROC END(bad_get_user) #ifdef CONFIG_X86_32 bad_get_user_8: - CFI_STARTPROC xor %edx,%edx xor %ecx,%ecx mov $(-EFAULT),%_ASM_AX ASM_CLAC ret - CFI_ENDPROC END(bad_get_user_8) #endif diff --git a/kernel/arch/x86/lib/iomap_copy_64.S b/kernel/arch/x86/lib/iomap_copy_64.S index 05a95e713..33147fef3 100644 --- a/kernel/arch/x86/lib/iomap_copy_64.S +++ b/kernel/arch/x86/lib/iomap_copy_64.S @@ -16,15 +16,12 @@ */ #include <linux/linkage.h> -#include <asm/dwarf2.h> /* * override generic version in lib/iomap_copy.c */ ENTRY(__iowrite32_copy) - CFI_STARTPROC movl %edx,%ecx rep movsd ret - CFI_ENDPROC ENDPROC(__iowrite32_copy) diff --git a/kernel/arch/x86/lib/memcpy_64.S b/kernel/arch/x86/lib/memcpy_64.S index b046664f5..16698bba8 100644 --- a/kernel/arch/x86/lib/memcpy_64.S +++ b/kernel/arch/x86/lib/memcpy_64.S @@ -2,7 +2,6 @@ #include <linux/linkage.h> #include <asm/cpufeature.h> -#include <asm/dwarf2.h> #include <asm/alternative-asm.h> /* @@ -53,7 +52,6 @@ ENTRY(memcpy_erms) ENDPROC(memcpy_erms) ENTRY(memcpy_orig) - CFI_STARTPROC movq %rdi, %rax cmpq $0x20, %rdx @@ -178,5 +176,4 @@ ENTRY(memcpy_orig) .Lend: retq - CFI_ENDPROC ENDPROC(memcpy_orig) diff --git a/kernel/arch/x86/lib/memmove_64.S b/kernel/arch/x86/lib/memmove_64.S index 0f8a0d033..ca2afdd6d 100644 --- a/kernel/arch/x86/lib/memmove_64.S +++ b/kernel/arch/x86/lib/memmove_64.S @@ -6,7 +6,6 @@ * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/cpufeature.h> #include <asm/alternative-asm.h> @@ -27,7 +26,6 @@ ENTRY(memmove) ENTRY(__memmove) - CFI_STARTPROC /* Handle more 32 
bytes in loop */ mov %rdi, %rax @@ -207,6 +205,5 @@ ENTRY(__memmove) movb %r11b, (%rdi) 13: retq - CFI_ENDPROC ENDPROC(__memmove) ENDPROC(memmove) diff --git a/kernel/arch/x86/lib/memset_64.S b/kernel/arch/x86/lib/memset_64.S index 93118fb23..2661fad05 100644 --- a/kernel/arch/x86/lib/memset_64.S +++ b/kernel/arch/x86/lib/memset_64.S @@ -1,7 +1,6 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/cpufeature.h> #include <asm/alternative-asm.h> @@ -66,7 +65,6 @@ ENTRY(memset_erms) ENDPROC(memset_erms) ENTRY(memset_orig) - CFI_STARTPROC movq %rdi,%r10 /* expand byte value */ @@ -78,7 +76,6 @@ ENTRY(memset_orig) movl %edi,%r9d andl $7,%r9d jnz .Lbad_alignment - CFI_REMEMBER_STATE .Lafter_bad_alignment: movq %rdx,%rcx @@ -128,7 +125,6 @@ ENTRY(memset_orig) movq %r10,%rax ret - CFI_RESTORE_STATE .Lbad_alignment: cmpq $7,%rdx jbe .Lhandle_7 @@ -139,5 +135,4 @@ ENTRY(memset_orig) subq %r8,%rdx jmp .Lafter_bad_alignment .Lfinal: - CFI_ENDPROC ENDPROC(memset_orig) diff --git a/kernel/arch/x86/lib/mmx_32.c b/kernel/arch/x86/lib/mmx_32.c index c9f2d9ba8..e5e3ed8dc 100644 --- a/kernel/arch/x86/lib/mmx_32.c +++ b/kernel/arch/x86/lib/mmx_32.c @@ -22,7 +22,7 @@ #include <linux/sched.h> #include <linux/types.h> -#include <asm/i387.h> +#include <asm/fpu/api.h> #include <asm/asm.h> void *_mmx_memcpy(void *to, const void *from, size_t len) diff --git a/kernel/arch/x86/lib/msr-reg.S b/kernel/arch/x86/lib/msr-reg.S index 3ca5218fb..c81556409 100644 --- a/kernel/arch/x86/lib/msr-reg.S +++ b/kernel/arch/x86/lib/msr-reg.S @@ -1,6 +1,5 @@ #include <linux/linkage.h> #include <linux/errno.h> -#include <asm/dwarf2.h> #include <asm/asm.h> #include <asm/msr.h> @@ -13,9 +12,8 @@ */ .macro op_safe_regs op ENTRY(\op\()_safe_regs) - CFI_STARTPROC - pushq_cfi_reg rbx - pushq_cfi_reg rbp + pushq %rbx + pushq %rbp movq %rdi, %r10 /* Save pointer */ xorl %r11d, %r11d /* Return value */ movl (%rdi), %eax @@ -25,7 +23,6 @@ ENTRY(\op\()_safe_regs) movl 20(%rdi), %ebp movl 24(%rdi), %esi movl 28(%rdi), %edi - CFI_REMEMBER_STATE 1: \op 2: movl %eax, (%r10) movl %r11d, %eax /* Return value */ @@ -35,16 +32,14 @@ ENTRY(\op\()_safe_regs) movl %ebp, 20(%r10) movl %esi, 24(%r10) movl %edi, 28(%r10) - popq_cfi_reg rbp - popq_cfi_reg rbx + popq %rbp + popq %rbx ret 3: - CFI_RESTORE_STATE movl $-EIO, %r11d jmp 2b _ASM_EXTABLE(1b, 3b) - CFI_ENDPROC ENDPROC(\op\()_safe_regs) .endm @@ -52,13 +47,12 @@ ENDPROC(\op\()_safe_regs) .macro op_safe_regs op ENTRY(\op\()_safe_regs) - CFI_STARTPROC - pushl_cfi_reg ebx - pushl_cfi_reg ebp - pushl_cfi_reg esi - pushl_cfi_reg edi - pushl_cfi $0 /* Return value */ - pushl_cfi %eax + pushl %ebx + pushl %ebp + pushl %esi + pushl %edi + pushl $0 /* Return value */ + pushl %eax movl 4(%eax), %ecx movl 8(%eax), %edx movl 12(%eax), %ebx @@ -66,32 +60,28 @@ ENTRY(\op\()_safe_regs) movl 24(%eax), %esi movl 28(%eax), %edi movl (%eax), %eax - CFI_REMEMBER_STATE 1: \op -2: pushl_cfi %eax +2: pushl %eax movl 4(%esp), %eax - popl_cfi (%eax) + popl (%eax) addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 movl %ecx, 4(%eax) movl %edx, 8(%eax) movl %ebx, 12(%eax) movl %ebp, 20(%eax) movl %esi, 24(%eax) movl %edi, 28(%eax) - popl_cfi %eax - popl_cfi_reg edi - popl_cfi_reg esi - popl_cfi_reg ebp - popl_cfi_reg ebx + popl %eax + popl %edi + popl %esi + popl %ebp + popl %ebx ret 3: - CFI_RESTORE_STATE movl $-EIO, 4(%esp) jmp 2b _ASM_EXTABLE(1b, 3b) - CFI_ENDPROC ENDPROC(\op\()_safe_regs) .endm diff --git a/kernel/arch/x86/lib/putuser.S b/kernel/arch/x86/lib/putuser.S index 
fc6ba17a7..e0817a12d 100644 --- a/kernel/arch/x86/lib/putuser.S +++ b/kernel/arch/x86/lib/putuser.S @@ -11,7 +11,6 @@ * return value. */ #include <linux/linkage.h> -#include <asm/dwarf2.h> #include <asm/thread_info.h> #include <asm/errno.h> #include <asm/asm.h> @@ -30,11 +29,9 @@ * as they get called from within inline assembly. */ -#define ENTER CFI_STARTPROC ; \ - GET_THREAD_INFO(%_ASM_BX) +#define ENTER GET_THREAD_INFO(%_ASM_BX) #define EXIT ASM_CLAC ; \ - ret ; \ - CFI_ENDPROC + ret .text ENTRY(__put_user_1) @@ -87,7 +84,6 @@ ENTRY(__put_user_8) ENDPROC(__put_user_8) bad_put_user: - CFI_STARTPROC movl $-EFAULT,%eax EXIT END(bad_put_user) diff --git a/kernel/arch/x86/lib/rwsem.S b/kernel/arch/x86/lib/rwsem.S index 2322abe4d..40027db99 100644 --- a/kernel/arch/x86/lib/rwsem.S +++ b/kernel/arch/x86/lib/rwsem.S @@ -15,7 +15,6 @@ #include <linux/linkage.h> #include <asm/alternative-asm.h> -#include <asm/dwarf2.h> #define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg) #define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l) @@ -34,10 +33,10 @@ */ #define save_common_regs \ - pushl_cfi_reg ecx + pushl %ecx #define restore_common_regs \ - popl_cfi_reg ecx + popl %ecx /* Avoid uglifying the argument copying x86-64 needs to do. */ .macro movq src, dst @@ -64,50 +63,45 @@ */ #define save_common_regs \ - pushq_cfi_reg rdi; \ - pushq_cfi_reg rsi; \ - pushq_cfi_reg rcx; \ - pushq_cfi_reg r8; \ - pushq_cfi_reg r9; \ - pushq_cfi_reg r10; \ - pushq_cfi_reg r11 + pushq %rdi; \ + pushq %rsi; \ + pushq %rcx; \ + pushq %r8; \ + pushq %r9; \ + pushq %r10; \ + pushq %r11 #define restore_common_regs \ - popq_cfi_reg r11; \ - popq_cfi_reg r10; \ - popq_cfi_reg r9; \ - popq_cfi_reg r8; \ - popq_cfi_reg rcx; \ - popq_cfi_reg rsi; \ - popq_cfi_reg rdi + popq %r11; \ + popq %r10; \ + popq %r9; \ + popq %r8; \ + popq %rcx; \ + popq %rsi; \ + popq %rdi #endif /* Fix up special calling conventions */ ENTRY(call_rwsem_down_read_failed) - CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_down_read_failed - __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) - CFI_STARTPROC save_common_regs movq %rax,%rdi call rwsem_down_write_failed restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) - CFI_STARTPROC /* do nothing if still outstanding active readers */ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx) jnz 1f @@ -116,17 +110,14 @@ ENTRY(call_rwsem_wake) call rwsem_wake restore_common_regs 1: ret - CFI_ENDPROC ENDPROC(call_rwsem_wake) ENTRY(call_rwsem_downgrade_wake) - CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(push,) %__ASM_REG(dx) movq %rax,%rdi call rwsem_downgrade_wake - __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) + __ASM_SIZE(pop,) %__ASM_REG(dx) restore_common_regs ret - CFI_ENDPROC ENDPROC(call_rwsem_downgrade_wake) diff --git a/kernel/arch/x86/lib/usercopy.c b/kernel/arch/x86/lib/usercopy.c index ddf9ecb53..e342586db 100644 --- a/kernel/arch/x86/lib/usercopy.c +++ b/kernel/arch/x86/lib/usercopy.c @@ -20,7 +20,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) unsigned long ret; if (__range_not_ok(from, n, TASK_SIZE)) - return 0; + return n; /* * Even though this function is typically called from NMI/IRQ context diff --git a/kernel/arch/x86/lib/x86-opcode-map.txt b/kernel/arch/x86/lib/x86-opcode-map.txt 
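The one-line usercopy.c fix above matters because copy_from_user_nmi() follows the copy_from_user() convention: the return value is the number of bytes not copied. Returning 0 for an invalid range therefore looked like success; returning n tells the caller that nothing was copied. A toy model of how a caller reads that result (copy_in() is invented for the example):

#include <stdio.h>
#include <string.h>

/* Return value: bytes NOT copied, so 0 means success, n means nothing copied. */
static unsigned long copy_in(void *dst, const void *src, unsigned long n,
                             int range_ok)
{
        if (!range_ok)
                return n;       /* the old code returned 0 here: looked like success */
        memcpy(dst, src, n);
        return 0;
}

int main(void)
{
        char buf[8];

        if (copy_in(buf, "payload", 8, 0) != 0)
                puts("bad user pointer: copy rejected");  /* caller can now tell */
        return 0;
}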
index 816488c0b..d388de72e 100644 --- a/kernel/arch/x86/lib/x86-opcode-map.txt +++ b/kernel/arch/x86/lib/x86-opcode-map.txt @@ -353,8 +353,12 @@ AVXcode: 1 17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1) 18: Grp16 (1A) 19: -1a: BNDCL Ev,Gv | BNDCU Ev,Gv | BNDMOV Gv,Ev | BNDLDX Gv,Ev,Gv -1b: BNDCN Ev,Gv | BNDMOV Ev,Gv | BNDMK Gv,Ev | BNDSTX Ev,GV,Gv +# Intel SDM opcode map does not list MPX instructions. For now using Gv for +# bnd registers and Ev for everything else is OK because the instruction +# decoder does not use the information except as an indication that there is +# a ModR/M byte. +1a: BNDCL Gv,Ev (F3) | BNDCU Gv,Ev (F2) | BNDMOV Gv,Ev (66) | BNDLDX Gv,Ev +1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv 1c: 1d: 1e: @@ -732,6 +736,12 @@ bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) # 0x0f 0x38 0xc0-0xff +c8: sha1nexte Vdq,Wdq +c9: sha1msg1 Vdq,Wdq +ca: sha1msg2 Vdq,Wdq +cb: sha256rnds2 Vdq,Wdq +cc: sha256msg1 Vdq,Wdq +cd: sha256msg2 Vdq,Wdq db: VAESIMC Vdq,Wdq (66),(v1) dc: VAESENC Vdq,Hdq,Wdq (66),(v1) dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) @@ -790,6 +800,7 @@ AVXcode: 3 61: vpcmpestri Vdq,Wdq,Ib (66),(v1) 62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) 63: vpcmpistri Vdq,Wdq,Ib (66),(v1) +cc: sha1rnds4 Vdq,Wdq,Ib df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) f0: RORX Gy,Ey,Ib (F2),(v) EndTable @@ -874,7 +885,7 @@ GrpTable: Grp7 2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) 3: LIDT Ms 4: SMSW Mw/Rv -5: +5: rdpkru (110),(11B) | wrpkru (111),(11B) 6: LMSW Ew 7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) EndTable @@ -888,6 +899,9 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq +3: xrstors +4: xsavec +5: xsaves 6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) 7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B) EndTable @@ -932,8 +946,8 @@ GrpTable: Grp15 3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE 5: XRSTOR | lfence (11B) -6: XSAVEOPT | mfence (11B) -7: clflush | sfence (11B) +6: XSAVEOPT | clwb (66) | mfence (11B) +7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B) EndTable GrpTable: Grp16 diff --git a/kernel/arch/x86/math-emu/fpu_aux.c b/kernel/arch/x86/math-emu/fpu_aux.c index dc8adad10..024f6e971 100644 --- a/kernel/arch/x86/math-emu/fpu_aux.c +++ b/kernel/arch/x86/math-emu/fpu_aux.c @@ -30,7 +30,7 @@ static void fclex(void) } /* Needs to be externally visible */ -void finit_soft_fpu(struct i387_soft_struct *soft) +void fpstate_init_soft(struct swregs_state *soft) { struct address *oaddr, *iaddr; memset(soft, 0, sizeof(*soft)); @@ -52,7 +52,7 @@ void finit_soft_fpu(struct i387_soft_struct *soft) void finit(void) { - finit_soft_fpu(¤t->thread.fpu.state->soft); + fpstate_init_soft(¤t->thread.fpu.state.soft); } /* @@ -169,6 +169,76 @@ void fxch_i(void) fpu_tag_word = tag_word; } +static void fcmovCC(void) +{ + /* fcmovCC st(i) */ + int i = FPU_rm; + FPU_REG *st0_ptr = &st(0); + FPU_REG *sti_ptr = &st(i); + long tag_word = fpu_tag_word; + int regnr = top & 7; + int regnri = (top + i) & 7; + u_char sti_tag = (tag_word >> (regnri * 2)) & 3; + + if (sti_tag == TAG_Empty) { + FPU_stack_underflow(); + clear_C1(); + return; + } + reg_copy(sti_ptr, st0_ptr); + tag_word &= ~(3 << (regnr * 2)); + tag_word |= (sti_tag << (regnr * 2)); + fpu_tag_word = tag_word; +} + +void fcmovb(void) +{ + if (FPU_EFLAGS & X86_EFLAGS_CF) + fcmovCC(); +} + +void fcmove(void) +{ + if (FPU_EFLAGS & X86_EFLAGS_ZF) + 
fcmovCC(); +} + +void fcmovbe(void) +{ + if (FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF)) + fcmovCC(); +} + +void fcmovu(void) +{ + if (FPU_EFLAGS & X86_EFLAGS_PF) + fcmovCC(); +} + +void fcmovnb(void) +{ + if (!(FPU_EFLAGS & X86_EFLAGS_CF)) + fcmovCC(); +} + +void fcmovne(void) +{ + if (!(FPU_EFLAGS & X86_EFLAGS_ZF)) + fcmovCC(); +} + +void fcmovnbe(void) +{ + if (!(FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF))) + fcmovCC(); +} + +void fcmovnu(void) +{ + if (!(FPU_EFLAGS & X86_EFLAGS_PF)) + fcmovCC(); +} + void ffree_(void) { /* ffree st(i) */ diff --git a/kernel/arch/x86/math-emu/fpu_emu.h b/kernel/arch/x86/math-emu/fpu_emu.h index 4dae511c8..afbc4d805 100644 --- a/kernel/arch/x86/math-emu/fpu_emu.h +++ b/kernel/arch/x86/math-emu/fpu_emu.h @@ -71,7 +71,7 @@ #include "fpu_system.h" -#include <asm/sigcontext.h> /* for struct _fpstate */ +#include <uapi/asm/sigcontext.h> /* for struct _fpstate */ #include <asm/math_emu.h> #include <linux/linkage.h> diff --git a/kernel/arch/x86/math-emu/fpu_entry.c b/kernel/arch/x86/math-emu/fpu_entry.c index 274a52b11..e945fedf1 100644 --- a/kernel/arch/x86/math-emu/fpu_entry.c +++ b/kernel/arch/x86/math-emu/fpu_entry.c @@ -30,7 +30,7 @@ #include <asm/uaccess.h> #include <asm/traps.h> #include <asm/user.h> -#include <asm/i387.h> +#include <asm/fpu/internal.h> #include "fpu_system.h" #include "fpu_emu.h" @@ -40,49 +40,33 @@ #define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ -#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */ +/* fcmovCC and f(u)comi(p) are enabled if CPUID(1).EDX(15) "cmov" is set */ -/* WARNING: These codes are not documented by Intel in their 80486 manual - and may not work on FPU clones or later Intel FPUs. */ - -/* Changes to support the un-doc codes provided by Linus Torvalds. */ - -#define _d9_d8_ fstp_i /* unofficial code (19) */ -#define _dc_d0_ fcom_st /* unofficial code (14) */ -#define _dc_d8_ fcompst /* unofficial code (1c) */ -#define _dd_c8_ fxch_i /* unofficial code (0d) */ -#define _de_d0_ fcompst /* unofficial code (16) */ -#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */ -#define _df_c8_ fxch_i /* unofficial code (0f) */ -#define _df_d0_ fstp_i /* unofficial code (17) */ -#define _df_d8_ fstp_i /* unofficial code (1f) */ +/* WARNING: "u" entries are not documented by Intel in their 80486 manual + and may not work on FPU clones or later Intel FPUs. + Changes to support them provided by Linus Torvalds. 
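The eight fcmov* wrappers added above differ only in which EFLAGS bits they test and whether the test is negated before calling fcmovCC(). The same mapping written as a table, using the standard x86 CF/PF/ZF bit positions:

#include <stdio.h>

#define FL_CF 0x001
#define FL_PF 0x004
#define FL_ZF 0x040   /* same bit positions as x86 EFLAGS */

struct fcmov_cond { const char *name; unsigned mask; int negate; };

static const struct fcmov_cond conds[] = {
        { "fcmovb",  FL_CF,         0 }, { "fcmovnb",  FL_CF,         1 },
        { "fcmove",  FL_ZF,         0 }, { "fcmovne",  FL_ZF,         1 },
        { "fcmovbe", FL_CF | FL_ZF, 0 }, { "fcmovnbe", FL_CF | FL_ZF, 1 },
        { "fcmovu",  FL_PF,         0 }, { "fcmovnu",  FL_PF,         1 },
};

int main(void)
{
        unsigned eflags = FL_ZF;        /* e.g. the last compare said "equal" */

        for (unsigned i = 0; i < sizeof(conds) / sizeof(conds[0]); i++) {
                int taken = !!(eflags & conds[i].mask) ^ conds[i].negate;
                printf("%-8s %s\n", conds[i].name, taken ? "moves" : "skips");
        }
        return 0;
}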
*/ static FUNC const st_instr_table[64] = { - fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_, - fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_, - fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_, - fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_, - fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, - fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, - fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, - fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, +/* Opcode: d8 d9 da db */ +/* dc dd de df */ +/* c0..7 */ fadd__, fld_i_, fcmovb, fcmovnb, +/* c0..7 */ fadd_i, ffree_, faddp_, ffreep,/*u*/ +/* c8..f */ fmul__, fxch_i, fcmove, fcmovne, +/* c8..f */ fmul_i, fxch_i,/*u*/ fmulp_, fxch_i,/*u*/ +/* d0..7 */ fcom_st, fp_nop, fcmovbe, fcmovnbe, +/* d0..7 */ fcom_st,/*u*/ fst_i_, fcompst,/*u*/ fstp_i,/*u*/ +/* d8..f */ fcompst, fstp_i,/*u*/ fcmovu, fcmovnu, +/* d8..f */ fcompst,/*u*/ fstp_i, fcompp, fstp_i,/*u*/ +/* e0..7 */ fsub__, FPU_etc, __BAD__, finit_, +/* e0..7 */ fsubri, fucom_, fsubrp, fstsw_, +/* e8..f */ fsubr_, fconst, fucompp, fucomi_, +/* e8..f */ fsub_i, fucomp, fsubp_, fucomip, +/* f0..7 */ fdiv__, FPU_triga, __BAD__, fcomi_, +/* f0..7 */ fdivri, __BAD__, fdivrp, fcomip, +/* f8..f */ fdivr_, FPU_trigb, __BAD__, __BAD__, +/* f8..f */ fdiv_i, __BAD__, fdivp_, __BAD__, }; -#else /* Support only documented FPU op-codes */ - -static FUNC const st_instr_table[64] = { - fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__, - fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__, - fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__, - fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__, - fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, - fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, - fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, - fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, -}; - -#endif /* NO_UNDOC_CODE */ - #define _NONE_ 0 /* Take no special action */ #define _REG0_ 1 /* Need to check for not empty st(0) */ #define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */ @@ -94,36 +78,18 @@ static FUNC const st_instr_table[64] = { #define _REGIc 0 /* Compare st(0) and st(rm) */ #define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */ -#ifndef NO_UNDOC_CODE - -/* Un-documented FPU op-codes supported by default. 
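st_instr_table[] above is a 64-entry function-pointer table, and type_table[] in the next hunk is its parallel attribute table: both are laid out by escape byte (d8..df) and register group (c0..7 through f8..f), and the shared index tells the emulator which empty-register checks to run before dispatching. A tiny sketch of that parallel-table pattern; the 4-entry tables and the index value are invented purely for illustration:

#include <stdio.h>

typedef void (*fpu_op)(void);

static void op_add(void) { puts("fadd-like");  }
static void op_cmp(void) { puts("fcom-like");  }
static void op_bad(void) { puts("illegal op"); }

enum { CHK_NONE, CHK_ST0, CHK_BOTH };

static const fpu_op        instr[4] = { op_add, op_cmp, op_bad, op_add };
static const unsigned char type[4]  = { CHK_BOTH, CHK_BOTH, CHK_NONE, CHK_ST0 };

int main(void)
{
        unsigned idx = 1;               /* pretend the decoder produced this */

        if (type[idx] != CHK_NONE)
                puts("check stack registers first");
        instr[idx]();
        return 0;
}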
(see above) */ - static u_char const type_table[64] = { - _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, - _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, - _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, - _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, - _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, - _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ +/* Opcode: d8 d9 da db dc dd de df */ +/* c0..7 */ _REGI_, _NONE_, _REGIn, _REGIn, _REGIi, _REGi_, _REGIp, _REGi_, +/* c8..f */ _REGI_, _REGIn, _REGIn, _REGIn, _REGIi, _REGI_, _REGIp, _REGI_, +/* d0..7 */ _REGIc, _NONE_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_, +/* d8..f */ _REGIc, _REG0_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_, +/* e0..7 */ _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, +/* e8..f */ _REGI_, _NONE_, _REGIc, _REGIc, _REGIi, _REGIc, _REGIp, _REGIc, +/* f0..7 */ _REGI_, _NONE_, _null_, _REGIc, _REGIi, _null_, _REGIp, _REGIc, +/* f8..f */ _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, }; -#else /* Support only documented FPU op-codes */ - -static u_char const type_table[64] = { - _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_, - _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_, - _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_, - _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, - _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ -}; - -#endif /* NO_UNDOC_CODE */ - #ifdef RE_ENTRANT_CHECKING u_char emulating = 0; #endif /* RE_ENTRANT_CHECKING */ @@ -146,13 +112,9 @@ void math_emulate(struct math_emu_info *info) unsigned long code_base = 0; unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ struct desc_struct code_descriptor; + struct fpu *fpu = ¤t->thread.fpu; - if (!used_math()) { - if (init_fpu(current)) { - do_group_exit(SIGKILL); - return; - } - } + fpu__activate_curr(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { @@ -672,7 +634,7 @@ void math_abort(struct math_emu_info *info, unsigned int signal) #endif /* PARANOID */ } -#define S387 ((struct i387_soft_struct *)s387) +#define S387 ((struct swregs_state *)s387) #define sstatus_word() \ ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top)) @@ -681,14 +643,14 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct i387_soft_struct *s387 = &target->thread.fpu.state->soft; + struct swregs_state *s387 = &target->thread.fpu.state.soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; RE_ENTRANT_CHECK_OFF; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0, - offsetof(struct i387_soft_struct, st_space)); + offsetof(struct swregs_state, st_space)); RE_ENTRANT_CHECK_ON; if (ret) @@ -733,7 +695,7 @@ int fpregs_soft_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - struct i387_soft_struct *s387 = &target->thread.fpu.state->soft; + struct swregs_state *s387 = &target->thread.fpu.state.soft; const void *space = 
s387->st_space; int ret; int offset = (S387->ftop & 7) * 10, other = 80 - offset; @@ -751,7 +713,7 @@ int fpregs_soft_get(struct task_struct *target, #endif /* PECULIAR_486 */ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0, - offsetof(struct i387_soft_struct, st_space)); + offsetof(struct swregs_state, st_space)); /* Copy all registers in stack order. */ if (!ret) diff --git a/kernel/arch/x86/math-emu/fpu_proto.h b/kernel/arch/x86/math-emu/fpu_proto.h index 9779df436..caff438b9 100644 --- a/kernel/arch/x86/math-emu/fpu_proto.h +++ b/kernel/arch/x86/math-emu/fpu_proto.h @@ -46,6 +46,14 @@ extern void fstsw_(void); extern void fp_nop(void); extern void fld_i_(void); extern void fxch_i(void); +extern void fcmovb(void); +extern void fcmove(void); +extern void fcmovbe(void); +extern void fcmovu(void); +extern void fcmovnb(void); +extern void fcmovne(void); +extern void fcmovnbe(void); +extern void fcmovnu(void); extern void ffree_(void); extern void ffreep(void); extern void fst_i_(void); @@ -108,6 +116,10 @@ extern void fcompp(void); extern void fucom_(void); extern void fucomp(void); extern void fucompp(void); +extern void fcomi_(void); +extern void fcomip(void); +extern void fucomi_(void); +extern void fucomip(void); /* reg_constant.c */ extern void fconst(void); /* reg_ld_str.c */ diff --git a/kernel/arch/x86/math-emu/fpu_system.h b/kernel/arch/x86/math-emu/fpu_system.h index d342fce49..5e044d506 100644 --- a/kernel/arch/x86/math-emu/fpu_system.h +++ b/kernel/arch/x86/math-emu/fpu_system.h @@ -46,7 +46,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg) #define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ == (1 << 10)) -#define I387 (current->thread.fpu.state) +#define I387 (¤t->thread.fpu.state) #define FPU_info (I387->soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) diff --git a/kernel/arch/x86/math-emu/get_address.c b/kernel/arch/x86/math-emu/get_address.c index 8300db71c..8db26591d 100644 --- a/kernel/arch/x86/math-emu/get_address.c +++ b/kernel/arch/x86/math-emu/get_address.c @@ -20,6 +20,7 @@ #include <linux/stddef.h> #include <asm/uaccess.h> +#include <asm/vm86.h> #include "fpu_system.h" #include "exception.h" diff --git a/kernel/arch/x86/math-emu/load_store.c b/kernel/arch/x86/math-emu/load_store.c index 2931ff355..95228ff04 100644 --- a/kernel/arch/x86/math-emu/load_store.c +++ b/kernel/arch/x86/math-emu/load_store.c @@ -33,11 +33,12 @@ #define pop_0() { FPU_settag0(TAG_Empty); top++; } +/* index is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */ static u_char const type_table[32] = { - _PUSH_, _PUSH_, _PUSH_, _PUSH_, - _null_, _null_, _null_, _null_, - _REG0_, _REG0_, _REG0_, _REG0_, - _REG0_, _REG0_, _REG0_, _REG0_, + _PUSH_, _PUSH_, _PUSH_, _PUSH_, /* /0: d9:fld f32, db:fild m32, dd:fld f64, df:fild m16 */ + _null_, _REG0_, _REG0_, _REG0_, /* /1: d9:undef, db,dd,df:fisttp m32/64/16 */ + _REG0_, _REG0_, _REG0_, _REG0_, /* /2: d9:fst f32, db:fist m32, dd:fst f64, df:fist m16 */ + _REG0_, _REG0_, _REG0_, _REG0_, /* /3: d9:fstp f32, db:fistp m32, dd:fstp f64, df:fistp m16 */ _NONE_, _null_, _NONE_, _PUSH_, _NONE_, _PUSH_, _null_, _PUSH_, _NONE_, _null_, _NONE_, _REG0_, @@ -45,15 +46,19 @@ static u_char const type_table[32] = { }; u_char const data_sizes_16[32] = { - 4, 4, 8, 2, 0, 0, 0, 0, - 4, 4, 8, 2, 4, 4, 8, 2, + 4, 4, 8, 2, + 0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */ + 4, 4, 8, 2, + 4, 4, 8, 2, 14, 0, 94, 10, 2, 10, 0, 8, 14, 0, 94, 10, 2, 10, 2, 8 }; static u_char const data_sizes_32[32] = 
{ - 4, 4, 8, 2, 0, 0, 0, 0, - 4, 4, 8, 2, 4, 4, 8, 2, + 4, 4, 8, 2, + 0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */ + 4, 4, 8, 2, + 4, 4, 8, 2, 28, 0, 108, 10, 2, 10, 0, 8, 28, 0, 108, 10, 2, 10, 2, 8 }; @@ -65,6 +70,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, FPU_REG *st0_ptr; u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */ u_char loaded_tag; + int sv_cw; st0_ptr = NULL; /* Initialized just to stop compiler warnings. */ @@ -111,7 +117,8 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, } switch (type) { - case 000: /* fld m32real */ + /* type is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */ + case 000: /* fld m32real (d9 /0) */ clear_C1(); loaded_tag = FPU_load_single((float __user *)data_address, &loaded_data); @@ -123,13 +130,13 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, } FPU_copy_to_reg0(&loaded_data, loaded_tag); break; - case 001: /* fild m32int */ + case 001: /* fild m32int (db /0) */ clear_C1(); loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data); FPU_copy_to_reg0(&loaded_data, loaded_tag); break; - case 002: /* fld m64real */ + case 002: /* fld m64real (dd /0) */ clear_C1(); loaded_tag = FPU_load_double((double __user *)data_address, @@ -142,12 +149,44 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, } FPU_copy_to_reg0(&loaded_data, loaded_tag); break; - case 003: /* fild m16int */ + case 003: /* fild m16int (df /0) */ clear_C1(); loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data); FPU_copy_to_reg0(&loaded_data, loaded_tag); break; + /* case 004: undefined (d9 /1) */ + /* fisttp are enabled if CPUID(1).ECX(0) "sse3" is set */ + case 005: /* fisttp m32int (db /1) */ + clear_C1(); + sv_cw = control_word; + control_word |= RC_CHOP; + if (FPU_store_int32 + (st0_ptr, st0_tag, (long __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + control_word = sv_cw; + break; + case 006: /* fisttp m64int (dd /1) */ + clear_C1(); + sv_cw = control_word; + control_word |= RC_CHOP; + if (FPU_store_int64 + (st0_ptr, st0_tag, (long long __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + control_word = sv_cw; + break; + case 007: /* fisttp m16int (df /1) */ + clear_C1(); + sv_cw = control_word; + control_word |= RC_CHOP; + if (FPU_store_int16 + (st0_ptr, st0_tag, (short __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + control_word = sv_cw; + break; case 010: /* fst m32real */ clear_C1(); FPU_store_single(st0_ptr, st0_tag, diff --git a/kernel/arch/x86/math-emu/reg_compare.c b/kernel/arch/x86/math-emu/reg_compare.c index ecce55fc2..b77360fdb 100644 --- a/kernel/arch/x86/math-emu/reg_compare.c +++ b/kernel/arch/x86/math-emu/reg_compare.c @@ -249,6 +249,54 @@ static int compare_st_st(int nr) return 0; } +static int compare_i_st_st(int nr) +{ + int f, c; + FPU_REG *st_ptr; + + if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); + } + + partial_status &= ~SW_C0; + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } + + switch (c & 7) { + case COMP_A_lt_B: + f = X86_EFLAGS_CF; + 
break; + case COMP_A_eq_B: + f = X86_EFLAGS_ZF; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF; + break; +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL | 0x122); + f = 0; + break; +#endif /* PARANOID */ + } + FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f; + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; +} + static int compare_u_st_st(int nr) { int f = 0, c; @@ -299,6 +347,58 @@ static int compare_u_st_st(int nr) return 0; } +static int compare_ui_st_st(int nr) +{ + int f = 0, c; + FPU_REG *st_ptr; + + if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); + } + + partial_status &= ~SW_C0; + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) { + FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF); + if (c & COMP_SNaN) { /* This is the only difference between + un-ordered and ordinary comparisons */ + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } + return 0; + } + + switch (c & 7) { + case COMP_A_lt_B: + f = X86_EFLAGS_CF; + break; + case COMP_A_eq_B: + f = X86_EFLAGS_ZF; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF; + break; +#ifdef PARANOID + default: + EXCEPTION(EX_INTERNAL | 0x123); + f = 0; + break; +#endif /* PARANOID */ + } + FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f; + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; +} + /*---------------------------------------------------------------------------*/ void fcom_st(void) @@ -348,3 +448,31 @@ void fucompp(void) } else FPU_illegal(); } + +/* P6+ compare-to-EFLAGS ops */ + +void fcomi_(void) +{ + /* fcomi st(i) */ + compare_i_st_st(FPU_rm); +} + +void fcomip(void) +{ + /* fcomip st(i) */ + if (!compare_i_st_st(FPU_rm)) + FPU_pop(); +} + +void fucomi_(void) +{ + /* fucomi st(i) */ + compare_ui_st_st(FPU_rm); +} + +void fucomip(void) +{ + /* fucomip st(i) */ + if (!compare_ui_st_st(FPU_rm)) + FPU_pop(); +} diff --git a/kernel/arch/x86/mm/Makefile b/kernel/arch/x86/mm/Makefile index a482d1051..65c47fda2 100644 --- a/kernel/arch/x86/mm/Makefile +++ b/kernel/arch/x86/mm/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/kernel/arch/x86/mm/dump_pagetables.c b/kernel/arch/x86/mm/dump_pagetables.c index f0cedf339..0f1c6fc3d 100644 --- a/kernel/arch/x86/mm/dump_pagetables.c +++ b/kernel/arch/x86/mm/dump_pagetables.c @@ -32,6 +32,8 @@ struct pg_state { const struct addr_marker *marker; unsigned long lines; bool to_dmesg; + bool check_wx; + unsigned long wx_pages; }; struct addr_marker { @@ -87,7 +89,7 @@ static struct addr_marker address_markers[] = { { 0/* VMALLOC_START */, "vmalloc() Area" }, { 0/*VMALLOC_END*/, "vmalloc() End" }, # ifdef CONFIG_HIGHMEM - { 0/*PKMAP_BASE*/, "Persisent kmap() Area" }, + { 0/*PKMAP_BASE*/, "Persistent kmap() Area" }, # endif { 0/*FIXADDR_START*/, "Fixmap Area" }, #endif @@ -155,7 +157,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, " "); if 
((level == 4 && pr & _PAGE_PAT) || ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) - pt_dump_cont_printf(m, dmsg, "pat "); + pt_dump_cont_printf(m, dmsg, "PAT "); else pt_dump_cont_printf(m, dmsg, " "); if (pr & _PAGE_GLOBAL) @@ -198,8 +200,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, * we have now. "break" is either changing perms, levels or * address space marker. */ - prot = pgprot_val(new_prot) & PTE_FLAGS_MASK; - cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK; + prot = pgprot_val(new_prot); + cur = pgprot_val(st->current_prot); if (!st->level) { /* First entry */ @@ -214,6 +216,16 @@ static void note_page(struct seq_file *m, struct pg_state *st, const char *unit = units; unsigned long delta; int width = sizeof(unsigned long) * 2; + pgprotval_t pr = pgprot_val(st->current_prot); + + if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) { + WARN_ONCE(1, + "x86/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, + (void *)st->start_address); + st->wx_pages += (st->current_address - + st->start_address) / PAGE_SIZE; + } /* * Now print the actual finished series @@ -269,13 +281,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, { int i; pte_t *start; + pgprotval_t prot; start = (pte_t *) pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { - pgprot_t prot = pte_pgprot(*start); - + prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, prot, 4); + note_page(m, st, __pgprot(prot), 4); start++; } } @@ -287,18 +299,19 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, { int i; pmd_t *start; + pgprotval_t prot; start = (pmd_t *) pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { - pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK; - - if (pmd_large(*start) || !pmd_present(*start)) + if (pmd_large(*start) || !pmd_present(*start)) { + prot = pmd_flags(*start); note_page(m, st, __pgprot(prot), 3); - else + } else { walk_pte_level(m, st, *start, P + i * PMD_LEVEL_MULT); + } } else note_page(m, st, __pgprot(0), 3); start++; @@ -318,19 +331,20 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, { int i; pud_t *start; + pgprotval_t prot; start = (pud_t *) pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { - pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK; - - if (pud_large(*start) || !pud_present(*start)) + if (pud_large(*start) || !pud_present(*start)) { + prot = pud_flags(*start); note_page(m, st, __pgprot(prot), 2); - else + } else { walk_pmd_level(m, st, *start, P + i * PUD_LEVEL_MULT); + } } else note_page(m, st, __pgprot(0), 2); @@ -344,13 +358,30 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) +#ifdef CONFIG_X86_64 +static inline bool is_hypervisor_range(int idx) +{ + /* + * ffff800000000000 - ffff87ffffffffff is reserved for + * the hypervisor. 
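The check note_page() gains above flags any mapping that is writable (_PAGE_RW) yet lacks _PAGE_NX, and accumulates the offending size in pages for the summary printed at the end of the walk. The same predicate and page accounting as a standalone sketch (flag values mirror the x86 bit positions, PAGE_SIZE assumed 4 KiB):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define F_RW      (1ULL << 1)    /* bit positions follow x86 _PAGE_RW ... */
#define F_NX      (1ULL << 63)   /* ... and _PAGE_NX */

static unsigned long check_wx(unsigned long long prot,
                              unsigned long start, unsigned long end)
{
        if ((prot & F_RW) && !(prot & F_NX)) {
                fprintf(stderr, "insecure W+X mapping at %#lx\n", start);
                return (end - start) / PAGE_SIZE;
        }
        return 0;
}

int main(void)
{
        unsigned long wx_pages = 0;

        wx_pages += check_wx(F_RW,        0x100000, 0x104000); /* RW and executable: flagged */
        wx_pages += check_wx(F_RW | F_NX, 0x200000, 0x400000); /* RW but NX: fine */
        printf("W+X pages found: %lu\n", wx_pages);
        return 0;
}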
+ */ + return paravirt_enabled() && + (idx >= pgd_index(__PAGE_OFFSET) - 16) && + (idx < pgd_index(__PAGE_OFFSET)); +} +#else +static inline bool is_hypervisor_range(int idx) { return false; } +#endif + +static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, + bool checkwx) { #ifdef CONFIG_X86_64 pgd_t *start = (pgd_t *) &init_level4_pgt; #else pgd_t *start = swapper_pg_dir; #endif + pgprotval_t prot; int i; struct pg_state st = {}; @@ -359,16 +390,20 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) st.to_dmesg = true; } + st.check_wx = checkwx; + if (checkwx) + st.wx_pages = 0; + for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); - if (!pgd_none(*start)) { - pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK; - - if (pgd_large(*start) || !pgd_present(*start)) + if (!pgd_none(*start) && !is_hypervisor_range(i)) { + if (pgd_large(*start) || !pgd_present(*start)) { + prot = pgd_flags(*start); note_page(m, &st, __pgprot(prot), 1); - else + } else { walk_pud_level(m, &st, *start, i * PGD_LEVEL_MULT); + } } else note_page(m, &st, __pgprot(0), 1); @@ -378,8 +413,26 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) /* Flush out the last page */ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); note_page(m, &st, __pgprot(0), 0); + if (!checkwx) + return; + if (st.wx_pages) + pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", + st.wx_pages); + else + pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); +} + +void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) +{ + ptdump_walk_pgd_level_core(m, pgd, false); +} + +void ptdump_walk_pgd_level_checkwx(void) +{ + ptdump_walk_pgd_level_core(NULL, NULL, true); } +#ifdef CONFIG_X86_PTDUMP static int ptdump_show(struct seq_file *m, void *v) { ptdump_walk_pgd_level(m, NULL); @@ -397,10 +450,13 @@ static const struct file_operations ptdump_fops = { .llseek = seq_lseek, .release = single_release, }; +#endif static int pt_dump_init(void) { +#ifdef CONFIG_X86_PTDUMP struct dentry *pe; +#endif #ifdef CONFIG_X86_32 /* Not a compile-time constant on x86-32 */ @@ -412,10 +468,12 @@ static int pt_dump_init(void) address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; #endif +#ifdef CONFIG_X86_PTDUMP pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, &ptdump_fops); if (!pe) return -ENOMEM; +#endif return 0; } diff --git a/kernel/arch/x86/mm/fault.c b/kernel/arch/x86/mm/fault.c index 9dc909841..e830c71a1 100644 --- a/kernel/arch/x86/mm/fault.c +++ b/kernel/arch/x86/mm/fault.c @@ -20,6 +20,7 @@ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... 
*/ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ +#include <asm/vm86.h> /* struct vm86 */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -286,6 +287,9 @@ static noinline int vmalloc_fault(unsigned long address) if (!pmd_k) return -1; + if (pmd_huge(*pmd_k)) + return 0; + pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; @@ -301,14 +305,16 @@ static inline void check_v8086_mode(struct pt_regs *regs, unsigned long address, struct task_struct *tsk) { +#ifdef CONFIG_VM86 unsigned long bit; - if (!v8086_mode(regs)) + if (!v8086_mode(regs) || !tsk->thread.vm86) return; bit = (address - 0xA0000) >> PAGE_SHIFT; if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; + tsk->thread.vm86->screen_bitmap |= 1 << bit; +#endif } static bool low_pfn(unsigned long pfn) @@ -357,8 +363,6 @@ void vmalloc_sync_all(void) * 64-bit: * * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. */ static noinline int vmalloc_fault(unsigned long address) { @@ -400,17 +404,23 @@ static noinline int vmalloc_fault(unsigned long address) if (pud_none(*pud_ref)) return -1; - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref)) BUG(); + if (pud_huge(*pud)) + return 0; + pmd = pmd_offset(pud, address); pmd_ref = pmd_offset(pud_ref, address); if (pmd_none(*pmd_ref)) return -1; - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref)) BUG(); + if (pmd_huge(*pmd)) + return 0; + pte_ref = pte_offset_kernel(pmd_ref, address); if (!pte_present(*pte_ref)) return -1; diff --git a/kernel/arch/x86/mm/gup.c b/kernel/arch/x86/mm/gup.c index 81bf3d2af..ae9a37bf1 100644 --- a/kernel/arch/x86/mm/gup.c +++ b/kernel/arch/x86/mm/gup.c @@ -118,21 +118,20 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; - pte_t pte = *(pte_t *)&pmd; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_flags(pte) & mask) != mask) + if ((pmd_flags(pmd) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); refs = 0; - head = pte_page(pte); + head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); @@ -195,21 +194,20 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; - pte_t pte = *(pte_t *)&pud; struct page *head, *page; int refs; mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_flags(pte) & mask) != mask) + if ((pud_flags(pud) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pud_pfn(pud))); refs = 0; - head = pte_page(pte); + head = pud_page(pud); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON_PAGE(compound_head(page) != head, page); diff --git a/kernel/arch/x86/mm/highmem_32.c b/kernel/arch/x86/mm/highmem_32.c index 0d1cbcf47..bd24ba1c4 100644 --- a/kernel/arch/x86/mm/highmem_32.c +++ 
b/kernel/arch/x86/mm/highmem_32.c @@ -111,20 +111,6 @@ void __kunmap_atomic(void *kvaddr) } EXPORT_SYMBOL(__kunmap_atomic); -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long)ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} -EXPORT_SYMBOL(kmap_atomic_to_page); - void __init set_highmem_pages_init(void) { struct zone *zone; diff --git a/kernel/arch/x86/mm/init.c b/kernel/arch/x86/mm/init.c index 1d553186c..493f54172 100644 --- a/kernel/arch/x86/mm/init.c +++ b/kernel/arch/x86/mm/init.c @@ -30,8 +30,11 @@ /* * Tables translating between page_cache_type_t and pte encoding. * - * Minimal supported modes are defined statically, they are modified - * during bootup if more supported cache modes are available. + * The default values are defined statically as minimal supported mode; + * WC and WT fall back to UC-. pat_init() updates these values to support + * more cache modes, WC and WT, when it is safe to do so. See pat_init() + * for the details. Note, __early_ioremap() used during early boot-time + * takes pgprot_t (pte encoding) and does not use these tables. * * Index into __cachemode2pte_tbl[] is the cachemode. * @@ -40,7 +43,7 @@ */ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WB ] = 0 | 0 , - [_PAGE_CACHE_MODE_WC ] = _PAGE_PWT | 0 , + [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, @@ -50,11 +53,11 @@ EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, - [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; @@ -351,7 +354,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range, } for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", + pr_debug(" [mem %#010lx-%#010lx] page %s\n", mr[i].start, mr[i].end - 1, page_size_string(&mr[i])); @@ -398,7 +401,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long ret = 0; int nr_range, i; - pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n", + pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n", start, end - 1); memset(mr, 0, sizeof(mr)); @@ -690,14 +693,12 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { -#ifdef CONFIG_MICROCODE_EARLY /* * Remember, initrd memory may contain microcode or other useful things. * Before we lose initrd mem, we need to find a place to hold them * now that normal virtual memory is enabled. 
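
The __cachemode2pte_tbl hunk earlier in this change makes WC (and WT) requests fall back to UC- by default until pat_init() reprograms the PAT and upgrades the tables. A small user-space sketch of those boot-time defaults is below; the PWT/PCD bit positions and the mode names follow the conventional x86 layout and are assumptions of the sketch.

/* Sketch of the pre-pat_init() defaults from the hunk above: WC and WT
 * both map to UC- (PCD set, PWT clear) until the PAT is programmed. */
#include <stdint.h>
#include <stdio.h>

#define _PAGE_PWT (1u << 3)
#define _PAGE_PCD (1u << 4)

enum cache_mode { CM_WB, CM_WC, CM_UC_MINUS, CM_UC, CM_WT, CM_NR };

static const uint16_t cachemode2pte[CM_NR] = {
        [CM_WB]       = 0,
        [CM_WC]       = _PAGE_PCD,             /* falls back to UC- */
        [CM_UC_MINUS] = _PAGE_PCD,
        [CM_UC]       = _PAGE_PWT | _PAGE_PCD,
        [CM_WT]       = _PAGE_PCD,             /* falls back to UC- */
};

int main(void)
{
        for (int m = 0; m < CM_NR; m++)
                printf("mode %d -> pte cache bits 0x%x\n",
                       m, (unsigned)cachemode2pte[m]);
        return 0;
}
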
*/ save_microcode_in_initrd(); -#endif /* * end could be not aligned, and We can not align that, diff --git a/kernel/arch/x86/mm/init_32.c b/kernel/arch/x86/mm/init_32.c index c23ab1ee3..cb4ef3de6 100644 --- a/kernel/arch/x86/mm/init_32.c +++ b/kernel/arch/x86/mm/init_32.c @@ -434,7 +434,7 @@ void __init add_highpages_with_active_regions(int nid, phys_addr_t start, end; u64 i; - for_each_free_mem_range(i, nid, &start, &end, NULL) { + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) { unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), start_pfn, end_pfn); unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), @@ -823,11 +823,11 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdata = NODE_DATA(nid); struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM); + zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -957,6 +957,8 @@ void mark_rodata_ro(void) set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); #endif mark_nxdata_nx(); + if (__supported_pte_mask & _PAGE_NX) + debug_checkwx(); } #endif diff --git a/kernel/arch/x86/mm/init_64.c b/kernel/arch/x86/mm/init_64.c index 3fba623e3..ec081fe0c 100644 --- a/kernel/arch/x86/mm/init_64.c +++ b/kernel/arch/x86/mm/init_64.c @@ -687,11 +687,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL); + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; @@ -1132,7 +1132,7 @@ void mark_rodata_ro(void) * has been zapped already via cleanup_highmem(). 
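
mark_rodata_ro() now ends with a debug_checkwx() call (guarded by NX support on 32-bit), which funnels into ptdump_walk_pgd_level_checkwx() above. The header side of that wiring is not part of this hunk; in the upstream series it is expected to be a thin wrapper roughly like the sketch below, with the CONFIG_DEBUG_WX guard named here as an assumption.

/* Sketch of the companion header definition (not shown in this hunk);
 * the Kconfig symbol name is an assumption of this sketch. */
#ifdef CONFIG_DEBUG_WX
#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
#else
#define debug_checkwx() do { } while (0)
#endif
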
*/ all_end = roundup((unsigned long)_brk_end, PMD_SIZE); - set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); + set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); rodata_test(); @@ -1150,6 +1150,8 @@ void mark_rodata_ro(void) free_init_pages("unused kernel", (unsigned long) __va(__pa_symbol(rodata_end)), (unsigned long) __va(__pa_symbol(_sdata))); + + debug_checkwx(); } #endif @@ -1268,7 +1270,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, /* check to see if we have contiguous blocks */ if (p_end != p || node_start != node) { if (p_start) - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", addr_start, addr_end-1, p_start, p_end-1, node_start); addr_start = addr; node_start = node; @@ -1366,7 +1368,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, void __meminit vmemmap_populate_print_last(void) { if (p_start) { - printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", addr_start, addr_end-1, p_start, p_end-1, node_start); p_start = NULL; p_end = NULL; diff --git a/kernel/arch/x86/mm/iomap_32.c b/kernel/arch/x86/mm/iomap_32.c index b2ffa5c7d..dd25dd167 100644 --- a/kernel/arch/x86/mm/iomap_32.c +++ b/kernel/arch/x86/mm/iomap_32.c @@ -84,13 +84,13 @@ void __iomem * iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { /* - * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. - * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the - * MTRR is UC or WC. UC_MINUS gets the real intention, of the - * user, which is "WC if the MTRR is WC, UC if you can't do that." + * For non-PAT systems, translate non-WB request to UC- just in + * case the caller set the PWT bit to prot directly without using + * pgprot_writecombine(). UC- translates to uncached if the MTRR + * is UC or WC. UC- gets the real intention, of the user, which is + * "WC if the MTRR is WC, UC if you can't do that." */ - if (!pat_enabled && pgprot_val(prot) == - (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC))) + if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB) prot = __pgprot(__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); diff --git a/kernel/arch/x86/mm/ioremap.c b/kernel/arch/x86/mm/ioremap.c index 70e7444c6..b9c78f3bc 100644 --- a/kernel/arch/x86/mm/ioremap.c +++ b/kernel/arch/x86/mm/ioremap.c @@ -42,6 +42,9 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, case _PAGE_CACHE_MODE_WC: err = _set_memory_wc(vaddr, nrpages); break; + case _PAGE_CACHE_MODE_WT: + err = _set_memory_wt(vaddr, nrpages); + break; case _PAGE_CACHE_MODE_WB: err = _set_memory_wb(vaddr, nrpages); break; @@ -60,8 +63,6 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, !PageReserved(pfn_to_page(start_pfn + i))) return 1; - WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn); - return 0; } @@ -91,7 +92,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, pgprot_t prot; int retval; void __iomem *ret_addr; - int ram_region; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; @@ -114,23 +114,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. 
*/ - /* First check if whole region can be identified as RAM or not */ - ram_region = region_is_ram(phys_addr, size); - if (ram_region > 0) { - WARN_ONCE(1, "ioremap on RAM at 0x%lx - 0x%lx\n", - (unsigned long int)phys_addr, - (unsigned long int)last_addr); + pfn = phys_addr >> PAGE_SHIFT; + last_pfn = last_addr >> PAGE_SHIFT; + if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, + __ioremap_check_ram) == 1) { + WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", + &phys_addr, &last_addr); return NULL; } - /* If could not be identified(-1), check page by page */ - if (ram_region < 0) { - pfn = phys_addr >> PAGE_SHIFT; - last_pfn = last_addr >> PAGE_SHIFT; - if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL, - __ioremap_check_ram) == 1) - return NULL; - } /* * Mappings have to be page-aligned */ @@ -172,6 +164,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, prot = __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); break; + case _PAGE_CACHE_MODE_WT: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); + break; case _PAGE_CACHE_MODE_WB: break; } @@ -234,10 +230,11 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { /* * Ideally, this should be: - * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; + * pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; * * Till we fix all X drivers to use ioremap_wc(), we will use - * UC MINUS. + * UC MINUS. Drivers that are certain they need or can already + * be converted over to strong UC can use ioremap_uc(). */ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; @@ -247,6 +244,39 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) EXPORT_SYMBOL(ioremap_nocache); /** + * ioremap_uc - map bus memory into CPU space as strongly uncachable + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * ioremap_uc performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked with a strong + * preference as completely uncachable on the CPU when possible. For non-PAT + * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT + * systems this will set the PAT entry for the pages as strong UC. This call + * will honor existing caching rules from things like the PCI bus. Note that + * there are other caches and buffers on many busses. In particular driver + * authors should read up on PCI writes. + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. 
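
ioremap_uc(), documented above, gives drivers an explicit request for strong UC instead of the UC- default of ioremap_nocache(). A hedged sketch of a throwaway module using it follows; the base address, size and register offset are invented for illustration, while ioremap_uc()/iounmap() and writel() are the kernel APIs shown in this hunk.

/* Hypothetical driver fragment: DEMO_* constants are made up. */
#include <linux/module.h>
#include <linux/io.h>

#define DEMO_REG_BASE   0xfed00000UL    /* assumed MMIO base */
#define DEMO_REG_SIZE   0x1000          /* assumed BAR size  */
#define DEMO_CTRL_OFF   0x10            /* assumed register  */

static void __iomem *demo_regs;

static int __init demo_init(void)
{
        /* Strong UC: no write combining even if an MTRR covers this range as WC. */
        demo_regs = ioremap_uc(DEMO_REG_BASE, DEMO_REG_SIZE);
        if (!demo_regs)
                return -ENOMEM;

        writel(0x1, demo_regs + DEMO_CTRL_OFF);
        return 0;
}

static void __exit demo_exit(void)
{
        iounmap(demo_regs);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
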
+ */ +void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) +{ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; + + return __ioremap_caller(phys_addr, size, pcm, + __builtin_return_address(0)); +} +EXPORT_SYMBOL_GPL(ioremap_uc); + +/** * ioremap_wc - map memory into CPU space write combined * @phys_addr: bus address of the memory * @size: size of the resource to map @@ -258,14 +288,28 @@ EXPORT_SYMBOL(ioremap_nocache); */ void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { - if (pat_enabled) - return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, __builtin_return_address(0)); - else - return ioremap_nocache(phys_addr, size); } EXPORT_SYMBOL(ioremap_wc); +/** + * ioremap_wt - map memory into CPU space write through + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write through. + * Write through stores data into memory while keeping the cache up-to-date. + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, @@ -331,7 +375,7 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int arch_ioremap_pud_supported(void) +int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 return cpu_has_gbpages; @@ -340,7 +384,7 @@ int arch_ioremap_pud_supported(void) #endif } -int arch_ioremap_pmd_supported(void) +int __init arch_ioremap_pmd_supported(void) { return cpu_has_pse; } @@ -353,18 +397,18 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) { unsigned long start = phys & PAGE_MASK; unsigned long offset = phys & ~PAGE_MASK; - unsigned long vaddr; + void *vaddr; /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ if (page_is_ram(start >> PAGE_SHIFT)) return __va(phys); - vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE); + vaddr = ioremap_cache(start, PAGE_SIZE); /* Only add the offset on success and return NULL if the ioremap() failed: */ if (vaddr) vaddr += offset; - return (void *)vaddr; + return vaddr; } void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) @@ -373,7 +417,6 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) return; iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); - return; } static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; diff --git a/kernel/arch/x86/mm/kasan_init_64.c b/kernel/arch/x86/mm/kasan_init_64.c index 9a54dbe98..d470cf219 100644 --- a/kernel/arch/x86/mm/kasan_init_64.c +++ b/kernel/arch/x86/mm/kasan_init_64.c @@ -1,3 +1,4 @@ +#define pr_fmt(fmt) "kasan: " fmt #include <linux/bootmem.h> #include <linux/kasan.h> #include <linux/kdebug.h> @@ -11,20 +12,6 @@ extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_X_MAX]; -static pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; -static pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; -static pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; - -/* - * This page used as early shadow. We don't use empty_zero_page - * at early stages, stack instrumentation could write some garbage - * to this page. 
- * Latter we reuse it as zero shadow for large ranges of memory - * that allowed to access, but not instrumented by kasan - * (vmalloc/vmemmap ...). - */ -static unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; - static int __init map_range(struct range *range) { unsigned long start; @@ -61,106 +48,6 @@ static void __init kasan_map_early_shadow(pgd_t *pgd) } } -static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr, - unsigned long end) -{ - pte_t *pte = pte_offset_kernel(pmd, addr); - - while (addr + PAGE_SIZE <= end) { - WARN_ON(!pte_none(*pte)); - set_pte(pte, __pte(__pa_nodebug(kasan_zero_page) - | __PAGE_KERNEL_RO)); - addr += PAGE_SIZE; - pte = pte_offset_kernel(pmd, addr); - } - return 0; -} - -static int __init zero_pmd_populate(pud_t *pud, unsigned long addr, - unsigned long end) -{ - int ret = 0; - pmd_t *pmd = pmd_offset(pud, addr); - - while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) { - WARN_ON(!pmd_none(*pmd)); - set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte) - | _KERNPG_TABLE)); - addr += PMD_SIZE; - pmd = pmd_offset(pud, addr); - } - if (addr < end) { - if (pmd_none(*pmd)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); - if (!p) - return -ENOMEM; - set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE)); - } - ret = zero_pte_populate(pmd, addr, end); - } - return ret; -} - - -static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr, - unsigned long end) -{ - int ret = 0; - pud_t *pud = pud_offset(pgd, addr); - - while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) { - WARN_ON(!pud_none(*pud)); - set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd) - | _KERNPG_TABLE)); - addr += PUD_SIZE; - pud = pud_offset(pgd, addr); - } - - if (addr < end) { - if (pud_none(*pud)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); - if (!p) - return -ENOMEM; - set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE)); - } - ret = zero_pmd_populate(pud, addr, end); - } - return ret; -} - -static int __init zero_pgd_populate(unsigned long addr, unsigned long end) -{ - int ret = 0; - pgd_t *pgd = pgd_offset_k(addr); - - while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) { - WARN_ON(!pgd_none(*pgd)); - set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud) - | _KERNPG_TABLE)); - addr += PGDIR_SIZE; - pgd = pgd_offset_k(addr); - } - - if (addr < end) { - if (pgd_none(*pgd)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); - if (!p) - return -ENOMEM; - set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE)); - } - ret = zero_pud_populate(pgd, addr, end); - } - return ret; -} - - -static void __init populate_zero_shadow(const void *start, const void *end) -{ - if (zero_pgd_populate((unsigned long)start, (unsigned long)end)) - panic("kasan: unable to map zero shadow!"); -} - - #ifdef CONFIG_KASAN_INLINE static int kasan_die_handler(struct notifier_block *self, unsigned long val, @@ -212,7 +99,7 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); - populate_zero_shadow((void *)KASAN_SHADOW_START, + kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, kasan_mem_to_shadow((void *)PAGE_OFFSET)); for (i = 0; i < E820_X_MAX; i++) { @@ -222,14 +109,15 @@ void __init kasan_init(void) if (map_range(&pfn_mapped[i])) panic("kasan: unable to allocate shadow!"); } - populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), - kasan_mem_to_shadow((void *)__START_KERNEL_map)); + kasan_populate_zero_shadow( + kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + kasan_mem_to_shadow((void 
*)__START_KERNEL_map)); vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), (unsigned long)kasan_mem_to_shadow(_end), NUMA_NO_NODE); - populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); memset(kasan_zero_page, 0, PAGE_SIZE); @@ -237,4 +125,6 @@ void __init kasan_init(void) load_cr3(init_level4_pgt); __flush_tlb_all(); init_task.kasan_depth = 0; + + pr_info("KernelAddressSanitizer initialized\n"); } diff --git a/kernel/arch/x86/mm/mpx.c b/kernel/arch/x86/mm/mpx.c index 4d1c11c07..ef05755a1 100644 --- a/kernel/arch/x86/mm/mpx.c +++ b/kernel/arch/x86/mm/mpx.c @@ -10,13 +10,31 @@ #include <linux/syscalls.h> #include <linux/sched/sysctl.h> -#include <asm/i387.h> #include <asm/insn.h> #include <asm/mman.h> #include <asm/mmu_context.h> #include <asm/mpx.h> #include <asm/processor.h> -#include <asm/fpu-internal.h> +#include <asm/fpu/internal.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/mpx.h> + +static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm) +{ + if (is_64bit_mm(mm)) + return MPX_BD_SIZE_BYTES_64; + else + return MPX_BD_SIZE_BYTES_32; +} + +static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm) +{ + if (is_64bit_mm(mm)) + return MPX_BT_SIZE_BYTES_64; + else + return MPX_BT_SIZE_BYTES_32; +} /* * This is really a simplified "vm_mmap". it only handles MPX @@ -24,58 +42,21 @@ */ static unsigned long mpx_mmap(unsigned long len) { - unsigned long ret; - unsigned long addr, pgoff; struct mm_struct *mm = current->mm; - vm_flags_t vm_flags; - struct vm_area_struct *vma; + unsigned long addr, populate; - /* Only bounds table and bounds directory can be allocated here */ - if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES) + /* Only bounds table can be allocated here */ + if (len != mpx_bt_size_bytes(mm)) return -EINVAL; down_write(&mm->mmap_sem); - - /* Too many mappings? */ - if (mm->map_count > sysctl_max_map_count) { - ret = -ENOMEM; - goto out; - } - - /* Obtain the address to map to. we verify (or select) it and ensure - * that it represents a valid section of the address space. 
- */ - addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE); - if (addr & ~PAGE_MASK) { - ret = addr; - goto out; - } - - vm_flags = VM_READ | VM_WRITE | VM_MPX | - mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - - /* Set pgoff according to addr for anon_vma */ - pgoff = addr >> PAGE_SHIFT; - - ret = mmap_region(NULL, addr, len, vm_flags, pgoff); - if (IS_ERR_VALUE(ret)) - goto out; - - vma = find_vma(mm, ret); - if (!vma) { - ret = -ENOMEM; - goto out; - } - - if (vm_flags & VM_LOCKED) { - up_write(&mm->mmap_sem); - mm_populate(ret, len); - return ret; - } - -out: + addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate); up_write(&mm->mmap_sem); - return ret; + if (populate) + mm_populate(addr, populate); + + return addr; } enum reg_type { @@ -120,19 +101,19 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, switch (type) { case REG_TYPE_RM: regno = X86_MODRM_RM(insn->modrm.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_INDEX: regno = X86_SIB_INDEX(insn->sib.value); - if (X86_REX_X(insn->rex_prefix.value) == 1) + if (X86_REX_X(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_BASE: regno = X86_SIB_BASE(insn->sib.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; @@ -142,7 +123,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, break; } - if (regno > nr_registers) { + if (regno >= nr_registers) { WARN_ONCE(1, "decoded an instruction with an invalid register"); return -EINVAL; } @@ -254,10 +235,10 @@ bad_opcode: * * The caller is expected to kfree() the returned siginfo_t. */ -siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xsave_struct *xsave_buf) +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { - struct bndreg *bndregs, *bndreg; + const struct mpx_bndreg_state *bndregs; + const struct mpx_bndreg *bndreg; siginfo_t *info = NULL; struct insn insn; uint8_t bndregno; @@ -277,14 +258,14 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, err = -EINVAL; goto err_out; } - /* get the bndregs _area_ of the xsave structure */ - bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS); + /* get bndregs field from current task's xsave area */ + bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS); if (!bndregs) { err = -EINVAL; goto err_out; } /* now go select the individual register in the set of 4 */ - bndreg = &bndregs[bndregno]; + bndreg = &bndregs->bndreg[bndregno]; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { @@ -316,6 +297,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, err = -EINVAL; goto err_out; } + trace_mpx_bounds_register_exception(info->si_addr, bndreg); return info; err_out: /* info might be NULL, but kfree() handles that */ @@ -323,25 +305,18 @@ err_out: return ERR_PTR(err); } -static __user void *task_get_bounds_dir(struct task_struct *tsk) +static __user void *mpx_get_bounds_dir(void) { - struct bndcsr *bndcsr; + const struct mpx_bndcsr *bndcsr; if (!cpu_feature_enabled(X86_FEATURE_MPX)) return MPX_INVALID_BOUNDS_DIR; /* - * 32-bit binaries on 64-bit kernels are currently - * unsupported. - */ - if (IS_ENABLED(CONFIG_X86_64) && test_thread_flag(TIF_IA32)) - return MPX_INVALID_BOUNDS_DIR; - /* * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. 
*/ - fpu_save_init(&tsk->thread.fpu); - bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); if (!bndcsr) return MPX_INVALID_BOUNDS_DIR; @@ -360,10 +335,10 @@ static __user void *task_get_bounds_dir(struct task_struct *tsk) (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK); } -int mpx_enable_management(struct task_struct *tsk) +int mpx_enable_management(void) { void __user *bd_base = MPX_INVALID_BOUNDS_DIR; - struct mm_struct *mm = tsk->mm; + struct mm_struct *mm = current->mm; int ret = 0; /* @@ -372,11 +347,12 @@ int mpx_enable_management(struct task_struct *tsk) * directory into XSAVE/XRSTOR Save Area and enable MPX through * XRSTOR instruction. * - * fpu_xsave() is expected to be very expensive. Storing the bounds - * directory here means that we do not have to do xsave in the unmap - * path; we can just use mm->bd_addr instead. + * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is + * expected to be relatively expensive. Storing the bounds + * directory here means that we do not have to do xsave in the + * unmap path; we can just use mm->bd_addr instead. */ - bd_base = task_get_bounds_dir(tsk); + bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); mm->bd_addr = bd_base; if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) @@ -386,7 +362,7 @@ int mpx_enable_management(struct task_struct *tsk) return ret; } -int mpx_disable_management(struct task_struct *tsk) +int mpx_disable_management(void) { struct mm_struct *mm = current->mm; @@ -399,29 +375,59 @@ int mpx_disable_management(struct task_struct *tsk) return 0; } +static int mpx_cmpxchg_bd_entry(struct mm_struct *mm, + unsigned long *curval, + unsigned long __user *addr, + unsigned long old_val, unsigned long new_val) +{ + int ret; + /* + * user_atomic_cmpxchg_inatomic() actually uses sizeof() + * the pointer that we pass to it to figure out how much + * data to cmpxchg. We have to be careful here not to + * pass a pointer to a 64-bit data type when we only want + * a 32-bit copy. + */ + if (is_64bit_mm(mm)) { + ret = user_atomic_cmpxchg_inatomic(curval, + addr, old_val, new_val); + } else { + u32 uninitialized_var(curval_32); + u32 old_val_32 = old_val; + u32 new_val_32 = new_val; + u32 __user *addr_32 = (u32 __user *)addr; + + ret = user_atomic_cmpxchg_inatomic(&curval_32, + addr_32, old_val_32, new_val_32); + *curval = curval_32; + } + return ret; +} + /* - * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each - * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB, + * With 32-bit mode, a bounds directory is 4MB, and the size of each + * bounds table is 16KB. With 64-bit mode, a bounds directory is 2GB, * and the size of each bounds table is 4MB. 
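
The rewritten comment above gives the mode-dependent sizes: a 4 MB directory with 16 KB tables for 32-bit mm's, a 2 GB directory with 4 MB tables for 64-bit mm's. Those numbers follow from the entry counts and entry widths; the quick arithmetic check below uses the conventional MPX layout (2^20 x 4-byte vs 2^28 x 8-byte directory entries, 2^10 x 16-byte vs 2^17 x 32-byte table entries), which should be read as an assumption of the sketch.

/* Arithmetic check of the sizes quoted in the comment above; the entry
 * counts and widths are assumptions of this sketch. */
#include <stdio.h>

int main(void)
{
        unsigned long long bd64 = (1ULL << 28) * 8;   /* 2 GB  */
        unsigned long long bt64 = (1ULL << 17) * 32;  /* 4 MB  */
        unsigned long long bd32 = (1ULL << 20) * 4;   /* 4 MB  */
        unsigned long long bt32 = (1ULL << 10) * 16;  /* 16 KB */

        printf("64-bit: BD %llu MB, BT %llu KB\n", bd64 >> 20, bt64 >> 10);
        printf("32-bit: BD %llu MB, BT %llu KB\n", bd32 >> 20, bt32 >> 10);
        return 0;
}
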
*/ -static int allocate_bt(long __user *bd_entry) +static int allocate_bt(struct mm_struct *mm, long __user *bd_entry) { unsigned long expected_old_val = 0; unsigned long actual_old_val = 0; unsigned long bt_addr; + unsigned long bd_new_entry; int ret = 0; /* * Carve the virtual space out of userspace for the new * bounds table: */ - bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES); + bt_addr = mpx_mmap(mpx_bt_size_bytes(mm)); if (IS_ERR((void *)bt_addr)) return PTR_ERR((void *)bt_addr); /* * Set the valid flag (kinda like _PAGE_PRESENT in a pte) */ - bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + bd_new_entry = bt_addr | MPX_BD_ENTRY_VALID_FLAG; /* * Go poke the address of the new bounds table in to the @@ -434,8 +440,8 @@ static int allocate_bt(long __user *bd_entry) * mmap_sem at this point, unlike some of the other part * of the MPX code that have to pagefault_disable(). */ - ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, - expected_old_val, bt_addr); + ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, bd_entry, + expected_old_val, bd_new_entry); if (ret) goto out_unmap; @@ -463,9 +469,10 @@ static int allocate_bt(long __user *bd_entry) ret = -EINVAL; goto out_unmap; } + trace_mpx_new_bounds_table(bt_addr); return 0; out_unmap: - vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES); + vm_munmap(bt_addr, mpx_bt_size_bytes(mm)); return ret; } @@ -480,12 +487,13 @@ out_unmap: * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, * and the size of each bound table is 4MB. */ -static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) +static int do_mpx_bt_fault(void) { unsigned long bd_entry, bd_base; - struct bndcsr *bndcsr; + const struct mpx_bndcsr *bndcsr; + struct mm_struct *mm = current->mm; - bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); if (!bndcsr) return -EINVAL; /* @@ -502,13 +510,13 @@ static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) * the directory is. */ if ((bd_entry < bd_base) || - (bd_entry >= bd_base + MPX_BD_SIZE_BYTES)) + (bd_entry >= bd_base + mpx_bd_size_bytes(mm))) return -EINVAL; - return allocate_bt((long __user *)bd_entry); + return allocate_bt(mm, (long __user *)bd_entry); } -int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +int mpx_handle_bd_fault(void) { /* * Userspace never asked us to manage the bounds tables, @@ -517,7 +525,7 @@ int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) if (!kernel_managing_mpx_tables(current->mm)) return -EINVAL; - if (do_mpx_bt_fault(xsave_buf)) { + if (do_mpx_bt_fault()) { force_sig(SIGSEGV, current); /* * The force_sig() is essentially "handling" this @@ -554,29 +562,78 @@ static int mpx_resolve_fault(long __user *addr, int write) return 0; } +static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm, + unsigned long bd_entry) +{ + unsigned long bt_addr = bd_entry; + int align_to_bytes; + /* + * Bit 0 in a bt_entry is always the valid bit. + */ + bt_addr &= ~MPX_BD_ENTRY_VALID_FLAG; + /* + * Tables are naturally aligned at 8-byte boundaries + * on 64-bit and 4-byte boundaries on 32-bit. The + * documentation makes it appear that the low bits + * are ignored by the hardware, so we do the same. + */ + if (is_64bit_mm(mm)) + align_to_bytes = 8; + else + align_to_bytes = 4; + bt_addr &= ~(align_to_bytes-1); + return bt_addr; +} + +/* + * We only want to do a 4-byte get_user() on 32-bit. Otherwise, + * we might run off the end of the bounds table if we are on + * a 64-bit kernel and try to get 8 bytes. 
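
get_user_bd_entry(), defined just below, narrows the directory-entry read to 4 bytes for a 32-bit mm because get_user() sizes the access from the pointer type. The user-space sketch below imitates that width selection with memcpy() stand-ins; the helper name and the sample bytes are invented for illustration.

/* Width-dependent read, mirroring the 4-byte vs 8-byte split below. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static int read_bd_entry(int is_64bit_mm, const void *bd_entry_ptr,
                         unsigned long long *out)
{
        if (is_64bit_mm) {
                uint64_t v;
                memcpy(&v, bd_entry_ptr, sizeof(v));  /* 8-byte read */
                *out = v;
        } else {
                uint32_t v;
                memcpy(&v, bd_entry_ptr, sizeof(v));  /* 4-byte read */
                *out = v;
        }
        return 0;
}

int main(void)
{
        unsigned char raw[8] = { 0x01, 0x10, 0x00, 0x40, 0xff, 0xff, 0xff, 0xff };
        unsigned long long v;

        read_bd_entry(0, raw, &v);
        printf("32-bit mm read: 0x%llx\n", v);   /* 0x40001001 on little-endian */
        read_bd_entry(1, raw, &v);
        printf("64-bit mm read: 0x%llx\n", v);   /* 0xffffffff40001001          */
        return 0;
}
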
+ */ +int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, + long __user *bd_entry_ptr) +{ + u32 bd_entry_32; + int ret; + + if (is_64bit_mm(mm)) + return get_user(*bd_entry_ret, bd_entry_ptr); + + /* + * Note that get_user() uses the type of the *pointer* to + * establish the size of the get, not the destination. + */ + ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr); + *bd_entry_ret = bd_entry_32; + return ret; +} + /* * Get the base of bounds tables pointed by specific bounds * directory entry. */ static int get_bt_addr(struct mm_struct *mm, - long __user *bd_entry, unsigned long *bt_addr) + long __user *bd_entry_ptr, + unsigned long *bt_addr_result) { int ret; int valid_bit; + unsigned long bd_entry; + unsigned long bt_addr; - if (!access_ok(VERIFY_READ, (bd_entry), sizeof(*bd_entry))) + if (!access_ok(VERIFY_READ, (bd_entry_ptr), sizeof(*bd_entry_ptr))) return -EFAULT; while (1) { int need_write = 0; pagefault_disable(); - ret = get_user(*bt_addr, bd_entry); + ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr); pagefault_enable(); if (!ret) break; if (ret == -EFAULT) - ret = mpx_resolve_fault(bd_entry, need_write); + ret = mpx_resolve_fault(bd_entry_ptr, need_write); /* * If we could not resolve the fault, consider it * userspace's fault and error out. @@ -585,8 +642,8 @@ static int get_bt_addr(struct mm_struct *mm, return ret; } - valid_bit = *bt_addr & MPX_BD_ENTRY_VALID_FLAG; - *bt_addr &= MPX_BT_ADDR_MASK; + valid_bit = bd_entry & MPX_BD_ENTRY_VALID_FLAG; + bt_addr = mpx_bd_entry_to_bt_addr(mm, bd_entry); /* * When the kernel is managing bounds tables, a bounds directory @@ -595,7 +652,7 @@ static int get_bt_addr(struct mm_struct *mm, * data in the address field, we know something is wrong. This * -EINVAL return will cause a SIGSEGV. */ - if (!valid_bit && *bt_addr) + if (!valid_bit && bt_addr) return -EINVAL; /* * Do we have an completely zeroed bt entry? That is OK. It @@ -606,19 +663,112 @@ static int get_bt_addr(struct mm_struct *mm, if (!valid_bit) return -ENOENT; + *bt_addr_result = bt_addr; return 0; } +static inline int bt_entry_size_bytes(struct mm_struct *mm) +{ + if (is_64bit_mm(mm)) + return MPX_BT_ENTRY_BYTES_64; + else + return MPX_BT_ENTRY_BYTES_32; +} + +/* + * Take a virtual address and turns it in to the offset in bytes + * inside of the bounds table where the bounds table entry + * controlling 'addr' can be found. + */ +static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm, + unsigned long addr) +{ + unsigned long bt_table_nr_entries; + unsigned long offset = addr; + + if (is_64bit_mm(mm)) { + /* Bottom 3 bits are ignored on 64-bit */ + offset >>= 3; + bt_table_nr_entries = MPX_BT_NR_ENTRIES_64; + } else { + /* Bottom 2 bits are ignored on 32-bit */ + offset >>= 2; + bt_table_nr_entries = MPX_BT_NR_ENTRIES_32; + } + /* + * We know the size of the table in to which we are + * indexing, and we have eliminated all the low bits + * which are ignored for indexing. + * + * Mask out all the high bits which we do not need + * to index in to the table. Note that the tables + * are always powers of two so this gives us a proper + * mask. + */ + offset &= (bt_table_nr_entries-1); + /* + * We now have an entry offset in terms of *entries* in + * the table. We need to scale it back up to bytes. + */ + offset *= bt_entry_size_bytes(mm); + return offset; +} + +/* + * How much virtual address space does a single bounds + * directory entry cover? + * + * Note, we need a long long because 4GB doesn't fit in + * to a long on 32-bit. 
+ */ +static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) +{ + unsigned long long virt_space; + unsigned long long GB = (1ULL << 30); + + /* + * This covers 32-bit emulation as well as 32-bit kernels + * running on 64-bit harware. + */ + if (!is_64bit_mm(mm)) + return (4ULL * GB) / MPX_BD_NR_ENTRIES_32; + + /* + * 'x86_virt_bits' returns what the hardware is capable + * of, and returns the full >32-bit adddress space when + * running 32-bit kernels on 64-bit hardware. + */ + virt_space = (1ULL << boot_cpu_data.x86_virt_bits); + return virt_space / MPX_BD_NR_ENTRIES_64; +} + /* * Free the backing physical pages of bounds table 'bt_addr'. * Assume start...end is within that bounds table. */ -static int zap_bt_entries(struct mm_struct *mm, +static noinline int zap_bt_entries_mapping(struct mm_struct *mm, unsigned long bt_addr, - unsigned long start, unsigned long end) + unsigned long start_mapping, unsigned long end_mapping) { struct vm_area_struct *vma; unsigned long addr, len; + unsigned long start; + unsigned long end; + + /* + * if we 'end' on a boundary, the offset will be 0 which + * is not what we want. Back it up a byte to get the + * last bt entry. Then once we have the entry itself, + * move 'end' back up by the table entry size. + */ + start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping); + end = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1); + /* + * Move end back up by one entry. Among other things + * this ensures that it remains page-aligned and does + * not screw up zap_page_range() + */ + end += bt_entry_size_bytes(mm); /* * Find the first overlapping vma. If vma->vm_start > start, there @@ -630,7 +780,7 @@ static int zap_bt_entries(struct mm_struct *mm, return -EINVAL; /* - * A NUMA policy on a VM_MPX VMA could cause this bouds table to + * A NUMA policy on a VM_MPX VMA could cause this bounds table to * be split. So we need to look across the entire 'start -> end' * range of this bounds table, find all of the VM_MPX VMAs, and * zap only those. @@ -648,27 +798,65 @@ static int zap_bt_entries(struct mm_struct *mm, len = min(vma->vm_end, end) - addr; zap_page_range(vma, addr, len, NULL); + trace_mpx_unmap_zap(addr, addr+len); vma = vma->vm_next; addr = vma->vm_start; } - return 0; } -static int unmap_single_bt(struct mm_struct *mm, +static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm, + unsigned long addr) +{ + /* + * There are several ways to derive the bd offsets. We + * use the following approach here: + * 1. We know the size of the virtual address space + * 2. We know the number of entries in a bounds table + * 3. We know that each entry covers a fixed amount of + * virtual address space. + * So, we can just divide the virtual address by the + * virtual space used by one entry to determine which + * entry "controls" the given virtual address. + */ + if (is_64bit_mm(mm)) { + int bd_entry_size = 8; /* 64-bit pointer */ + /* + * Take the 64-bit addressing hole in to account. + */ + addr &= ((1UL << boot_cpu_data.x86_virt_bits) - 1); + return (addr / bd_entry_virt_space(mm)) * bd_entry_size; + } else { + int bd_entry_size = 4; /* 32-bit pointer */ + /* + * 32-bit has no hole so this case needs no mask + */ + return (addr / bd_entry_virt_space(mm)) * bd_entry_size; + } + /* + * The two return calls above are exact copies. If we + * pull out a single copy and put it in here, gcc won't + * realize that we're doing a power-of-2 divide and use + * shifts. It uses a real divide. 
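
bd_entry_virt_space() and mpx_get_bd_entry_offset() above reduce to: each directory entry covers (virtual address space / number of directory entries) bytes, so the byte offset of the entry for an address is (addr / per-entry span) * entry size. The sketch below redoes that arithmetic in user space; the 48 virtual bits and the entry counts are assumptions of the sketch.

/* User-space redo of the bounds-directory offset arithmetic above. */
#include <stdint.h>
#include <stdio.h>

static unsigned long long bd_entry_virt_space(int is_64bit)
{
        if (!is_64bit)
                return (4ULL << 30) / (1ULL << 20);     /* 4 KB per entry */
        return (1ULL << 48) / (1ULL << 28);             /* 1 MB per entry */
}

static unsigned long long bd_entry_offset(uint64_t addr, int is_64bit)
{
        int entry_bytes = is_64bit ? 8 : 4;

        if (is_64bit)
                addr &= (1ULL << 48) - 1;       /* mask the canonical hole */
        return (addr / bd_entry_virt_space(is_64bit)) * entry_bytes;
}

int main(void)
{
        printf("64-bit: 0x00007f0000001000 -> bd offset 0x%llx\n",
               bd_entry_offset(0x00007f0000001000ULL, 1));
        printf("32-bit: 0xb7701000         -> bd offset 0x%llx\n",
               bd_entry_offset(0xb7701000ULL, 0));
        return 0;
}
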
If we put them up + * there, it manages to figure it out (gcc 4.8.3). + */ +} + +static int unmap_entire_bt(struct mm_struct *mm, long __user *bd_entry, unsigned long bt_addr) { unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; - unsigned long actual_old_val = 0; + unsigned long uninitialized_var(actual_old_val); int ret; while (1) { int need_write = 1; + unsigned long cleared_bd_entry = 0; pagefault_disable(); - ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, - expected_old_val, 0); + ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, + bd_entry, expected_old_val, cleared_bd_entry); pagefault_enable(); if (!ret) break; @@ -687,9 +875,8 @@ static int unmap_single_bt(struct mm_struct *mm, if (actual_old_val != expected_old_val) { /* * Someone else raced with us to unmap the table. - * There was no bounds table pointed to by the - * directory, so declare success. Somebody freed - * it. + * That is OK, since we were both trying to do + * the same thing. Declare success. */ if (!actual_old_val) return 0; @@ -702,176 +889,113 @@ static int unmap_single_bt(struct mm_struct *mm, */ return -EINVAL; } - /* * Note, we are likely being called under do_munmap() already. To * avoid recursion, do_munmap() will check whether it comes * from one bounds table through VM_MPX flag. */ - return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES); + return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm)); } -/* - * If the bounds table pointed by bounds directory 'bd_entry' is - * not shared, unmap this whole bounds table. Otherwise, only free - * those backing physical pages of bounds table entries covered - * in this virtual address region start...end. - */ -static int unmap_shared_bt(struct mm_struct *mm, - long __user *bd_entry, unsigned long start, - unsigned long end, bool prev_shared, bool next_shared) +static int try_unmap_single_bt(struct mm_struct *mm, + unsigned long start, unsigned long end) { - unsigned long bt_addr; - int ret; - - ret = get_bt_addr(mm, bd_entry, &bt_addr); + struct vm_area_struct *next; + struct vm_area_struct *prev; /* - * We could see an "error" ret for not-present bounds - * tables (not really an error), or actual errors, but - * stop unmapping either way. + * "bta" == Bounds Table Area: the area controlled by the + * bounds table that we are unmapping. */ - if (ret) - return ret; - - if (prev_shared && next_shared) - ret = zap_bt_entries(mm, bt_addr, - bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), - bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); - else if (prev_shared) - ret = zap_bt_entries(mm, bt_addr, - bt_addr+MPX_GET_BT_ENTRY_OFFSET(start), - bt_addr+MPX_BT_SIZE_BYTES); - else if (next_shared) - ret = zap_bt_entries(mm, bt_addr, bt_addr, - bt_addr+MPX_GET_BT_ENTRY_OFFSET(end)); - else - ret = unmap_single_bt(mm, bd_entry, bt_addr); - - return ret; -} - -/* - * A virtual address region being munmap()ed might share bounds table - * with adjacent VMAs. We only need to free the backing physical - * memory of these shared bounds tables entries covered in this virtual - * address region. 
- */ -static int unmap_edge_bts(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ + unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1); + unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm); + unsigned long uninitialized_var(bt_addr); + void __user *bde_vaddr; int ret; - long __user *bde_start, *bde_end; - struct vm_area_struct *prev, *next; - bool prev_shared = false, next_shared = false; - - bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); - bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); - /* - * Check whether bde_start and bde_end are shared with adjacent - * VMAs. - * - * We already unliked the VMAs from the mm's rbtree so 'start' + * We already unlinked the VMAs from the mm's rbtree so 'start' * is guaranteed to be in a hole. This gets us the first VMA * before the hole in to 'prev' and the next VMA after the hole * in to 'next'. */ next = find_vma_prev(mm, start, &prev); - if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1)) - == bde_start) - prev_shared = true; - if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start)) - == bde_end) - next_shared = true; - /* - * This virtual address region being munmap()ed is only - * covered by one bounds table. - * - * In this case, if this table is also shared with adjacent - * VMAs, only part of the backing physical memory of the bounds - * table need be freeed. Otherwise the whole bounds table need - * be unmapped. - */ - if (bde_start == bde_end) { - return unmap_shared_bt(mm, bde_start, start, end, - prev_shared, next_shared); + * Do not count other MPX bounds table VMAs as neighbors. + * Although theoretically possible, we do not allow bounds + * tables for bounds tables so our heads do not explode. + * If we count them as neighbors here, we may end up with + * lots of tables even though we have no actual table + * entries in use. + */ + while (next && (next->vm_flags & VM_MPX)) + next = next->vm_next; + while (prev && (prev->vm_flags & VM_MPX)) + prev = prev->vm_prev; + /* + * We know 'start' and 'end' lie within an area controlled + * by a single bounds table. See if there are any other + * VMAs controlled by that bounds table. If there are not + * then we can "expand" the are we are unmapping to possibly + * cover the entire table. + */ + next = find_vma_prev(mm, start, &prev); + if ((!prev || prev->vm_end <= bta_start_vaddr) && + (!next || next->vm_start >= bta_end_vaddr)) { + /* + * No neighbor VMAs controlled by same bounds + * table. Try to unmap the whole thing + */ + start = bta_start_vaddr; + end = bta_end_vaddr; } + bde_vaddr = mm->bd_addr + mpx_get_bd_entry_offset(mm, start); + ret = get_bt_addr(mm, bde_vaddr, &bt_addr); /* - * If more than one bounds tables are covered in this virtual - * address region being munmap()ed, we need to separately check - * whether bde_start and bde_end are shared with adjacent VMAs. + * No bounds table there, so nothing to unmap. */ - ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false); - if (ret) - return ret; - ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared); + if (ret == -ENOENT) { + ret = 0; + return 0; + } if (ret) return ret; - - return 0; + /* + * We are unmapping an entire table. Either because the + * unmap that started this whole process was large enough + * to cover an entire table, or that the unmap was small + * but was the area covered by a bounds table. 
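
mpx_unmap_tables(), a little further down, drives try_unmap_single_bt() one bounds table at a time by splitting the unmapped range at boundaries of the area a single directory entry covers. The user-space sketch below shows just that chunking; the 1 MB chunk size matches the 64-bit per-entry span derived above and is an assumption of the sketch.

/* Chunk [start, end) at per-table boundaries, as the loop below does. */
#include <stdio.h>

#define CHUNK (1UL << 20)
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long start = 0x00f0000, end = 0x0410000;

        while (start < end) {
                unsigned long next = ALIGN_UP(start + 1, CHUNK);
                unsigned long chunk_end = end < next ? end : next;

                printf("unmap [%#lx, %#lx)\n", start, chunk_end);
                start = next;
        }
        return 0;
}
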
+ */ + if ((start == bta_start_vaddr) && + (end == bta_end_vaddr)) + return unmap_entire_bt(mm, bde_vaddr, bt_addr); + return zap_bt_entries_mapping(mm, bt_addr, start, end); } static int mpx_unmap_tables(struct mm_struct *mm, unsigned long start, unsigned long end) { - int ret; - long __user *bd_entry, *bde_start, *bde_end; - unsigned long bt_addr; - - /* - * "Edge" bounds tables are those which are being used by the region - * (start -> end), but that may be shared with adjacent areas. If they - * turn out to be completely unshared, they will be freed. If they are - * shared, we will free the backing store (like an MADV_DONTNEED) for - * areas used by this region. - */ - ret = unmap_edge_bts(mm, start, end); - switch (ret) { - /* non-present tables are OK */ - case 0: - case -ENOENT: - /* Success, or no tables to unmap */ - break; - case -EINVAL: - case -EFAULT: - default: - return ret; - } - - /* - * Only unmap the bounds table that are - * 1. fully covered - * 2. not at the edges of the mapping, even if full aligned - */ - bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start); - bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1); - for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) { - ret = get_bt_addr(mm, bd_entry, &bt_addr); - switch (ret) { - case 0: - break; - case -ENOENT: - /* No table here, try the next one */ - continue; - case -EINVAL: - case -EFAULT: - default: - /* - * Note: we are being strict here. - * Any time we run in to an issue - * unmapping tables, we stop and - * SIGSEGV. - */ - return ret; - } - - ret = unmap_single_bt(mm, bd_entry, bt_addr); + unsigned long one_unmap_start; + trace_mpx_unmap_search(start, end); + + one_unmap_start = start; + while (one_unmap_start < end) { + int ret; + unsigned long next_unmap_start = ALIGN(one_unmap_start+1, + bd_entry_virt_space(mm)); + unsigned long one_unmap_end = end; + /* + * if the end is beyond the current bounds table, + * move it back so we only deal with a single one + * at a time + */ + if (one_unmap_end > next_unmap_start) + one_unmap_end = next_unmap_start; + ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end); if (ret) return ret; - } + one_unmap_start = next_unmap_start; + } return 0; } diff --git a/kernel/arch/x86/mm/numa.c b/kernel/arch/x86/mm/numa.c index 4053bb58b..c3b3f653e 100644 --- a/kernel/arch/x86/mm/numa.c +++ b/kernel/arch/x86/mm/numa.c @@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) bi->start = max(bi->start, low); bi->end = min(bi->end, high); - /* and there's no empty block */ - if (bi->start >= bi->end) + /* and there's no empty or non-exist block */ + if (bi->start >= bi->end || + !memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) numa_remove_memblk_from(i--, mi); } diff --git a/kernel/arch/x86/mm/pageattr-test.c b/kernel/arch/x86/mm/pageattr-test.c index 6629f397b..5f169d5d7 100644 --- a/kernel/arch/x86/mm/pageattr-test.c +++ b/kernel/arch/x86/mm/pageattr-test.c @@ -8,7 +8,9 @@ #include <linux/kthread.h> #include <linux/random.h> #include <linux/kernel.h> +#include <linux/init.h> #include <linux/mm.h> +#include <linux/vmalloc.h> #include <asm/cacheflush.h> #include <asm/pgtable.h> @@ -255,5 +257,4 @@ static int start_pageattr_test(void) return 0; } - -module_init(start_pageattr_test); +device_initcall(start_pageattr_test); diff --git a/kernel/arch/x86/mm/pageattr.c b/kernel/arch/x86/mm/pageattr.c index 89af288ec..b599a780a 100644 --- a/kernel/arch/x86/mm/pageattr.c +++ b/kernel/arch/x86/mm/pageattr.c @@ -4,7 
+4,6 @@ */ #include <linux/highmem.h> #include <linux/bootmem.h> -#include <linux/module.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/interrupt.h> @@ -14,6 +13,7 @@ #include <linux/percpu.h> #include <linux/gfp.h> #include <linux/pci.h> +#include <linux/vmalloc.h> #include <asm/e820.h> #include <asm/processor.h> @@ -33,7 +33,7 @@ struct cpa_data { pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; - int numpages; + unsigned long numpages; int flags; unsigned long pfn; unsigned force_split : 1; @@ -129,16 +129,15 @@ within(unsigned long addr, unsigned long start, unsigned long end) */ void clflush_cache_range(void *vaddr, unsigned int size) { - void *vend = vaddr + size - 1; + unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + void *vend = vaddr + size; + void *p; mb(); - for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) - clflushopt(vaddr); - /* - * Flush any possible final partial cacheline: - */ - clflushopt(vend); + for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + p < vend; p += boot_cpu_data.x86_clflush_size) + clflushopt(p); mb(); } @@ -418,17 +417,31 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr) phys_addr_t phys_addr; unsigned long offset; enum pg_level level; - unsigned long psize; - unsigned long pmask; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); - psize = page_level_size(level); - pmask = page_level_mask(level); - offset = virt_addr & ~pmask; - phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; - return (phys_addr | offset); + + /* + * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t + * before being left-shifted PAGE_SHIFT bits -- this trick is to + * make 32-PAE kernel work correctly. + */ + switch (level) { + case PG_LEVEL_1G: + phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PUD_PAGE_MASK; + break; + case PG_LEVEL_2M: + phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PMD_PAGE_MASK; + break; + default: + phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + offset = virt_addr & ~PAGE_MASK; + } + + return (phys_addr_t)(phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); @@ -461,7 +474,7 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; @@ -481,17 +494,21 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, switch (level) { case PG_LEVEL_2M: -#ifdef CONFIG_X86_64 + old_prot = pmd_pgprot(*(pmd_t *)kpte); + old_pfn = pmd_pfn(*(pmd_t *)kpte); + break; case PG_LEVEL_1G: -#endif - psize = page_level_size(level); - pmask = page_level_mask(level); + old_prot = pud_pgprot(*(pud_t *)kpte); + old_pfn = pud_pfn(*(pud_t *)kpte); break; default: do_split = -EINVAL; goto out_unlock; } + psize = page_level_size(level); + pmask = page_level_mask(level); + /* * Calculate the number of pages, which fit into this large * page starting at address: @@ -507,7 +524,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * up accordingly. 
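
The clflush_cache_range() rewrite earlier in this hunk aligns the start address down to a cache-line boundary and then steps one line at a time until it passes the end, which removes the separate trailing-line flush. A user-space sketch of the same walk follows; the 64-byte line size and the flush_line() stand-in (for clflushopt on boot_cpu_data.x86_clflush_size lines) are assumptions of the sketch.

/* Cache-line walk mirroring the rewritten clflush_cache_range(). */
#include <stdint.h>
#include <stdio.h>

#define LINE_SIZE 64UL

static void flush_line(void *p)
{
        printf("flush %p\n", p);        /* stand-in for clflushopt(p) */
}

static void flush_range(void *vaddr, unsigned int size)
{
        uintptr_t mask = LINE_SIZE - 1;
        char *p = (char *)((uintptr_t)vaddr & ~mask);
        char *vend = (char *)vaddr + size;

        for (; p < vend; p += LINE_SIZE)
                flush_line(p);
}

int main(void)
{
        static char buf[256];

        /* Flushes every line that overlaps buf[10..119]. */
        flush_range(buf + 10, 110);
        return 0;
}
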
*/ old_pte = *kpte; - old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); + req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); @@ -533,10 +550,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, req_prot = canon_pgprot(req_prot); /* - * old_pte points to the large page base address. So we need + * old_pfn points to the large page base pfn. So we need * to add the offset of the virtual address: */ - pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); + pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; new_prot = static_protections(req_prot, address, pfn); @@ -547,7 +564,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * the pages in the range we try to preserve: */ addr = address & pmask; - pfn = pte_pfn(old_pte); + pfn = old_pfn; for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { pgprot_t chk_prot = static_protections(req_prot, addr, pfn); @@ -577,7 +594,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * The address is aligned and the number of pages * covers the full page. */ - new_pte = pfn_pte(pte_pfn(old_pte), new_prot); + new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; do_split = 0; @@ -594,7 +611,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { pte_t *pbase = (pte_t *)page_address(base); - unsigned long pfn, pfninc = 1; + unsigned long ref_pfn, pfn, pfninc = 1; unsigned int i, level; pte_t *tmp; pgprot_t ref_prot; @@ -611,26 +628,33 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, } paravirt_alloc_pte(&init_mm, page_to_pfn(base)); - ref_prot = pte_pgprot(pte_clrhuge(*kpte)); - /* promote PAT bit to correct position */ - if (level == PG_LEVEL_2M) + switch (level) { + case PG_LEVEL_2M: + ref_prot = pmd_pgprot(*(pmd_t *)kpte); + /* clear PSE and promote PAT bit to correct position */ ref_prot = pgprot_large_2_4k(ref_prot); + ref_pfn = pmd_pfn(*(pmd_t *)kpte); + break; -#ifdef CONFIG_X86_64 - if (level == PG_LEVEL_1G) { + case PG_LEVEL_1G: + ref_prot = pud_pgprot(*(pud_t *)kpte); + ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + /* - * Set the PSE flags only if the PRESENT flag is set + * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present/pmd_huge will return true * even on a non present pmd. */ - if (pgprot_val(ref_prot) & _PAGE_PRESENT) - pgprot_val(ref_prot) |= _PAGE_PSE; - else + if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; + break; + + default: + spin_unlock(&pgd_lock); + return 1; } -#endif /* * Set the GLOBAL flags only if the PRESENT flag is set @@ -646,13 +670,16 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, /* * Get the target pfn from the original entry: */ - pfn = pte_pfn(*kpte); + pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); - if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), - PFN_DOWN(__pa(address)) + 1)) - split_page_count(level); + if (virt_addr_valid(address)) { + unsigned long pfn = PFN_DOWN(__pa(address)); + + if (pfn_range_is_mapped(pfn, pfn + 1)) + split_page_count(level); + } /* * Install the new, split up pagetable. @@ -1324,7 +1351,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * CPA operation. 
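/*
 * Illustrative arithmetic, not part of the patch above: with the
 * change, try_preserve_large_page() takes old_pfn straight from
 * pmd_pfn()/pud_pfn() and computes the pfn of the 4 KiB page that
 * contains 'address' as old_pfn plus the page index inside the large
 * page.  Worked example for a 2 MiB mapping (psize = 1 << 21,
 * PAGE_SHIFT = 12); the addresses are made up but 2 MiB aligned.
 */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long psize   = 1UL << 21;              /* PG_LEVEL_2M */
        unsigned long old_pfn = 0x40000;                /* base pfn of the 2M page */
        unsigned long address = 0x80000000UL + 0x5a000; /* inside that page */
        unsigned long pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);

        printf("pfn = %#lx\n", pfn);    /* 0x40000 + 0x5a = 0x4005a */
        return 0;
}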
Either a large page has been * preserved or a single page update happened. */ - BUG_ON(cpa->numpages > numpages); + BUG_ON(cpa->numpages > numpages || !cpa->numpages); numpages -= cpa->numpages; if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) cpa->curpage++; @@ -1468,6 +1495,9 @@ int _set_memory_uc(unsigned long addr, int numpages) { /* * for now UC MINUS. see comments in ioremap_nocache() + * If you really need strong UC use ioremap_uc(), but note + * that you cannot override IO areas with set_memory_*() as + * these helpers cannot work with IO memory. */ return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), @@ -1502,12 +1532,10 @@ EXPORT_SYMBOL(set_memory_uc); static int _set_memory_array(unsigned long *addr, int addrinarray, enum page_cache_mode new_type) { + enum page_cache_mode set_type; int i, j; int ret; - /* - * for now UC MINUS. see comments in ioremap_nocache() - */ for (i = 0; i < addrinarray; i++) { ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, new_type, NULL); @@ -1515,9 +1543,12 @@ static int _set_memory_array(unsigned long *addr, int addrinarray, goto out_free; } + /* If WC, set to UC- first and then WC */ + set_type = (new_type == _PAGE_CACHE_MODE_WC) ? + _PAGE_CACHE_MODE_UC_MINUS : new_type; + ret = change_page_attr_set(addr, addrinarray, - cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), - 1); + cachemode2pgprot(set_type), 1); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(addr, addrinarray, @@ -1549,6 +1580,12 @@ int set_memory_array_wc(unsigned long *addr, int addrinarray) } EXPORT_SYMBOL(set_memory_array_wc); +int set_memory_array_wt(unsigned long *addr, int addrinarray) +{ + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT); +} +EXPORT_SYMBOL_GPL(set_memory_array_wt); + int _set_memory_wc(unsigned long addr, int numpages) { int ret; @@ -1571,27 +1608,42 @@ int set_memory_wc(unsigned long addr, int numpages) { int ret; - if (!pat_enabled) - return set_memory_uc(addr, numpages); - ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_WC, NULL); if (ret) - goto out_err; + return ret; ret = _set_memory_wc(addr, numpages); if (ret) - goto out_free; + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); - return 0; - -out_free: - free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); -out_err: return ret; } EXPORT_SYMBOL(set_memory_wc); +int _set_memory_wt(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); +} + +int set_memory_wt(unsigned long addr, int numpages) +{ + int ret; + + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_MODE_WT, NULL); + if (ret) + return ret; + + ret = _set_memory_wt(addr, numpages); + if (ret) + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + + return ret; +} +EXPORT_SYMBOL_GPL(set_memory_wt); + int _set_memory_wb(unsigned long addr, int numpages) { /* WB cache mode is hard wired to all cache attribute bits being 0 */ @@ -1682,6 +1734,7 @@ static int _set_pages_array(struct page **pages, int addrinarray, { unsigned long start; unsigned long end; + enum page_cache_mode set_type; int i; int free_idx; int ret; @@ -1695,8 +1748,12 @@ static int _set_pages_array(struct page **pages, int addrinarray, goto err_out; } + /* If WC, set to UC- first and then WC */ + set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
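/*
 * Illustrative sketch, not part of the patch above: the hunks here
 * export new write-through helpers (set_memory_wt(),
 * set_memory_array_wt()).  A hypothetical driver that wants WT
 * caching on an already-mapped kernel page could use them roughly as
 * below; the page/numpages arguments and error handling are
 * assumptions, and in this tree the declarations live in
 * <asm/cacheflush.h>.
 */
#include <linux/mm.h>
#include <asm/cacheflush.h>

static int make_buffer_wt(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);
        int ret;

        /* reserves the WT memtype and rewrites the kernel PTEs */
        ret = set_memory_wt(addr, numpages);
        if (ret)
                return ret;
        /* ... use the write-through buffer ... */
        return 0;
}

static void make_buffer_wb(struct page *page, int numpages)
{
        /* restore the default write-back type before freeing the pages */
        set_memory_wb((unsigned long)page_address(page), numpages);
}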
+ _PAGE_CACHE_MODE_UC_MINUS : new_type; + ret = cpa_set_pages_array(pages, addrinarray, - cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS)); + cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, addrinarray, cachemode2pgprot( @@ -1730,6 +1787,12 @@ int set_pages_array_wc(struct page **pages, int addrinarray) } EXPORT_SYMBOL(set_pages_array_wc); +int set_pages_array_wt(struct page **pages, int addrinarray) +{ + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT); +} +EXPORT_SYMBOL_GPL(set_pages_array_wt); + int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); diff --git a/kernel/arch/x86/mm/pat.c b/kernel/arch/x86/mm/pat.c index 35af6771a..188e3e07e 100644 --- a/kernel/arch/x86/mm/pat.c +++ b/kernel/arch/x86/mm/pat.c @@ -33,13 +33,17 @@ #include "pat_internal.h" #include "mm_internal.h" -#ifdef CONFIG_X86_PAT -int __read_mostly pat_enabled = 1; +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static bool boot_cpu_done; + +static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); static inline void pat_disable(const char *reason) { - pat_enabled = 0; - printk(KERN_INFO "%s\n", reason); + __pat_enabled = 0; + pr_info("x86/PAT: %s\n", reason); } static int __init nopat(char *str) @@ -48,13 +52,12 @@ static int __init nopat(char *str) return 0; } early_param("nopat", nopat); -#else -static inline void pat_disable(const char *reason) + +bool pat_enabled(void) { - (void)reason; + return !!__pat_enabled; } -#endif - +EXPORT_SYMBOL_GPL(pat_enabled); int pat_debug_enable; @@ -65,22 +68,24 @@ static int __init pat_debug_setup(char *str) } __setup("debugpat", pat_debug_setup); -static u64 __read_mostly boot_pat_state; - #ifdef CONFIG_X86_PAT /* - * X86 PAT uses page flags WC and Uncached together to keep track of - * memory type of pages that have backing page struct. X86 PAT supports 3 - * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and - * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not - * been changed from its default (value of -1 used to denote this). - * Note we do not support _PAGE_CACHE_MODE_UC here. + * X86 PAT uses page flags arch_1 and uncached together to keep track of + * memory type of pages that have backing page struct. + * + * X86 PAT supports 4 different memory types: + * - _PAGE_CACHE_MODE_WB + * - _PAGE_CACHE_MODE_WC + * - _PAGE_CACHE_MODE_UC_MINUS + * - _PAGE_CACHE_MODE_WT + * + * _PAGE_CACHE_MODE_WB is the default type. 
*/ -#define _PGMT_DEFAULT 0 +#define _PGMT_WB 0 #define _PGMT_WC (1UL << PG_arch_1) #define _PGMT_UC_MINUS (1UL << PG_uncached) -#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) #define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) #define _PGMT_CLEAR_MASK (~_PGMT_MASK) @@ -88,14 +93,14 @@ static inline enum page_cache_mode get_page_memtype(struct page *pg) { unsigned long pg_flags = pg->flags & _PGMT_MASK; - if (pg_flags == _PGMT_DEFAULT) - return -1; + if (pg_flags == _PGMT_WB) + return _PAGE_CACHE_MODE_WB; else if (pg_flags == _PGMT_WC) return _PAGE_CACHE_MODE_WC; else if (pg_flags == _PGMT_UC_MINUS) return _PAGE_CACHE_MODE_UC_MINUS; else - return _PAGE_CACHE_MODE_WB; + return _PAGE_CACHE_MODE_WT; } static inline void set_page_memtype(struct page *pg, @@ -112,11 +117,12 @@ static inline void set_page_memtype(struct page *pg, case _PAGE_CACHE_MODE_UC_MINUS: memtype_flags = _PGMT_UC_MINUS; break; - case _PAGE_CACHE_MODE_WB: - memtype_flags = _PGMT_WB; + case _PAGE_CACHE_MODE_WT: + memtype_flags = _PGMT_WT; break; + case _PAGE_CACHE_MODE_WB: default: - memtype_flags = _PGMT_DEFAULT; + memtype_flags = _PGMT_WB; break; } @@ -174,78 +180,154 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) * configuration. * Using lower indices is preferred, so we start with highest index. */ -void pat_init_cache_modes(void) +void pat_init_cache_modes(u64 pat) { - int i; enum page_cache_mode cache; char pat_msg[33]; - u64 pat; + int i; - rdmsrl(MSR_IA32_CR_PAT, pat); pat_msg[32] = 0; for (i = 7; i >= 0; i--) { cache = pat_get_cache_mode((pat >> (i * 8)) & 7, pat_msg + 4 * i); update_cache_mode_entry(i, cache); } - pr_info("PAT configuration [0-7]: %s\n", pat_msg); + pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); } #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) -void pat_init(void) +static void pat_bsp_init(u64 pat) { - u64 pat; - bool boot_cpu = !boot_pat_state; + u64 tmp_pat; - if (!pat_enabled) + if (!cpu_has_pat) { + pat_disable("PAT not supported by CPU."); return; + } - if (!cpu_has_pat) { - if (!boot_pat_state) { - pat_disable("PAT not supported by CPU."); - return; - } else { - /* - * If this happens we are on a secondary CPU, but - * switched to PAT on the boot CPU. We have no way to - * undo PAT. - */ - printk(KERN_ERR "PAT enabled, " - "but not supported by secondary CPU\n"); - BUG(); - } + if (!pat_enabled()) + goto done; + + rdmsrl(MSR_IA32_CR_PAT, tmp_pat); + if (!tmp_pat) { + pat_disable("PAT MSR is 0, disabled."); + return; } - /* Set PWT to Write-Combining. All other bits stay the same */ - /* - * PTE encoding used in Linux: - * PAT - * |PCD - * ||PWT - * ||| - * 000 WB _PAGE_CACHE_WB - * 001 WC _PAGE_CACHE_WC - * 010 UC- _PAGE_CACHE_UC_MINUS - * 011 UC _PAGE_CACHE_UC - * PAT bit unused - */ - pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); - - /* Boot CPU check */ - if (!boot_pat_state) { - rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); - if (!boot_pat_state) { - pat_disable("PAT read returns always zero, disabled."); - return; - } + wrmsrl(MSR_IA32_CR_PAT, pat); + +done: + pat_init_cache_modes(pat); +} + +static void pat_ap_init(u64 pat) +{ + if (!pat_enabled()) + return; + + if (!cpu_has_pat) { + /* + * If this happens we are on a secondary CPU, but switched to + * PAT on the boot CPU. We have no way to undo PAT. 
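/*
 * Illustrative sketch, not part of the patch above: the RAM memtype
 * tracking now packs one of four cache modes (WB default, WC, UC-,
 * WT) into the two page flags PG_uncached and PG_arch_1, matching
 * the _PGMT_* defines in the hunk.  A small userspace model of the
 * same two-bit encoding (bit positions here are arbitrary) shows the
 * four states round-trip through encode/decode.
 */
#include <stdio.h>

enum mode { WB, WC, UC_MINUS, WT };

#define BIT_ARCH_1    (1UL << 0)
#define BIT_UNCACHED  (1UL << 1)
#define PGMT_MASK     (BIT_ARCH_1 | BIT_UNCACHED)

static unsigned long encode(unsigned long flags, enum mode m)
{
        unsigned long bits =
                m == WC       ? BIT_ARCH_1 :
                m == UC_MINUS ? BIT_UNCACHED :
                m == WT       ? (BIT_UNCACHED | BIT_ARCH_1) : 0; /* WB = 00 */
        return (flags & ~PGMT_MASK) | bits;
}

static enum mode decode(unsigned long flags)
{
        switch (flags & PGMT_MASK) {
        case BIT_ARCH_1:                return WC;
        case BIT_UNCACHED:              return UC_MINUS;
        case BIT_UNCACHED | BIT_ARCH_1: return WT;
        default:                        return WB;
        }
}

int main(void)
{
        for (enum mode m = WB; m <= WT; m++)
                printf("mode %d -> flags %#lx -> mode %d\n",
                       m, encode(0, m), decode(encode(0, m)));
        return 0;
}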
+ */ + panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); } wrmsrl(MSR_IA32_CR_PAT, pat); +} + +void pat_init(void) +{ + u64 pat; + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!pat_enabled()) { + /* + * No PAT. Emulate the PAT table that corresponds to the two + * cache bits, PWT (Write Through) and PCD (Cache Disable). This + * setup is the same as the BIOS default setup when the system + * has PAT but the "nopat" boot option has been specified. This + * emulated PAT table is used when MSR_IA32_CR_PAT returns 0. + * + * PTE encoding: + * + * PCD + * |PWT PAT + * || slot + * 00 0 WB : _PAGE_CACHE_MODE_WB + * 01 1 WT : _PAGE_CACHE_MODE_WT + * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 11 3 UC : _PAGE_CACHE_MODE_UC + * + * NOTE: When WC or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. + */ + pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); - if (boot_cpu) - pat_init_cache_modes(); + } else if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + /* + * PAT support with the lower four entries. Intel Pentium 2, + * 3, M, and 4 are affected by PAT errata, which makes the + * upper four entries unusable. To be on the safe side, we don't + * use those. + * + * PTE encoding: + * PAT + * |PCD + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC + * PAT bit unused + * + * NOTE: When WT or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. + */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); + } else { + /* + * Full PAT support. We put WT in slot 7 to improve + * robustness in the presence of errata that might cause + * the high PAT bit to be ignored. This way, a buggy slot 7 + * access will hit slot 3, and slot 3 is UC, so at worst + * we lose performance without causing a correctness issue. + * Pentium 4 erratum N46 is an example for such an erratum, + * although we try not to use PAT at all on affected CPUs. + * + * PTE encoding: + * PAT + * |PCD + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC + * 100 4 WB : Reserved + * 101 5 WC : Reserved + * 110 6 UC-: Reserved + * 111 7 WT : _PAGE_CACHE_MODE_WT + * + * The reserved slots are unused, but mapped to their + * corresponding types in the presence of PAT errata. + */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); + } + + if (!boot_cpu_done) { + pat_bsp_init(pat); + boot_cpu_done = true; + } else { + pat_ap_init(pat); + } } #undef PAT @@ -267,9 +349,9 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, * request is for WB. */ if (req_type == _PAGE_CACHE_MODE_WB) { - u8 mtrr_type; + u8 mtrr_type, uniform; - mtrr_type = mtrr_type_lookup(start, end); + mtrr_type = mtrr_type_lookup(start, end, &uniform); if (mtrr_type != MTRR_TYPE_WRBACK) return _PAGE_CACHE_MODE_UC_MINUS; @@ -324,9 +406,14 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) /* * For RAM pages, we use page flags to mark the pages with appropriate type. 
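/*
 * Illustrative arithmetic, not part of the patch above: the PAT(x, y)
 * macro places the architectural encoding of memory type y (UC=0,
 * WC=1, WT=4, WP=5, WB=6, UC- =7 per the SDM) into byte x of the
 * 64-bit IA32_PAT MSR image.  This prints the value built by the
 * "full PAT support" table above, which puts WT in slot 7.
 */
#include <stdio.h>
#include <stdint.h>

enum { PAT_UC = 0, PAT_WC = 1, PAT_WT = 4, PAT_WP = 5,
       PAT_WB = 6, PAT_UC_MINUS = 7 };

#define PAT(x, y) ((uint64_t)PAT_ ## y << ((x) * 8))

int main(void)
{
        uint64_t pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) |
                       PAT(3, UC) | PAT(4, WB) | PAT(5, WC) |
                       PAT(6, UC_MINUS) | PAT(7, WT);

        /* expected: 0x0407010600070106 */
        printf("IA32_PAT = %#018llx\n", (unsigned long long)pat);
        return 0;
}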
- * Here we do two pass: - * - Find the memtype of all the pages in the range, look for any conflicts - * - In case of no conflicts, set the new memtype for pages in the range + * The page flags are limited to four types, WB (default), WC, WT and UC-. + * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting + * a new memory type is only allowed for a page mapped with the default WB + * type. + * + * Here we do two passes: + * - Find the memtype of all the pages in the range, look for any conflicts. + * - In case of no conflicts, set the new memtype for pages in the range. */ static int reserve_ram_pages_type(u64 start, u64 end, enum page_cache_mode req_type, @@ -335,6 +422,12 @@ static int reserve_ram_pages_type(u64 start, u64 end, struct page *page; u64 pfn; + if (req_type == _PAGE_CACHE_MODE_WP) { + if (new_type) + *new_type = _PAGE_CACHE_MODE_UC_MINUS; + return -EINVAL; + } + if (req_type == _PAGE_CACHE_MODE_UC) { /* We do not support strong UC */ WARN_ON_ONCE(1); @@ -346,8 +439,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, page = pfn_to_page(pfn); type = get_page_memtype(page); - if (type != -1) { - pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", + if (type != _PAGE_CACHE_MODE_WB) { + pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", start, end - 1, type, req_type); if (new_type) *new_type = type; @@ -373,7 +466,7 @@ static int free_ram_pages_type(u64 start, u64 end) for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - set_page_memtype(page, -1); + set_page_memtype(page, _PAGE_CACHE_MODE_WB); } return 0; } @@ -384,6 +477,7 @@ static int free_ram_pages_type(u64 start, u64 end) * - _PAGE_CACHE_MODE_WC * - _PAGE_CACHE_MODE_UC_MINUS * - _PAGE_CACHE_MODE_UC + * - _PAGE_CACHE_MODE_WT * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return @@ -400,14 +494,10 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, BUG_ON(start >= end); /* end is exclusive */ - if (!pat_enabled) { + if (!pat_enabled()) { /* This is identical to page table setting without PAT */ - if (new_type) { - if (req_type == _PAGE_CACHE_MODE_WC) - *new_type = _PAGE_CACHE_MODE_UC_MINUS; - else - *new_type = req_type; - } + if (new_type) + *new_type = req_type; return 0; } @@ -451,9 +541,9 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, err = rbt_memtype_check_insert(new, new_type); if (err) { - printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", - start, end - 1, - cattr_name(new->type), cattr_name(req_type)); + pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); @@ -475,7 +565,7 @@ int free_memtype(u64 start, u64 end) int is_range_ram; struct memtype *entry; - if (!pat_enabled) + if (!pat_enabled()) return 0; /* Low ISA region is always mapped WB. 
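/*
 * Illustrative sketch, not part of the patch above: reserve_memtype()
 * and free_memtype() (shown here) are always used in pairs by the x86
 * mapping helpers.  A minimal, hypothetical in-kernel caller asking
 * for WC would look roughly like this; the physical range is made up,
 * and the caller has to honour the possibly downgraded type returned
 * through new_type before mapping with cachemode2pgprot().
 */
#include <linux/types.h>
#include <asm/pat.h>
#include <asm/pgtable_types.h>

static int claim_range_wc(u64 start, u64 end)
{
        enum page_cache_mode new_type;
        int ret;

        ret = reserve_memtype(start, end, _PAGE_CACHE_MODE_WC, &new_type);
        if (ret)
                return ret;                     /* conflicting reservation */

        if (new_type != _PAGE_CACHE_MODE_WC) {
                /* downgraded (e.g. by MTRRs); back out rather than mis-map */
                free_memtype(start, end);
                return -EINVAL;
        }
        /* ... map using cachemode2pgprot(new_type), free_memtype() on teardown ... */
        return 0;
}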
No need to track */ @@ -497,8 +587,8 @@ int free_memtype(u64 start, u64 end) spin_unlock(&memtype_lock); if (!entry) { - printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", - current->comm, current->pid, start, end - 1); + pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); return -EINVAL; } @@ -517,7 +607,7 @@ int free_memtype(u64 start, u64 end) * Only to be called when PAT is enabled * * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS - * or _PAGE_CACHE_MODE_UC + * or _PAGE_CACHE_MODE_WT. */ static enum page_cache_mode lookup_memtype(u64 paddr) { @@ -529,16 +619,9 @@ static enum page_cache_mode lookup_memtype(u64 paddr) if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { struct page *page; - page = pfn_to_page(paddr >> PAGE_SHIFT); - rettype = get_page_memtype(page); - /* - * -1 from get_page_memtype() implies RAM page is in its - * default state and not reserved, and hence of type WB - */ - if (rettype == -1) - rettype = _PAGE_CACHE_MODE_WB; - return rettype; + page = pfn_to_page(paddr >> PAGE_SHIFT); + return get_page_memtype(page); } spin_lock(&memtype_lock); @@ -623,13 +706,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) u64 to = from + size; u64 cursor = from; - if (!pat_enabled) + if (!pat_enabled()) return 1; while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", - current->comm, from, to - 1); + pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; @@ -659,7 +742,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, * caching for the high addresses through the KEN pin, but * we maintain the tradition of paranoia in this code. */ - if (!pat_enabled && + if (!pat_enabled() && !(boot_cpu_has(X86_FEATURE_MTRR) || boot_cpu_has(X86_FEATURE_K6_MTRR) || boot_cpu_has(X86_FEATURE_CYRIX_ARR) || @@ -698,8 +781,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { - printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " - "for [mem %#010Lx-%#010Lx]\n", + pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(pcm), base, (unsigned long long)(base + size-1)); @@ -729,12 +811,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, * the type requested matches the type of first page in the range. 
*/ if (is_ram) { - if (!pat_enabled) + if (!pat_enabled()) return 0; pcm = lookup_memtype(paddr); if (want_pcm != pcm) { - printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", + pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_pcm), (unsigned long long)paddr, @@ -755,13 +837,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, if (strict_prot || !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { free_memtype(paddr, paddr + size); - printk(KERN_ERR "%s:%d map pfn expected mapping type %s" - " for [mem %#010Lx-%#010Lx], got %s\n", - current->comm, current->pid, - cattr_name(want_pcm), - (unsigned long long)paddr, - (unsigned long long)(paddr + size - 1), - cattr_name(pcm)); + pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n", + current->comm, current->pid, + cattr_name(want_pcm), + (unsigned long long)paddr, + (unsigned long long)(paddr + size - 1), + cattr_name(pcm)); return -EINVAL; } /* @@ -844,7 +925,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return ret; } - if (!pat_enabled) + if (!pat_enabled()) return 0; /* @@ -872,7 +953,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, { enum page_cache_mode pcm; - if (!pat_enabled) + if (!pat_enabled()) return 0; /* Set prot based on lookup */ @@ -913,14 +994,18 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, pgprot_t pgprot_writecombine(pgprot_t prot) { - if (pat_enabled) - return __pgprot(pgprot_val(prot) | + return __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); - else - return pgprot_noncached(prot); } EXPORT_SYMBOL_GPL(pgprot_writecombine); +pgprot_t pgprot_writethrough(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); +} +EXPORT_SYMBOL_GPL(pgprot_writethrough); + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) static struct memtype *memtype_get_idx(loff_t pos) @@ -996,7 +1081,7 @@ static const struct file_operations memtype_fops = { static int __init pat_memtype_list_init(void) { - if (pat_enabled) { + if (pat_enabled()) { debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, NULL, &memtype_fops); } diff --git a/kernel/arch/x86/mm/pat_internal.h b/kernel/arch/x86/mm/pat_internal.h index f64116203..a739bfc40 100644 --- a/kernel/arch/x86/mm/pat_internal.h +++ b/kernel/arch/x86/mm/pat_internal.h @@ -4,7 +4,7 @@ extern int pat_debug_enable; #define dprintk(fmt, arg...) 
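/*
 * Illustrative sketch, not part of the patch above: the newly exported
 * pgprot_writethrough() mirrors pgprot_writecombine() for code that
 * wants WT rather than WC.  A hypothetical driver mmap handler could
 * apply it roughly as below; the pfn is an assumption standing in for
 * whatever BAR or buffer the driver exposes.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
        unsigned long pfn = 0x12345;            /* hypothetical backing pfn */
        unsigned long size = vma->vm_end - vma->vm_start;

        vma->vm_page_prot = pgprot_writethrough(vma->vm_page_prot);
        return remap_pfn_range(vma, vma->vm_start, pfn, size,
                               vma->vm_page_prot);
}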
\ - do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) + do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0) struct memtype { u64 start; diff --git a/kernel/arch/x86/mm/pat_rbtree.c b/kernel/arch/x86/mm/pat_rbtree.c index 6582adcc8..639310803 100644 --- a/kernel/arch/x86/mm/pat_rbtree.c +++ b/kernel/arch/x86/mm/pat_rbtree.c @@ -160,9 +160,9 @@ success: return 0; failure: - printk(KERN_INFO "%s:%d conflicting memory types " - "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start, - end, cattr_name(found_type), cattr_name(match->type)); + pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, start, end, + cattr_name(found_type), cattr_name(match->type)); return -EBUSY; } diff --git a/kernel/arch/x86/mm/pgtable.c b/kernel/arch/x86/mm/pgtable.c index 0b97d2c75..fb0a9dd1d 100644 --- a/kernel/arch/x86/mm/pgtable.c +++ b/kernel/arch/x86/mm/pgtable.c @@ -563,16 +563,31 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +/** + * pud_set_huge - setup kernel PUD mapping + * + * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this + * function sets up a huge page only if any of the following conditions are met: + * + * - MTRRs are disabled, or + * + * - MTRRs are enabled and the range is completely covered by a single MTRR, or + * + * - MTRRs are enabled and the corresponding MTRR memory type is WB, which + * has no effect on the requested PAT memory type. + * + * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger + * page mapping attempt fails. + * + * Returns 1 on success and 0 on failure. + */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. - */ - mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) return 0; prot = pgprot_4k_2_large(prot); @@ -584,17 +599,24 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pmd_set_huge - setup kernel PMD mapping + * + * See text over pud_set_huge() above. + * + * Returns 1 on success and 0 on failure. + */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { - u8 mtrr; + u8 mtrr, uniform; - /* - * Do not use a huge page when the range is covered by non-WB type - * of MTRRs. - */ - mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE); - if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF)) + mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) { + pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", + __func__, addr, addr + PMD_SIZE); return 0; + } prot = pgprot_4k_2_large(prot); @@ -605,6 +627,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pud_clear_huge - clear kernel PUD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PUD map is found). + */ int pud_clear_huge(pud_t *pud) { if (pud_large(*pud)) { @@ -615,6 +642,11 @@ int pud_clear_huge(pud_t *pud) return 0; } +/** + * pmd_clear_huge - clear kernel PMD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PMD map is found). 
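/*
 * Illustrative sketch, not part of the patch above: the kernel-doc
 * added to pud_set_huge()/pmd_set_huge() documents the calling
 * convention - try the biggest mapping first and drop to the next
 * size when the helper returns 0 (e.g. because an MTRR overrides the
 * range).  A caller mapping one PUD-sized chunk would be shaped
 * roughly like this; map_with_pmds() is a hypothetical stand-in for
 * the 2 MiB (and ultimately 4 KiB) path.
 */
#include <linux/mm.h>

static int map_with_pmds(pud_t *pud, phys_addr_t phys, pgprot_t prot);

static int map_one_pud(pud_t *pud, phys_addr_t phys, pgprot_t prot)
{
        if (pud_set_huge(pud, phys, prot))
                return 0;                       /* got a 1 GiB mapping */

        /* huge mapping refused: fall back to smaller page sizes */
        return map_with_pmds(pud, phys, prot);
}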
+ */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_large(*pmd)) { diff --git a/kernel/arch/x86/mm/srat.c b/kernel/arch/x86/mm/srat.c index 66338a60a..c2aea63be 100644 --- a/kernel/arch/x86/mm/srat.c +++ b/kernel/arch/x86/mm/srat.c @@ -192,10 +192,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); - pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n", + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1, - hotpluggable ? " hotplug" : ""); + hotpluggable ? " hotplug" : "", + ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : ""); /* Mark hotplug range in memblock. */ if (hotpluggable && memblock_mark_hotplug(start, ma->length)) diff --git a/kernel/arch/x86/mm/tlb.c b/kernel/arch/x86/mm/tlb.c index 90b924acd..8f4cc3dfa 100644 --- a/kernel/arch/x86/mm/tlb.c +++ b/kernel/arch/x86/mm/tlb.c @@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_end = end; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start); if (is_uv_system()) { unsigned int cpu; @@ -160,7 +161,10 @@ void flush_tlb_current_task(void) preempt_disable(); count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + + /* This is an implicit full barrier that synchronizes with switch_mm. */ local_flush_tlb(); + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -187,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); - if (current->active_mm != mm) + if (current->active_mm != mm) { + /* Synchronize with switch_mm. */ + smp_mb(); + goto out; + } if (!current->mm) { leave_mm(smp_processor_id()); + + /* Synchronize with switch_mm. */ + smp_mb(); + goto out; } if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) base_pages_to_flush = (end - start) >> PAGE_SHIFT; + /* + * Both branches below are implicit full barriers (MOV to CR or + * INVLPG) that synchronize with switch_mm. + */ if (base_pages_to_flush > tlb_single_page_flush_ceiling) { base_pages_to_flush = TLB_FLUSH_ALL; count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); @@ -227,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) preempt_disable(); if (current->active_mm == mm) { - if (current->mm) + if (current->mm) { + /* + * Implicit full barrier (INVLPG) that synchronizes + * with switch_mm. + */ __flush_tlb_one(start); - else + } else { leave_mm(smp_processor_id()); + + /* Synchronize with switch_mm. */ + smp_mb(); + } } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) diff --git a/kernel/arch/x86/net/bpf_jit.S b/kernel/arch/x86/net/bpf_jit.S index 6440221ce..4093216b3 100644 --- a/kernel/arch/x86/net/bpf_jit.S +++ b/kernel/arch/x86/net/bpf_jit.S @@ -8,7 +8,6 @@ * of the License. 
*/ #include <linux/linkage.h> -#include <asm/dwarf2.h> /* * Calling convention : diff --git a/kernel/arch/x86/net/bpf_jit_comp.c b/kernel/arch/x86/net/bpf_jit_comp.c index ddeff4844..75991979f 100644 --- a/kernel/arch/x86/net/bpf_jit_comp.c +++ b/kernel/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include <linux/filter.h> #include <linux/if_vlan.h> #include <asm/cacheflush.h> +#include <linux/bpf.h> int bpf_jit_enable __read_mostly; @@ -37,7 +38,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) return ptr + len; } -#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0) +#define EMIT(bytes, len) \ + do { prog = emit_code(prog, bytes, len); cnt += len; } while (0) #define EMIT1(b1) EMIT(b1, 1) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) @@ -186,31 +188,31 @@ struct jit_context { #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, - int oldproglen, struct jit_context *ctx) +#define STACKSIZE \ + (MAX_BPF_STACK + \ + 32 /* space for rbx, r13, r14, r15 */ + \ + 8 /* space for skb_copy_bits() buffer */) + +#define PROLOGUE_SIZE 51 + +/* emit x64 prologue code for BPF program and check it's size. + * bpf_tail_call helper will skip it while jumping into another program + */ +static void emit_prologue(u8 **pprog) { - struct bpf_insn *insn = bpf_prog->insnsi; - int insn_cnt = bpf_prog->len; - bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); - bool seen_exit = false; - u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i; - int proglen = 0; - u8 *prog = temp; - int stacksize = MAX_BPF_STACK + - 32 /* space for rbx, r13, r14, r15 */ + - 8 /* space for skb_copy_bits() buffer */; + u8 *prog = *pprog; + int cnt = 0; EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ - /* sub rsp, stacksize */ - EMIT3_off32(0x48, 0x81, 0xEC, stacksize); + /* sub rsp, STACKSIZE */ + EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE); /* all classic BPF filters use R6(rbx) save it */ /* mov qword ptr [rbp-X],rbx */ - EMIT3_off32(0x48, 0x89, 0x9D, -stacksize); + EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE); /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and @@ -221,46 +223,134 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, */ /* mov qword ptr [rbp-X],r13 */ - EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8); + EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8); /* mov qword ptr [rbp-X],r14 */ - EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16); + EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16); /* mov qword ptr [rbp-X],r15 */ - EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24); + EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24); /* clear A and X registers */ EMIT2(0x31, 0xc0); /* xor eax, eax */ EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */ - if (seen_ld_abs) { - /* r9d : skb->len - skb->data_len (headlen) - * r10 : skb->data - */ - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov %r9d, off8(%rdi) */ - EMIT4(0x44, 0x8b, 0x4f, - offsetof(struct sk_buff, len)); - else - /* mov %r9d, off32(%rdi) */ - EMIT3_off32(0x44, 0x8b, 0x8f, - offsetof(struct sk_buff, len)); - - if (is_imm8(offsetof(struct sk_buff, data_len))) - /* sub %r9d, off8(%rdi) */ - EMIT4(0x44, 0x2b, 0x4f, - offsetof(struct sk_buff, data_len)); - else - EMIT3_off32(0x44, 0x2b, 0x8f, - offsetof(struct sk_buff, data_len)); - - if (is_imm8(offsetof(struct sk_buff, data))) - /* mov %r10, off8(%rdi) */ - EMIT4(0x4c, 0x8b, 0x57, - 
offsetof(struct sk_buff, data)); - else - /* mov %r10, off32(%rdi) */ - EMIT3_off32(0x4c, 0x8b, 0x97, - offsetof(struct sk_buff, data)); - } + /* clear tail_cnt: mov qword ptr [rbp-X], rax */ + EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32); + + BUILD_BUG_ON(cnt != PROLOGUE_SIZE); + *pprog = prog; +} + +/* generate the following code: + * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... + * if (index >= array->map.max_entries) + * goto out; + * if (++tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + * prog = array->ptrs[index]; + * if (prog == NULL) + * goto out; + * goto *(prog->bpf_func + prologue_size); + * out: + */ +static void emit_bpf_tail_call(u8 **pprog) +{ + u8 *prog = *pprog; + int label1, label2, label3; + int cnt = 0; + + /* rdi - pointer to ctx + * rsi - pointer to bpf_array + * rdx - index in bpf_array + */ + + /* if (index >= array->map.max_entries) + * goto out; + */ + EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + offsetof(struct bpf_array, map.max_entries)); + EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ +#define OFFSET1 47 /* number of bytes to jump */ + EMIT2(X86_JBE, OFFSET1); /* jbe out */ + label1 = cnt; + + /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + */ + EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ + EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ +#define OFFSET2 36 + EMIT2(X86_JA, OFFSET2); /* ja out */ + label2 = cnt; + EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ + EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ + + /* prog = array->ptrs[index]; */ + EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ + offsetof(struct bpf_array, ptrs)); + EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ + + /* if (prog == NULL) + * goto out; + */ + EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */ +#define OFFSET3 10 + EMIT2(X86_JE, OFFSET3); /* je out */ + label3 = cnt; + + /* goto *(prog->bpf_func + prologue_size); */ + EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */ + offsetof(struct bpf_prog, bpf_func)); + EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */ + + /* now we're ready to jump into next BPF program + * rdi == ctx (1st arg) + * rax == prog->bpf_func + prologue_size + */ + EMIT2(0xFF, 0xE0); /* jmp rax */ + + /* out: */ + BUILD_BUG_ON(cnt - label1 != OFFSET1); + BUILD_BUG_ON(cnt - label2 != OFFSET2); + BUILD_BUG_ON(cnt - label3 != OFFSET3); + *pprog = prog; +} + + +static void emit_load_skb_data_hlen(u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* r9d = skb->len - skb->data_len (headlen) + * r10 = skb->data + */ + /* mov %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len)); + + /* sub %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len)); + + /* mov %r10, off32(%rdi) */ + EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data)); + *pprog = prog; +} + +static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, + int oldproglen, struct jit_context *ctx) +{ + struct bpf_insn *insn = bpf_prog->insnsi; + int insn_cnt = bpf_prog->len; + bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); + bool seen_exit = false; + u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; + int i, cnt = 0; + int proglen = 0; + u8 *prog = temp; + + emit_prologue(&prog); + + if (seen_ld_abs) + emit_load_skb_data_hlen(&prog); for (i = 0; i < insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; @@ -269,6 +359,7 
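/*
 * Illustrative sketch, not part of the patch above: emit_bpf_tail_call()
 * is the x86-64 code the JIT generates for the bpf_tail_call() helper.
 * From the program side (restricted C built with clang -target bpf),
 * a tail call looks roughly like the following, assuming the SEC()
 * and bpf_tail_call() declarations from samples/bpf/bpf_helpers.h in
 * this tree; the map size, slot index and section names are made up.
 */
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") jmp_table = {
        .type        = BPF_MAP_TYPE_PROG_ARRAY,
        .key_size    = sizeof(unsigned int),
        .value_size  = sizeof(unsigned int),
        .max_entries = 8,
};

SEC("socket_main")
int main_prog(struct __sk_buff *skb)
{
        /* jump to the program installed in slot 2, if any */
        bpf_tail_call(skb, &jmp_table, 2);

        /* reached only if the slot is empty or MAX_TAIL_CALL_CNT was hit */
        return 0;
}

char _license[] SEC("license") = "GPL";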
@@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 b1 = 0, b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; + bool reload_skb_data; int ilen; u8 *func; @@ -720,12 +811,18 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - EMIT2(0x41, 0x52); /* push %r10 */ - EMIT2(0x41, 0x51); /* push %r9 */ - /* need to adjust jmp offset, since - * pop %r9, pop %r10 take 4 bytes after call insn - */ - jmp_offset += 4; + reload_skb_data = bpf_helper_changes_skb_data(func); + if (reload_skb_data) { + EMIT1(0x57); /* push %rdi */ + jmp_offset += 22; /* pop, mov, sub, mov */ + } else { + EMIT2(0x41, 0x52); /* push %r10 */ + EMIT2(0x41, 0x51); /* push %r9 */ + /* need to adjust jmp offset, since + * pop %r9, pop %r10 take 4 bytes after call insn + */ + jmp_offset += 4; + } } if (!imm32 || !is_simm32(jmp_offset)) { pr_err("unsupported bpf func %d addr %p image %p\n", @@ -734,11 +831,20 @@ xadd: if (is_imm8(insn->off)) } EMIT1_off32(0xE8, jmp_offset); if (seen_ld_abs) { - EMIT2(0x41, 0x59); /* pop %r9 */ - EMIT2(0x41, 0x5A); /* pop %r10 */ + if (reload_skb_data) { + EMIT1(0x5F); /* pop %rdi */ + emit_load_skb_data_hlen(&prog); + } else { + EMIT2(0x41, 0x59); /* pop %r9 */ + EMIT2(0x41, 0x5A); /* pop %r10 */ + } } break; + case BPF_JMP | BPF_CALL | BPF_X: + emit_bpf_tail_call(&prog); + break; + /* cond jump */ case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: @@ -891,13 +997,13 @@ common_load: /* update cleanup_addr */ ctx->cleanup_addr = proglen; /* mov rbx, qword ptr [rbp-X] */ - EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize); + EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE); /* mov r13, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8); + EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8); /* mov r14, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16); + EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16); /* mov r15, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24); + EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24); EMIT1(0xC9); /* leave */ EMIT1(0xC3); /* ret */ @@ -997,13 +1103,13 @@ void bpf_int_jit_compile(struct bpf_prog *prog) } if (bpf_jit_enable > 1) - bpf_jit_dump(prog->len, proglen, 0, image); + bpf_jit_dump(prog->len, proglen, pass + 1, image); if (image) { bpf_flush_icache(header, image + proglen); set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)image; - prog->jited = true; + prog->jited = 1; } out: kfree(addrs); diff --git a/kernel/arch/x86/pci/acpi.c b/kernel/arch/x86/pci/acpi.c index ff9911707..3cd69832d 100644 --- a/kernel/arch/x86/pci/acpi.c +++ b/kernel/arch/x86/pci/acpi.c @@ -4,16 +4,15 @@ #include <linux/irq.h> #include <linux/dmi.h> #include <linux/slab.h> +#include <linux/pci-acpi.h> #include <asm/numa.h> #include <asm/pci_x86.h> struct pci_root_info { - struct acpi_device *bridge; - char name[16]; + struct acpi_pci_root_info common; struct pci_sysdata sd; #ifdef CONFIG_PCI_MMCONFIG bool mcfg_added; - u16 segment; u8 start_bus; u8 end_bus; #endif @@ -178,15 +177,18 @@ static int check_segment(u16 seg, struct device *dev, char *estr) return 0; } -static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, - u8 end, phys_addr_t addr) +static int setup_mcfg_map(struct acpi_pci_root_info *ci) { - int result; - struct device *dev = &info->bridge->dev; + int result, seg; + struct pci_root_info *info; + struct acpi_pci_root *root = ci->root; + struct device *dev = &ci->bridge->dev; - info->start_bus = 
start; - info->end_bus = end; + info = container_of(ci, struct pci_root_info, common); + info->start_bus = (u8)root->secondary.start; + info->end_bus = (u8)root->secondary.end; info->mcfg_added = false; + seg = info->sd.domain; /* return success if MMCFG is not in use */ if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg) @@ -195,7 +197,8 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, if (!(pci_probe & PCI_PROBE_MMCONF)) return check_segment(seg, dev, "MMCONFIG is disabled,"); - result = pci_mmconfig_insert(dev, seg, start, end, addr); + result = pci_mmconfig_insert(dev, seg, info->start_bus, info->end_bus, + root->mcfg_addr); if (result == 0) { /* enable MMCFG if it hasn't been enabled yet */ if (raw_pci_ext_ops == NULL) @@ -208,134 +211,55 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, return 0; } -static void teardown_mcfg_map(struct pci_root_info *info) +static void teardown_mcfg_map(struct acpi_pci_root_info *ci) { + struct pci_root_info *info; + + info = container_of(ci, struct pci_root_info, common); if (info->mcfg_added) { - pci_mmconfig_delete(info->segment, info->start_bus, - info->end_bus); + pci_mmconfig_delete(info->sd.domain, + info->start_bus, info->end_bus); info->mcfg_added = false; } } #else -static int setup_mcfg_map(struct pci_root_info *info, - u16 seg, u8 start, u8 end, - phys_addr_t addr) +static int setup_mcfg_map(struct acpi_pci_root_info *ci) { return 0; } -static void teardown_mcfg_map(struct pci_root_info *info) + +static void teardown_mcfg_map(struct acpi_pci_root_info *ci) { } #endif -static void validate_resources(struct device *dev, struct list_head *crs_res, - unsigned long type) +static int pci_acpi_root_get_node(struct acpi_pci_root *root) { - LIST_HEAD(list); - struct resource *res1, *res2, *root = NULL; - struct resource_entry *tmp, *entry, *entry2; - - BUG_ON((type & (IORESOURCE_MEM | IORESOURCE_IO)) == 0); - root = (type & IORESOURCE_MEM) ? &iomem_resource : &ioport_resource; - - list_splice_init(crs_res, &list); - resource_list_for_each_entry_safe(entry, tmp, &list) { - bool free = false; - resource_size_t end; - - res1 = entry->res; - if (!(res1->flags & type)) - goto next; - - /* Exclude non-addressable range or non-addressable portion */ - end = min(res1->end, root->end); - if (end <= res1->start) { - dev_info(dev, "host bridge window %pR (ignored, not CPU addressable)\n", - res1); - free = true; - goto next; - } else if (res1->end != end) { - dev_info(dev, "host bridge window %pR ([%#llx-%#llx] ignored, not CPU addressable)\n", - res1, (unsigned long long)end + 1, - (unsigned long long)res1->end); - res1->end = end; - } - - resource_list_for_each_entry(entry2, crs_res) { - res2 = entry2->res; - if (!(res2->flags & type)) - continue; - - /* - * I don't like throwing away windows because then - * our resources no longer match the ACPI _CRS, but - * the kernel resource tree doesn't allow overlaps. 
- */ - if (resource_overlaps(res1, res2)) { - res2->start = min(res1->start, res2->start); - res2->end = max(res1->end, res2->end); - dev_info(dev, "host bridge window expanded to %pR; %pR ignored\n", - res2, res1); - free = true; - goto next; - } - } + int busnum = root->secondary.start; + struct acpi_device *device = root->device; + int node = acpi_get_node(device->handle); -next: - resource_list_del(entry); - if (free) - resource_list_free_entry(entry); - else - resource_list_add_tail(entry, crs_res); + if (node == NUMA_NO_NODE) { + node = x86_pci_root_bus_node(busnum); + if (node != 0 && node != NUMA_NO_NODE) + dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", + node); } + if (node != NUMA_NO_NODE && !node_online(node)) + node = NUMA_NO_NODE; + + return node; } -static void add_resources(struct pci_root_info *info, - struct list_head *resources, - struct list_head *crs_res) +static int pci_acpi_root_init_info(struct acpi_pci_root_info *ci) { - struct resource_entry *entry, *tmp; - struct resource *res, *conflict, *root = NULL; - - validate_resources(&info->bridge->dev, crs_res, IORESOURCE_MEM); - validate_resources(&info->bridge->dev, crs_res, IORESOURCE_IO); - - resource_list_for_each_entry_safe(entry, tmp, crs_res) { - res = entry->res; - if (res->flags & IORESOURCE_MEM) - root = &iomem_resource; - else if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - BUG_ON(res); - - conflict = insert_resource_conflict(root, res); - if (conflict) { - dev_info(&info->bridge->dev, - "ignoring host bridge window %pR (conflicts with %s %pR)\n", - res, conflict->name, conflict); - resource_list_destroy_entry(entry); - } - } - - list_splice_tail(crs_res, resources); + return setup_mcfg_map(ci); } -static void release_pci_root_info(struct pci_host_bridge *bridge) +static void pci_acpi_root_release_info(struct acpi_pci_root_info *ci) { - struct resource *res; - struct resource_entry *entry; - struct pci_root_info *info = bridge->release_data; - - resource_list_for_each_entry(entry, &bridge->windows) { - res = entry->res; - if (res->parent && - (res->flags & (IORESOURCE_MEM | IORESOURCE_IO))) - release_resource(res); - } - - teardown_mcfg_map(info); - kfree(info); + teardown_mcfg_map(ci); + kfree(container_of(ci, struct pci_root_info, common)); } /* @@ -358,50 +282,47 @@ static bool resource_is_pcicfg_ioport(struct resource *res) res->start == 0xCF8 && res->end == 0xCFF; } -static void probe_pci_root_info(struct pci_root_info *info, - struct acpi_device *device, - int busnum, int domain, - struct list_head *list) +static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) { - int ret; + struct acpi_device *device = ci->bridge; + int busnum = ci->root->secondary.start; struct resource_entry *entry, *tmp; + int status; - sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); - info->bridge = device; - ret = acpi_dev_get_resources(device, list, - acpi_dev_filter_resource_type_cb, - (void *)(IORESOURCE_IO | IORESOURCE_MEM)); - if (ret < 0) - dev_warn(&device->dev, - "failed to parse _CRS method, error code %d\n", ret); - else if (ret == 0) - dev_dbg(&device->dev, - "no IO and memory resources present in _CRS\n"); - else - resource_list_for_each_entry_safe(entry, tmp, list) { - if ((entry->res->flags & IORESOURCE_DISABLED) || - resource_is_pcicfg_ioport(entry->res)) + status = acpi_pci_probe_root_resources(ci); + if (pci_use_crs) { + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) + if 
(resource_is_pcicfg_ioport(entry->res)) resource_list_destroy_entry(entry); - else - entry->res->name = info->name; - } + return status; + } + + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { + dev_printk(KERN_DEBUG, &device->dev, + "host bridge window %pR (ignored)\n", entry->res); + resource_list_destroy_entry(entry); + } + x86_pci_root_bus_resources(busnum, &ci->resources); + + return 0; } +static struct acpi_pci_root_ops acpi_pci_root_ops = { + .pci_ops = &pci_root_ops, + .init_info = pci_acpi_root_init_info, + .release_info = pci_acpi_root_release_info, + .prepare_resources = pci_acpi_root_prepare_resources, +}; + struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { - struct acpi_device *device = root->device; - struct pci_root_info *info; int domain = root->segment; int busnum = root->secondary.start; - struct resource_entry *res_entry; - LIST_HEAD(crs_res); - LIST_HEAD(resources); + int node = pci_acpi_root_get_node(root); struct pci_bus *bus; - struct pci_sysdata *sd; - int node; if (pci_ignore_seg) - domain = 0; + root->segment = domain = 0; if (domain && !pci_domains_supported) { printk(KERN_WARNING "pci_bus %04x:%02x: " @@ -410,71 +331,33 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) return NULL; } - node = acpi_get_node(device->handle); - if (node == NUMA_NO_NODE) { - node = x86_pci_root_bus_node(busnum); - if (node != 0 && node != NUMA_NO_NODE) - dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", - node); - } - - if (node != NUMA_NO_NODE && !node_online(node)) - node = NUMA_NO_NODE; - - info = kzalloc_node(sizeof(*info), GFP_KERNEL, node); - if (!info) { - printk(KERN_WARNING "pci_bus %04x:%02x: " - "ignored (out of memory)\n", domain, busnum); - return NULL; - } - - sd = &info->sd; - sd->domain = domain; - sd->node = node; - sd->companion = device; - bus = pci_find_bus(domain, busnum); if (bus) { /* * If the desired bus has been scanned already, replace * its bus->sysdata. */ - memcpy(bus->sysdata, sd, sizeof(*sd)); - kfree(info); - } else { - /* insert busn res at first */ - pci_add_resource(&resources, &root->secondary); + struct pci_sysdata sd = { + .domain = domain, + .node = node, + .companion = root->device + }; - /* - * _CRS with no apertures is normal, so only fall back to - * defaults or native bridge info if we're ignoring _CRS. 
- */ - probe_pci_root_info(info, device, busnum, domain, &crs_res); - if (pci_use_crs) { - add_resources(info, &resources, &crs_res); - } else { - resource_list_for_each_entry(res_entry, &crs_res) - dev_printk(KERN_DEBUG, &device->dev, - "host bridge window %pR (ignored)\n", - res_entry->res); - resource_list_free(&crs_res); - x86_pci_root_bus_resources(busnum, &resources); - } - - if (!setup_mcfg_map(info, domain, (u8)root->secondary.start, - (u8)root->secondary.end, root->mcfg_addr)) - bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, - sd, &resources); - - if (bus) { - pci_scan_child_bus(bus); - pci_set_host_bridge_release( - to_pci_host_bridge(bus->bridge), - release_pci_root_info, info); - } else { - resource_list_free(&resources); - teardown_mcfg_map(info); - kfree(info); + memcpy(bus->sysdata, &sd, sizeof(sd)); + } else { + struct pci_root_info *info; + + info = kzalloc_node(sizeof(*info), GFP_KERNEL, node); + if (!info) + dev_err(&root->device->dev, + "pci_bus %04x:%02x: ignored (out of memory)\n", + domain, busnum); + else { + info->sd.domain = domain; + info->sd.node = node; + info->sd.companion = root->device; + bus = acpi_pci_root_create(root, &acpi_pci_root_ops, + &info->common, &info->sd); } } @@ -487,9 +370,6 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) pcie_bus_configure_settings(child); } - if (bus && node != NUMA_NO_NODE) - dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node); - return bus; } diff --git a/kernel/arch/x86/pci/bus_numa.c b/kernel/arch/x86/pci/bus_numa.c index 7bcf06a7c..6eb3c8af9 100644 --- a/kernel/arch/x86/pci/bus_numa.c +++ b/kernel/arch/x86/pci/bus_numa.c @@ -50,18 +50,9 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) if (!found) pci_add_resource(resources, &info->busn); - list_for_each_entry(root_res, &info->resources, list) { - struct resource *res; - struct resource *root; + list_for_each_entry(root_res, &info->resources, list) + pci_add_resource(resources, &root_res->res); - res = &root_res->res; - pci_add_resource(resources, res); - if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - root = &iomem_resource; - insert_resource(root, res); - } return; default_resources: diff --git a/kernel/arch/x86/pci/common.c b/kernel/arch/x86/pci/common.c index 8fd6f44ae..eccd4d99e 100644 --- a/kernel/arch/x86/pci/common.c +++ b/kernel/arch/x86/pci/common.c @@ -673,24 +673,30 @@ int pcibios_add_device(struct pci_dev *dev) return 0; } -int pcibios_enable_device(struct pci_dev *dev, int mask) +int pcibios_alloc_irq(struct pci_dev *dev) { - int err; - - if ((err = pci_enable_resources(dev, mask)) < 0) - return err; + /* + * If the PCI device was already claimed by core code and has + * MSI enabled, probing of the pcibios IRQ will overwrite + * dev->irq. So bail out if MSI is already enabled. 
+ */ + if (pci_dev_msi_enabled(dev)) + return -EBUSY; - if (!pci_dev_msi_enabled(dev)) - return pcibios_enable_irq(dev); - return 0; + return pcibios_enable_irq(dev); } -void pcibios_disable_device (struct pci_dev *dev) +void pcibios_free_irq(struct pci_dev *dev) { - if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq) + if (pcibios_disable_irq) pcibios_disable_irq(dev); } +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ + return pci_enable_resources(dev, mask); +} + int pci_ext_cfg_avail(void) { if (raw_pci_ext_ops) diff --git a/kernel/arch/x86/pci/fixup.c b/kernel/arch/x86/pci/fixup.c index 9a2b7101a..e58565556 100644 --- a/kernel/arch/x86/pci/fixup.c +++ b/kernel/arch/x86/pci/fixup.c @@ -62,19 +62,6 @@ static void pci_fixup_umc_ide(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide); -static void pci_fixup_ncr53c810(struct pci_dev *d) -{ - /* - * NCR 53C810 returns class code 0 (at least on some systems). - * Fix class to be PCI_CLASS_STORAGE_SCSI - */ - if (!d->class) { - dev_warn(&d->dev, "Fixing NCR 53C810 class code\n"); - d->class = PCI_CLASS_STORAGE_SCSI << 8; - } -} -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, pci_fixup_ncr53c810); - static void pci_fixup_latency(struct pci_dev *d) { /* diff --git a/kernel/arch/x86/pci/i386.c b/kernel/arch/x86/pci/i386.c index 349c0d32c..0a9f2caf3 100644 --- a/kernel/arch/x86/pci/i386.c +++ b/kernel/arch/x86/pci/i386.c @@ -429,12 +429,12 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, * Caller can followup with UC MINUS request and add a WC mtrr if there * is a free mtrr slot. */ - if (!pat_enabled && write_combine) + if (!pat_enabled() && write_combine) return -EINVAL; - if (pat_enabled && write_combine) + if (pat_enabled() && write_combine) prot |= cachemode2protval(_PAGE_CACHE_MODE_WC); - else if (pat_enabled || boot_cpu_data.x86 > 3) + else if (pat_enabled() || boot_cpu_data.x86 > 3) /* * ioremap() and ioremap_nocache() defaults to UC MINUS for now. * To avoid attribute conflicts, request UC MINUS here diff --git a/kernel/arch/x86/pci/intel_mid_pci.c b/kernel/arch/x86/pci/intel_mid_pci.c index 852aa4c92..0d24e7c10 100644 --- a/kernel/arch/x86/pci/intel_mid_pci.c +++ b/kernel/arch/x86/pci/intel_mid_pci.c @@ -35,6 +35,9 @@ #define PCIE_CAP_OFFSET 0x100 +/* Quirks for the listed devices */ +#define PCI_DEVICE_ID_INTEL_MRFL_MMC 0x1190 + /* Fixed BAR fields */ #define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */ #define PCI_FIXED_BAR_0_SIZE 0x04 @@ -208,24 +211,43 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, static int intel_mid_pci_irq_enable(struct pci_dev *dev) { + struct irq_alloc_info info; int polarity; + int ret; - if (dev->irq_managed && dev->irq > 0) + if (pci_has_managed_irq(dev)) return 0; - if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER) - polarity = 0; /* active high */ - else - polarity = 1; /* active low */ + switch (intel_mid_identify_cpu()) { + case INTEL_MID_CPU_CHIP_TANGIER: + polarity = IOAPIC_POL_HIGH; + + /* Special treatment for IRQ0 */ + if (dev->irq == 0) { + /* + * TNG has IRQ0 assigned to eMMC controller. But there + * are also other devices with bogus PCI configuration + * that have IRQ0 assigned. This check ensures that + * eMMC gets it. 
+ */ + if (dev->device != PCI_DEVICE_ID_INTEL_MRFL_MMC) + return -EBUSY; + } + break; + default: + polarity = IOAPIC_POL_LOW; + break; + } + + ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity); /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. */ - if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev))) - return -EBUSY; - if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0) - return -EBUSY; + ret = mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info); + if (ret < 0) + return ret; dev->irq_managed = 1; @@ -234,14 +256,17 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) static void intel_mid_pci_irq_disable(struct pci_dev *dev) { - if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && - dev->irq > 0) { + if (pci_has_managed_irq(dev)) { mp_unmap_irq(dev->irq); dev->irq_managed = 0; + /* + * Don't reset dev->irq here, otherwise + * intel_mid_pci_irq_enable() will fail on next call. + */ } } -struct pci_ops intel_mid_pci_ops = { +static struct pci_ops intel_mid_pci_ops = { .read = pci_read, .write = pci_write, }; diff --git a/kernel/arch/x86/pci/irq.c b/kernel/arch/x86/pci/irq.c index 5dc6ca5e1..32e70343e 100644 --- a/kernel/arch/x86/pci/irq.c +++ b/kernel/arch/x86/pci/irq.c @@ -146,19 +146,20 @@ static void __init pirq_peer_trick(void) /* * Code for querying and setting of IRQ routes on various interrupt routers. + * PIC Edge/Level Control Registers (ELCR) 0x4d0 & 0x4d1. */ -void eisa_set_level_irq(unsigned int irq) +void elcr_set_level_irq(unsigned int irq) { unsigned char mask = 1 << (irq & 7); unsigned int port = 0x4d0 + (irq >> 3); unsigned char val; - static u16 eisa_irq_mask; + static u16 elcr_irq_mask; - if (irq >= 16 || (1 << irq) & eisa_irq_mask) + if (irq >= 16 || (1 << irq) & elcr_irq_mask) return; - eisa_irq_mask |= (1 << irq); + elcr_irq_mask |= (1 << irq); printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq); val = inb(port); if (!(val & mask)) { @@ -965,11 +966,11 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { msg = "found"; - eisa_set_level_irq(irq); + elcr_set_level_irq(irq); } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { if (r->set(pirq_router_dev, dev, pirq, newirq)) { - eisa_set_level_irq(newirq); + elcr_set_level_irq(newirq); msg = "assigned"; irq = newirq; } @@ -1201,7 +1202,7 @@ static int pirq_enable_irq(struct pci_dev *dev) struct pci_dev *temp_dev; int irq; - if (dev->irq_managed && dev->irq > 0) + if (pci_has_managed_irq(dev)) return 0; irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, @@ -1229,8 +1230,7 @@ static int pirq_enable_irq(struct pci_dev *dev) } dev = temp_dev; if (irq >= 0) { - dev->irq_managed = 1; - dev->irq = irq; + pci_set_managed_irq(dev, irq); dev_info(&dev->dev, "PCI->APIC IRQ transform: " "INT %c -> IRQ %d\n", 'A' + pin - 1, irq); return 0; @@ -1256,24 +1256,10 @@ static int pirq_enable_irq(struct pci_dev *dev) return 0; } -bool mp_should_keep_irq(struct device *dev) -{ - if (dev->power.is_prepared) - return true; -#ifdef CONFIG_PM - if (dev->power.runtime_status == RPM_SUSPENDING) - return true; -#endif - - return false; -} - static void pirq_disable_irq(struct pci_dev *dev) { - if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) && - dev->irq_managed && dev->irq) { + if (io_apic_assign_pci_irqs && pci_has_managed_irq(dev)) { 
mp_unmap_irq(dev->irq); - dev->irq = 0; - dev->irq_managed = 0; + pci_reset_managed_irq(dev); } } diff --git a/kernel/arch/x86/pci/legacy.c b/kernel/arch/x86/pci/legacy.c index 5b662c0fa..ea6f3802c 100644 --- a/kernel/arch/x86/pci/legacy.c +++ b/kernel/arch/x86/pci/legacy.c @@ -54,7 +54,7 @@ void pcibios_scan_specific_bus(int busn) } EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus); -int __init pci_subsys_init(void) +static int __init pci_subsys_init(void) { /* * The init function returns an non zero value when diff --git a/kernel/arch/x86/pci/xen.c b/kernel/arch/x86/pci/xen.c index d22f4b5bb..ff31ab464 100644 --- a/kernel/arch/x86/pci/xen.c +++ b/kernel/arch/x86/pci/xen.c @@ -179,7 +179,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (ret) goto error; i = 0; - list_for_each_entry(msidesc, &dev->msi_list, list) { + for_each_pci_msi_entry(msidesc, dev) { irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? @@ -230,7 +230,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; - list_for_each_entry(msidesc, &dev->msi_list, list) { + for_each_pci_msi_entry(msidesc, dev) { __pci_read_msi_msg(msidesc, &msg); pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); @@ -274,7 +274,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int ret = 0; struct msi_desc *msidesc; - list_for_each_entry(msidesc, &dev->msi_list, list) { + for_each_pci_msi_entry(msidesc, dev) { struct physdev_map_pirq map_irq; domid_t domid; @@ -386,7 +386,7 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) { struct msi_desc *msidesc; - msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); + msidesc = first_pci_msi_entry(dev); if (msidesc->msi_attrib.is_msix) xen_pci_frontend_disable_msix(dev); else diff --git a/kernel/arch/x86/platform/Makefile b/kernel/arch/x86/platform/Makefile index a62e0be3a..184842ef3 100644 --- a/kernel/arch/x86/platform/Makefile +++ b/kernel/arch/x86/platform/Makefile @@ -1,9 +1,11 @@ # Platform specific code goes here +obj-y += atom/ obj-y += ce4100/ obj-y += efi/ obj-y += geode/ obj-y += goldfish/ obj-y += iris/ +obj-y += intel/ obj-y += intel-mid/ obj-y += intel-quark/ obj-y += olpc/ diff --git a/kernel/arch/x86/platform/atom/Makefile b/kernel/arch/x86/platform/atom/Makefile new file mode 100644 index 000000000..40983f5b0 --- /dev/null +++ b/kernel/arch/x86/platform/atom/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_PMC_ATOM) += pmc_atom.o +obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o diff --git a/kernel/arch/x86/kernel/pmc_atom.c b/kernel/arch/x86/platform/atom/pmc_atom.c index d66a4fe6c..964ff4fc6 100644 --- a/kernel/arch/x86/kernel/pmc_atom.c +++ b/kernel/arch/x86/platform/atom/pmc_atom.c @@ -15,7 +15,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/init.h> #include <linux/pci.h> #include <linux/device.h> @@ -25,80 +24,149 @@ #include <asm/pmc_atom.h> +struct pmc_bit_map { + const char *name; + u32 bit_mask; +}; + +struct pmc_reg_map { + const struct pmc_bit_map *d3_sts_0; + const struct pmc_bit_map *d3_sts_1; + const struct pmc_bit_map *func_dis; + const struct pmc_bit_map *func_dis_2; + const struct pmc_bit_map *pss; +}; + struct pmc_dev { u32 base_addr; void __iomem *regmap; + const struct pmc_reg_map *map; #ifdef CONFIG_DEBUG_FS struct dentry *dbgfs_dir; #endif /* CONFIG_DEBUG_FS */ + bool init; 
}; static struct pmc_dev pmc_device; static u32 acpi_base_addr; -struct pmc_bit_map { - const char *name; - u32 bit_mask; +static const struct pmc_bit_map d3_sts_0_map[] = { + {"LPSS1_F0_DMA", BIT_LPSS1_F0_DMA}, + {"LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1}, + {"LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2}, + {"LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1}, + {"LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2}, + {"LPSS1_F5_SPI", BIT_LPSS1_F5_SPI}, + {"LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX}, + {"LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX}, + {"SCC_EMMC", BIT_SCC_EMMC}, + {"SCC_SDIO", BIT_SCC_SDIO}, + {"SCC_SDCARD", BIT_SCC_SDCARD}, + {"SCC_MIPI", BIT_SCC_MIPI}, + {"HDA", BIT_HDA}, + {"LPE", BIT_LPE}, + {"OTG", BIT_OTG}, + {"USH", BIT_USH}, + {"GBE", BIT_GBE}, + {"SATA", BIT_SATA}, + {"USB_EHCI", BIT_USB_EHCI}, + {"SEC", BIT_SEC}, + {"PCIE_PORT0", BIT_PCIE_PORT0}, + {"PCIE_PORT1", BIT_PCIE_PORT1}, + {"PCIE_PORT2", BIT_PCIE_PORT2}, + {"PCIE_PORT3", BIT_PCIE_PORT3}, + {"LPSS2_F0_DMA", BIT_LPSS2_F0_DMA}, + {"LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1}, + {"LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2}, + {"LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3}, + {"LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4}, + {"LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5}, + {"LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6}, + {"LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7}, + {}, +}; + +static struct pmc_bit_map byt_d3_sts_1_map[] = { + {"SMB", BIT_SMB}, + {"OTG_SS_PHY", BIT_OTG_SS_PHY}, + {"USH_SS_PHY", BIT_USH_SS_PHY}, + {"DFX", BIT_DFX}, + {}, }; -static const struct pmc_bit_map dev_map[] = { - {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA}, - {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1}, - {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2}, - {"3 - LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1}, - {"4 - LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2}, - {"5 - LPSS1_F5_SPI", BIT_LPSS1_F5_SPI}, - {"6 - LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX}, - {"7 - LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX}, - {"8 - SCC_EMMC", BIT_SCC_EMMC}, - {"9 - SCC_SDIO", BIT_SCC_SDIO}, - {"10 - SCC_SDCARD", BIT_SCC_SDCARD}, - {"11 - SCC_MIPI", BIT_SCC_MIPI}, - {"12 - HDA", BIT_HDA}, - {"13 - LPE", BIT_LPE}, - {"14 - OTG", BIT_OTG}, - {"15 - USH", BIT_USH}, - {"16 - GBE", BIT_GBE}, - {"17 - SATA", BIT_SATA}, - {"18 - USB_EHCI", BIT_USB_EHCI}, - {"19 - SEC", BIT_SEC}, - {"20 - PCIE_PORT0", BIT_PCIE_PORT0}, - {"21 - PCIE_PORT1", BIT_PCIE_PORT1}, - {"22 - PCIE_PORT2", BIT_PCIE_PORT2}, - {"23 - PCIE_PORT3", BIT_PCIE_PORT3}, - {"24 - LPSS2_F0_DMA", BIT_LPSS2_F0_DMA}, - {"25 - LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1}, - {"26 - LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2}, - {"27 - LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3}, - {"28 - LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4}, - {"29 - LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5}, - {"30 - LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6}, - {"31 - LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7}, - {"32 - SMB", BIT_SMB}, - {"33 - OTG_SS_PHY", BIT_OTG_SS_PHY}, - {"34 - USH_SS_PHY", BIT_USH_SS_PHY}, - {"35 - DFX", BIT_DFX}, +static struct pmc_bit_map cht_d3_sts_1_map[] = { + {"SMB", BIT_SMB}, + {"GMM", BIT_STS_GMM}, + {"ISH", BIT_STS_ISH}, + {}, }; -static const struct pmc_bit_map pss_map[] = { - {"0 - GBE", PMC_PSS_BIT_GBE}, - {"1 - SATA", PMC_PSS_BIT_SATA}, - {"2 - HDA", PMC_PSS_BIT_HDA}, - {"3 - SEC", PMC_PSS_BIT_SEC}, - {"4 - PCIE", PMC_PSS_BIT_PCIE}, - {"5 - LPSS", PMC_PSS_BIT_LPSS}, - {"6 - LPE", PMC_PSS_BIT_LPE}, - {"7 - DFX", PMC_PSS_BIT_DFX}, - {"8 - USH_CTRL", PMC_PSS_BIT_USH_CTRL}, - {"9 - USH_SUS", PMC_PSS_BIT_USH_SUS}, - {"10 - USH_VCCS", PMC_PSS_BIT_USH_VCCS}, - {"11 - USH_VCCA", PMC_PSS_BIT_USH_VCCA}, - {"12 - OTG_CTRL", PMC_PSS_BIT_OTG_CTRL}, - {"13 - OTG_VCCS", PMC_PSS_BIT_OTG_VCCS}, - {"14 - 
OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK}, - {"15 - OTG_VCCA", PMC_PSS_BIT_OTG_VCCA}, - {"16 - USB", PMC_PSS_BIT_USB}, - {"17 - USB_SUS", PMC_PSS_BIT_USB_SUS}, +static struct pmc_bit_map cht_func_dis_2_map[] = { + {"SMB", BIT_SMB}, + {"GMM", BIT_FD_GMM}, + {"ISH", BIT_FD_ISH}, + {}, +}; + +static const struct pmc_bit_map byt_pss_map[] = { + {"GBE", PMC_PSS_BIT_GBE}, + {"SATA", PMC_PSS_BIT_SATA}, + {"HDA", PMC_PSS_BIT_HDA}, + {"SEC", PMC_PSS_BIT_SEC}, + {"PCIE", PMC_PSS_BIT_PCIE}, + {"LPSS", PMC_PSS_BIT_LPSS}, + {"LPE", PMC_PSS_BIT_LPE}, + {"DFX", PMC_PSS_BIT_DFX}, + {"USH_CTRL", PMC_PSS_BIT_USH_CTRL}, + {"USH_SUS", PMC_PSS_BIT_USH_SUS}, + {"USH_VCCS", PMC_PSS_BIT_USH_VCCS}, + {"USH_VCCA", PMC_PSS_BIT_USH_VCCA}, + {"OTG_CTRL", PMC_PSS_BIT_OTG_CTRL}, + {"OTG_VCCS", PMC_PSS_BIT_OTG_VCCS}, + {"OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK}, + {"OTG_VCCA", PMC_PSS_BIT_OTG_VCCA}, + {"USB", PMC_PSS_BIT_USB}, + {"USB_SUS", PMC_PSS_BIT_USB_SUS}, + {}, +}; + +static const struct pmc_bit_map cht_pss_map[] = { + {"SATA", PMC_PSS_BIT_SATA}, + {"HDA", PMC_PSS_BIT_HDA}, + {"SEC", PMC_PSS_BIT_SEC}, + {"PCIE", PMC_PSS_BIT_PCIE}, + {"LPSS", PMC_PSS_BIT_LPSS}, + {"LPE", PMC_PSS_BIT_LPE}, + {"UFS", PMC_PSS_BIT_CHT_UFS}, + {"UXD", PMC_PSS_BIT_CHT_UXD}, + {"UXD_FD", PMC_PSS_BIT_CHT_UXD_FD}, + {"UX_ENG", PMC_PSS_BIT_CHT_UX_ENG}, + {"USB_SUS", PMC_PSS_BIT_CHT_USB_SUS}, + {"GMM", PMC_PSS_BIT_CHT_GMM}, + {"ISH", PMC_PSS_BIT_CHT_ISH}, + {"DFX_MASTER", PMC_PSS_BIT_CHT_DFX_MASTER}, + {"DFX_CLUSTER1", PMC_PSS_BIT_CHT_DFX_CLUSTER1}, + {"DFX_CLUSTER2", PMC_PSS_BIT_CHT_DFX_CLUSTER2}, + {"DFX_CLUSTER3", PMC_PSS_BIT_CHT_DFX_CLUSTER3}, + {"DFX_CLUSTER4", PMC_PSS_BIT_CHT_DFX_CLUSTER4}, + {"DFX_CLUSTER5", PMC_PSS_BIT_CHT_DFX_CLUSTER5}, + {}, +}; + +static const struct pmc_reg_map byt_reg_map = { + .d3_sts_0 = d3_sts_0_map, + .d3_sts_1 = byt_d3_sts_1_map, + .func_dis = d3_sts_0_map, + .func_dis_2 = byt_d3_sts_1_map, + .pss = byt_pss_map, +}; + +static const struct pmc_reg_map cht_reg_map = { + .d3_sts_0 = d3_sts_0_map, + .d3_sts_1 = cht_d3_sts_1_map, + .func_dis = d3_sts_0_map, + .func_dis_2 = cht_func_dis_2_map, + .pss = cht_pss_map, }; static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset) @@ -111,6 +179,30 @@ static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val) writel(val, pmc->regmap + reg_offset); } +int pmc_atom_read(int offset, u32 *value) +{ + struct pmc_dev *pmc = &pmc_device; + + if (!pmc->init) + return -ENODEV; + + *value = pmc_reg_read(pmc, offset); + return 0; +} +EXPORT_SYMBOL_GPL(pmc_atom_read); + +int pmc_atom_write(int offset, u32 value) +{ + struct pmc_dev *pmc = &pmc_device; + + if (!pmc->init) + return -ENODEV; + + pmc_reg_write(pmc, offset, value); + return 0; +} +EXPORT_SYMBOL_GPL(pmc_atom_write); + static void pmc_power_off(void) { u16 pm1_cnt_port; @@ -142,37 +234,39 @@ static void pmc_hw_reg_setup(struct pmc_dev *pmc) } #ifdef CONFIG_DEBUG_FS +static void pmc_dev_state_print(struct seq_file *s, int reg_index, + u32 sts, const struct pmc_bit_map *sts_map, + u32 fd, const struct pmc_bit_map *fd_map) +{ + int offset = PMC_REG_BIT_WIDTH * reg_index; + int index; + + for (index = 0; sts_map[index].name; index++) { + seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n", + offset + index, sts_map[index].name, + fd_map[index].bit_mask & fd ? "Disabled" : "Enabled ", + sts_map[index].bit_mask & sts ? 
"D3" : "D0"); + } +} + static int pmc_dev_state_show(struct seq_file *s, void *unused) { struct pmc_dev *pmc = s->private; - u32 func_dis, func_dis_2, func_dis_index; - u32 d3_sts_0, d3_sts_1, d3_sts_index; - int dev_num, dev_index, reg_index; + const struct pmc_reg_map *m = pmc->map; + u32 func_dis, func_dis_2; + u32 d3_sts_0, d3_sts_1; func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS); func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2); d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0); d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1); - dev_num = ARRAY_SIZE(dev_map); - - for (dev_index = 0; dev_index < dev_num; dev_index++) { - reg_index = dev_index / PMC_REG_BIT_WIDTH; - if (reg_index) { - func_dis_index = func_dis_2; - d3_sts_index = d3_sts_1; - } else { - func_dis_index = func_dis; - d3_sts_index = d3_sts_0; - } - - seq_printf(s, "Dev: %-32s\tState: %s [%s]\n", - dev_map[dev_index].name, - dev_map[dev_index].bit_mask & func_dis_index ? - "Disabled" : "Enabled ", - dev_map[dev_index].bit_mask & d3_sts_index ? - "D3" : "D0"); - } + /* Low part */ + pmc_dev_state_print(s, 0, d3_sts_0, m->d3_sts_0, func_dis, m->func_dis); + + /* High part */ + pmc_dev_state_print(s, 1, d3_sts_1, m->d3_sts_1, func_dis_2, m->func_dis_2); + return 0; } @@ -191,13 +285,14 @@ static const struct file_operations pmc_dev_state_ops = { static int pmc_pss_state_show(struct seq_file *s, void *unused) { struct pmc_dev *pmc = s->private; + const struct pmc_bit_map *map = pmc->map->pss; u32 pss = pmc_reg_read(pmc, PMC_PSS); - int pss_index; + int index; - for (pss_index = 0; pss_index < ARRAY_SIZE(pss_map); pss_index++) { - seq_printf(s, "Island: %-32s\tState: %s\n", - pss_map[pss_index].name, - pss_map[pss_index].bit_mask & pss ? "Off" : "On"); + for (index = 0; map[index].name; index++) { + seq_printf(s, "Island: %-2d - %-32s\tState: %s\n", + index, map[index].name, + map[index].bit_mask & pss ? 
"Off" : "On"); } return 0; } @@ -250,7 +345,7 @@ static void pmc_dbgfs_unregister(struct pmc_dev *pmc) debugfs_remove_recursive(pmc->dbgfs_dir); } -static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) +static int pmc_dbgfs_register(struct pmc_dev *pmc) { struct dentry *dir, *f; @@ -262,24 +357,18 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO, dir, pmc, &pmc_dev_state_ops); - if (!f) { - dev_err(&pdev->dev, "dev_state register failed\n"); + if (!f) goto err; - } f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO, dir, pmc, &pmc_pss_state_ops); - if (!f) { - dev_err(&pdev->dev, "pss_state register failed\n"); + if (!f) goto err; - } f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO, dir, pmc, &pmc_sleep_tmr_ops); - if (!f) { - dev_err(&pdev->dev, "sleep_state register failed\n"); + if (!f) goto err; - } return 0; err: @@ -287,15 +376,16 @@ err: return -ENODEV; } #else -static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) +static int pmc_dbgfs_register(struct pmc_dev *pmc) { return 0; } #endif /* CONFIG_DEBUG_FS */ -static int pmc_setup_dev(struct pci_dev *pdev) +static int pmc_setup_dev(struct pci_dev *pdev, const struct pci_device_id *ent) { struct pmc_dev *pmc = &pmc_device; + const struct pmc_reg_map *map = (struct pmc_reg_map *)ent->driver_data; int ret; /* Obtain ACPI base address */ @@ -315,32 +405,30 @@ static int pmc_setup_dev(struct pci_dev *pdev) return -ENOMEM; } + pmc->map = map; + /* PMC hardware registers setup */ pmc_hw_reg_setup(pmc); - ret = pmc_dbgfs_register(pmc, pdev); - if (ret) { - iounmap(pmc->regmap); - } + ret = pmc_dbgfs_register(pmc); + if (ret) + dev_warn(&pdev->dev, "debugfs register failed\n"); + pmc->init = true; return ret; } /* * Data for PCI driver interface * - * This data only exists for exporting the supported - * PCI ids via MODULE_DEVICE_TABLE. We do not actually - * register a pci_driver, because lpc_ich will register - * a driver on the same PCI id. + * used by pci_match_id() call below. */ static const struct pci_device_id pmc_pci_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_VLV_PMC), (kernel_ulong_t)&byt_reg_map }, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_CHT_PMC), (kernel_ulong_t)&cht_reg_map }, { 0, }, }; -MODULE_DEVICE_TABLE(pci, pmc_pci_ids); - static int __init pmc_atom_init(void) { struct pci_dev *pdev = NULL; @@ -357,15 +445,16 @@ static int __init pmc_atom_init(void) for_each_pci_dev(pdev) { ent = pci_match_id(pmc_pci_ids, pdev); if (ent) - return pmc_setup_dev(pdev); + return pmc_setup_dev(pdev, ent); } /* Device not found. */ return -ENODEV; } -module_init(pmc_atom_init); -/* no module_exit, this driver shouldn't be unloaded */ +device_initcall(pmc_atom_init); +/* MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>"); MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface"); MODULE_LICENSE("GPL v2"); +*/ diff --git a/kernel/arch/x86/platform/atom/punit_atom_debug.c b/kernel/arch/x86/platform/atom/punit_atom_debug.c new file mode 100644 index 000000000..5ca8ead91 --- /dev/null +++ b/kernel/arch/x86/platform/atom/punit_atom_debug.c @@ -0,0 +1,183 @@ +/* + * Intel SOC Punit device state debug driver + * Punit controls power management for North Complex devices (Graphics + * blocks, Image Signal Processing, video processing, display, DSP etc.) + * + * Copyright (c) 2015, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/device.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/io.h> +#include <asm/cpu_device_id.h> +#include <asm/iosf_mbi.h> + +/* Side band Interface port */ +#define PUNIT_PORT 0x04 +/* Power gate status reg */ +#define PWRGT_STATUS 0x61 +/* Subsystem config/status Video processor */ +#define VED_SS_PM0 0x32 +/* Subsystem config/status ISP (Image Signal Processor) */ +#define ISP_SS_PM0 0x39 +/* Subsystem config/status Input/output controller */ +#define MIO_SS_PM 0x3B +/* Shift bits for getting status for video, isp and i/o */ +#define SSS_SHIFT 24 +/* Shift bits for getting status for graphics rendering */ +#define RENDER_POS 0 +/* Shift bits for getting status for media control */ +#define MEDIA_POS 2 +/* Shift bits for getting status for Valley View/Baytrail display */ +#define VLV_DISPLAY_POS 6 +/* Subsystem config/status display for Cherry Trail SOC */ +#define CHT_DSP_SSS 0x36 +/* Shift bits for getting status for display */ +#define CHT_DSP_SSS_POS 16 + +struct punit_device { + char *name; + int reg; + int sss_pos; +}; + +static const struct punit_device punit_device_byt[] = { + { "GFX RENDER", PWRGT_STATUS, RENDER_POS }, + { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS }, + { "DISPLAY", PWRGT_STATUS, VLV_DISPLAY_POS }, + { "VED", VED_SS_PM0, SSS_SHIFT }, + { "ISP", ISP_SS_PM0, SSS_SHIFT }, + { "MIO", MIO_SS_PM, SSS_SHIFT }, + { NULL } +}; + +static const struct punit_device punit_device_cht[] = { + { "GFX RENDER", PWRGT_STATUS, RENDER_POS }, + { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS }, + { "DISPLAY", CHT_DSP_SSS, CHT_DSP_SSS_POS }, + { "VED", VED_SS_PM0, SSS_SHIFT }, + { "ISP", ISP_SS_PM0, SSS_SHIFT }, + { "MIO", MIO_SS_PM, SSS_SHIFT }, + { NULL } +}; + +static const char * const dstates[] = {"D0", "D0i1", "D0i2", "D0i3"}; + +static int punit_dev_state_show(struct seq_file *seq_file, void *unused) +{ + u32 punit_pwr_status; + struct punit_device *punit_devp = seq_file->private; + int index; + int status; + + seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n"); + while (punit_devp->name) { + status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ, + punit_devp->reg, + &punit_pwr_status); + if (status) { + seq_printf(seq_file, "%9s : Read Failed\n", + punit_devp->name); + } else { + index = (punit_pwr_status >> punit_devp->sss_pos) & 3; + seq_printf(seq_file, "%9s : %s\n", punit_devp->name, + dstates[index]); + } + punit_devp++; + } + + return 0; +} + +static int punit_dev_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, punit_dev_state_show, inode->i_private); +} + +static const struct file_operations punit_dev_state_ops = { + .open = punit_dev_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *punit_dbg_file; + +static int punit_dbgfs_register(struct punit_device *punit_device) +{ + static struct dentry *dev_state; + + punit_dbg_file = debugfs_create_dir("punit_atom", NULL); + if (!punit_dbg_file) + return -ENXIO; + + dev_state = 
debugfs_create_file("dev_power_state", S_IFREG | S_IRUGO, + punit_dbg_file, punit_device, + &punit_dev_state_ops); + if (!dev_state) { + pr_err("punit_dev_state register failed\n"); + debugfs_remove(punit_dbg_file); + return -ENXIO; + } + + return 0; +} + +static void punit_dbgfs_unregister(void) +{ + debugfs_remove_recursive(punit_dbg_file); +} + +#define ICPU(model, drv_data) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT,\ + (kernel_ulong_t)&drv_data } + +static const struct x86_cpu_id intel_punit_cpu_ids[] = { + ICPU(55, punit_device_byt), /* Valleyview, Bay Trail */ + ICPU(76, punit_device_cht), /* Braswell, Cherry Trail */ + {} +}; + +MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids); + +static int __init punit_atom_debug_init(void) +{ + const struct x86_cpu_id *id; + int ret; + + id = x86_match_cpu(intel_punit_cpu_ids); + if (!id) + return -ENODEV; + + ret = punit_dbgfs_register((struct punit_device *)id->driver_data); + if (ret < 0) + return ret; + + return 0; +} + +static void __exit punit_atom_debug_exit(void) +{ + punit_dbgfs_unregister(); +} + +module_init(punit_atom_debug_init); +module_exit(punit_atom_debug_exit); + +MODULE_AUTHOR("Kumar P, Mahesh <mahesh.kumar.p@intel.com>"); +MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>"); +MODULE_DESCRIPTION("Driver for Punit devices states debugging"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/arch/x86/platform/efi/efi-bgrt.c b/kernel/arch/x86/platform/efi/efi-bgrt.c index d7f997f7c..ea48449b2 100644 --- a/kernel/arch/x86/platform/efi/efi-bgrt.c +++ b/kernel/arch/x86/platform/efi/efi-bgrt.c @@ -50,11 +50,16 @@ void __init efi_bgrt_init(void) bgrt_tab->version); return; } - if (bgrt_tab->status != 1) { - pr_err("Ignoring BGRT: invalid status %u (expected 1)\n", + if (bgrt_tab->status & 0xfe) { + pr_err("Ignoring BGRT: reserved status bits are non-zero %u\n", bgrt_tab->status); return; } + if (bgrt_tab->status != 1) { + pr_debug("Ignoring BGRT: invalid status %u (expected 1)\n", + bgrt_tab->status); + return; + } if (bgrt_tab->image_type != 0) { pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n", bgrt_tab->image_type); diff --git a/kernel/arch/x86/platform/efi/efi.c b/kernel/arch/x86/platform/efi/efi.c index 841ea05e1..ad285404e 100644 --- a/kernel/arch/x86/platform/efi/efi.c +++ b/kernel/arch/x86/platform/efi/efi.c @@ -117,6 +117,27 @@ void efi_get_time(struct timespec *now) now->tv_nsec = 0; } +void __init efi_find_mirror(void) +{ + void *p; + u64 mirror_size = 0, total_size = 0; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + + total_size += size; + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { + memblock_mark_mirror(start, size); + mirror_size += size; + } + } + if (mirror_size) + pr_info("Memory: %lldM/%lldM mirrored memory\n", + mirror_size>>20, total_size>>20); +} + /* * Tell the kernel about the EFI memory map. 
This might include * more than the max 128 entries that can fit in the e820 legacy @@ -153,6 +174,9 @@ static void __init do_add_efi_memmap(void) case EFI_UNUSABLE_MEMORY: e820_type = E820_UNUSABLE; break; + case EFI_PERSISTENT_MEMORY: + e820_type = E820_PMEM; + break; default: /* * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE @@ -170,7 +194,7 @@ static void __init do_add_efi_memmap(void) int __init efi_memblock_x86_reserve_range(void) { struct efi_info *e = &boot_params.efi_info; - unsigned long pmap; + phys_addr_t pmap; if (efi_enabled(EFI_PARAVIRT)) return 0; @@ -185,7 +209,7 @@ int __init efi_memblock_x86_reserve_range(void) #else pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); #endif - memmap.phys_map = (void *)pmap; + memmap.phys_map = pmap; memmap.nr_map = e->efi_memmap_size / e->efi_memdesc_size; memmap.desc_size = e->efi_memdesc_size; @@ -198,7 +222,7 @@ int __init efi_memblock_x86_reserve_range(void) return 0; } -static void __init print_efi_memmap(void) +void __init efi_print_memmap(void) { #ifdef EFI_DEBUG efi_memory_desc_t *md; @@ -500,7 +524,9 @@ void __init efi_init(void) return; if (efi_enabled(EFI_DBG)) - print_efi_memmap(); + efi_print_memmap(); + + efi_esrt_init(); } void __init efi_late_init(void) @@ -624,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) static void __init save_runtime_map(void) { -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; void *tmp, *p, *q = NULL; int count = 0; @@ -679,6 +705,70 @@ out: } /* + * Iterate the EFI memory map in reverse order because the regions + * will be mapped top-down. The end result is the same as if we had + * mapped things forward, but doesn't require us to change the + * existing implementation of efi_map_region(). + */ +static inline void *efi_map_next_entry_reverse(void *entry) +{ + /* Initial call */ + if (!entry) + return memmap.map_end - memmap.desc_size; + + entry -= memmap.desc_size; + if (entry < memmap.map) + return NULL; + + return entry; +} + +/* + * efi_map_next_entry - Return the next EFI memory map descriptor + * @entry: Previous EFI memory map descriptor + * + * This is a helper function to iterate over the EFI memory map, which + * we do in different orders depending on the current configuration. + * + * To begin traversing the memory map @entry must be %NULL. + * + * Returns %NULL when we reach the end of the memory map. + */ +static void *efi_map_next_entry(void *entry) +{ + if (!efi_enabled(EFI_OLD_MEMMAP) && efi_enabled(EFI_64BIT)) { + /* + * Starting in UEFI v2.5 the EFI_PROPERTIES_TABLE + * config table feature requires us to map all entries + * in the same order as they appear in the EFI memory + * map. That is to say, entry N must have a lower + * virtual address than entry N+1. This is because the + * firmware toolchain leaves relative references in + * the code/data sections, which are split and become + * separate EFI memory regions. Mapping things + * out-of-order leads to the firmware accessing + * unmapped addresses. + * + * Since we need to map things this way whether or not + * the kernel actually makes use of + * EFI_PROPERTIES_TABLE, let's just switch to this + * scheme by default for 64-bit. + */ + return efi_map_next_entry_reverse(entry); + } + + /* Initial call */ + if (!entry) + return memmap.map; + + entry += memmap.desc_size; + if (entry >= memmap.map_end) + return NULL; + + return entry; +} + +/* * Map the efi memory ranges of the runtime services and update new_mmap with * virtual addresses. 
*/ @@ -688,7 +778,8 @@ static void * __init efi_map_regions(int *count, int *pg_shift) unsigned long left = 0; efi_memory_desc_t *md; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + p = NULL; + while ((p = efi_map_next_entry(p))) { md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) { #ifdef CONFIG_X86_64 @@ -722,7 +813,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift) static void __init kexec_enter_virtual_mode(void) { -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE efi_memory_desc_t *md; void *p; @@ -926,24 +1017,6 @@ u32 efi_mem_type(unsigned long phys_addr) return 0; } -u64 efi_mem_attributes(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - if (!efi_enabled(EFI_MEMMAP)) - return 0; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->attribute; - } - return 0; -} - static int __init arch_parse_efi_cmdline(char *str) { if (!str) { @@ -953,8 +1026,6 @@ static int __init arch_parse_efi_cmdline(char *str) if (parse_option_str(str, "old_map")) set_bit(EFI_OLD_MEMMAP, &efi.flags); - if (parse_option_str(str, "debug")) - set_bit(EFI_DBG, &efi.flags); return 0; } diff --git a/kernel/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/kernel/arch/x86/platform/intel-mid/device_libs/platform_wdt.c index 0b283d4d0..de734134b 100644 --- a/kernel/arch/x86/platform/intel-mid/device_libs/platform_wdt.c +++ b/kernel/arch/x86/platform/intel-mid/device_libs/platform_wdt.c @@ -27,6 +27,7 @@ static struct platform_device wdt_dev = { static int tangier_probe(struct platform_device *pdev) { int gsi; + struct irq_alloc_info info; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; if (!pdata) @@ -34,8 +35,8 @@ static int tangier_probe(struct platform_device *pdev) /* IOAPIC builds identity mapping between GSI and IRQ on MID */ gsi = pdata->irq; - if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) || - mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) { + ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); + if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) { dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi); return -EINVAL; diff --git a/kernel/arch/x86/platform/intel-mid/intel-mid.c b/kernel/arch/x86/platform/intel-mid/intel-mid.c index 3005f0c89..1bbc21e2e 100644 --- a/kernel/arch/x86/platform/intel-mid/intel-mid.c +++ b/kernel/arch/x86/platform/intel-mid/intel-mid.c @@ -61,7 +61,7 @@ enum intel_mid_timer_options intel_mid_timer_options; /* intel_mid_ops to store sub arch ops */ -struct intel_mid_ops *intel_mid_ops; +static struct intel_mid_ops *intel_mid_ops; /* getter function for sub arch ops*/ static void *(*get_intel_mid_ops[])(void) = INTEL_MID_OPS_INIT; enum intel_mid_cpu_type __intel_mid_cpu_chip; @@ -81,26 +81,34 @@ static unsigned long __init intel_mid_calibrate_tsc(void) return 0; } +static void __init intel_mid_setup_bp_timer(void) +{ + apbt_time_init(); + setup_boot_APIC_clock(); +} + static void __init intel_mid_time_init(void) { sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); + switch (intel_mid_timer_options) { case INTEL_MID_TIMER_APBT_ONLY: break; case INTEL_MID_TIMER_LAPIC_APBT: - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + /* Use apbt and local apic */ + x86_init.timers.setup_percpu_clockev = intel_mid_setup_bp_timer; x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - break; + return; default: if 
(!boot_cpu_has(X86_FEATURE_ARAT)) break; + /* Lapic only, no apbt */ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; return; } - /* we need at least one APB timer */ - pre_init_apic_IRQ0(); - apbt_time_init(); + + x86_init.timers.setup_percpu_clockev = apbt_time_init; } static void intel_mid_arch_setup(void) diff --git a/kernel/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/kernel/arch/x86/platform/intel-mid/intel_mid_vrtc.c index 32947ba0f..ee40fcb6e 100644 --- a/kernel/arch/x86/platform/intel-mid/intel_mid_vrtc.c +++ b/kernel/arch/x86/platform/intel-mid/intel_mid_vrtc.c @@ -173,5 +173,4 @@ static int __init intel_mid_device_create(void) return platform_device_register(&vrtc_device); } - -module_init(intel_mid_device_create); +device_initcall(intel_mid_device_create); diff --git a/kernel/arch/x86/platform/intel-mid/sfi.c b/kernel/arch/x86/platform/intel-mid/sfi.c index c14ad3477..5ee360a95 100644 --- a/kernel/arch/x86/platform/intel-mid/sfi.c +++ b/kernel/arch/x86/platform/intel-mid/sfi.c @@ -95,18 +95,16 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table) pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n", totallen, (u32)pentry->phys_addr, pentry->freq_hz, pentry->irq); - if (!pentry->irq) - continue; - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; -/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ - mp_irq.irqflag = 5; - mp_irq.srcbus = MP_BUS_ISA; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + /* triggering mode edge bit 2-3, active high polarity bit 0-1 */ + mp_irq.irqflag = 5; + mp_irq.srcbus = MP_BUS_ISA; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + mp_save_irq(&mp_irq); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); } return 0; @@ -177,7 +175,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) mp_irq.dstapic = MP_APIC_ALL; mp_irq.dstirq = pentry->irq; mp_save_irq(&mp_irq); - mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC); + mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL); } return 0; } @@ -199,10 +197,9 @@ static int __init sfi_parse_gpio(struct sfi_table_header *table) num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry); pentry = (struct sfi_gpio_table_entry *)sb->pentry; - gpio_table = kmalloc(num * sizeof(*pentry), GFP_KERNEL); + gpio_table = kmemdup(pentry, num * sizeof(*pentry), GFP_KERNEL); if (!gpio_table) return -1; - memcpy(gpio_table, pentry, num * sizeof(*pentry)); gpio_num_entry = num; pr_debug("GPIO pin info:\n"); @@ -436,6 +433,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) struct devs_id *dev = NULL; int num, i, ret; int polarity; + struct irq_alloc_info info; sb = (struct sfi_table_simple *)table; num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry); @@ -469,9 +467,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) polarity = 1; } - ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE); - if (ret == 0) - ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC); + ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity); + ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info); WARN_ON(ret < 0); } diff --git a/kernel/arch/x86/platform/intel/Makefile b/kernel/arch/x86/platform/intel/Makefile new file mode 100644 index 
000000000..b878032fb --- /dev/null +++ b/kernel/arch/x86/platform/intel/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o diff --git a/kernel/arch/x86/kernel/iosf_mbi.c b/kernel/arch/x86/platform/intel/iosf_mbi.c index 82f8d02f0..edf2c54bf 100644 --- a/kernel/arch/x86/kernel/iosf_mbi.c +++ b/kernel/arch/x86/platform/intel/iosf_mbi.c @@ -30,7 +30,9 @@ #define PCI_DEVICE_ID_BAYTRAIL 0x0F00 #define PCI_DEVICE_ID_BRASWELL 0x2280 #define PCI_DEVICE_ID_QUARK_X1000 0x0958 +#define PCI_DEVICE_ID_TANGIER 0x1170 +static struct pci_dev *mbi_pdev; static DEFINE_SPINLOCK(iosf_mbi_lock); static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) @@ -38,8 +40,6 @@ static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE; } -static struct pci_dev *mbi_pdev; /* one mbi device */ - static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr) { int result; @@ -104,7 +104,7 @@ int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr) unsigned long flags; int ret; - /*Access to the GFX unit is handled by GPU code */ + /* Access to the GFX unit is handled by GPU code */ if (port == BT_MBI_UNIT_GFX) { WARN_ON(1); return -EPERM; @@ -127,7 +127,7 @@ int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr) unsigned long flags; int ret; - /*Access to the GFX unit is handled by GPU code */ + /* Access to the GFX unit is handled by GPU code */ if (port == BT_MBI_UNIT_GFX) { WARN_ON(1); return -EPERM; @@ -151,7 +151,7 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) unsigned long flags; int ret; - /*Access to the GFX unit is handled by GPU code */ + /* Access to the GFX unit is handled by GPU code */ if (port == BT_MBI_UNIT_GFX) { WARN_ON(1); return -EPERM; @@ -240,17 +240,17 @@ static void iosf_sideband_debug_init(void) /* mdr */ d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr); - if (IS_ERR_OR_NULL(d)) + if (!d) goto cleanup; /* mcrx */ - debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx); - if (IS_ERR_OR_NULL(d)) + d = debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx); + if (!d) goto cleanup; /* mcr - initiates mailbox tranaction */ - debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops); - if (IS_ERR_OR_NULL(d)) + d = debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops); + if (!d) goto cleanup; return; @@ -292,6 +292,7 @@ static const struct pci_device_id iosf_mbi_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BRASWELL) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_TANGIER) }, { 0, }, }; MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); @@ -314,10 +315,8 @@ static void __exit iosf_mbi_exit(void) iosf_debugfs_remove(); pci_unregister_driver(&iosf_mbi_pci_driver); - if (mbi_pdev) { - pci_dev_put(mbi_pdev); - mbi_pdev = NULL; - } + pci_dev_put(mbi_pdev); + mbi_pdev = NULL; } module_init(iosf_mbi_init); diff --git a/kernel/arch/x86/platform/sfi/sfi.c b/kernel/arch/x86/platform/sfi/sfi.c index 2a8a74f3b..6c7111bbd 100644 --- a/kernel/arch/x86/platform/sfi/sfi.c +++ b/kernel/arch/x86/platform/sfi/sfi.c @@ -25,8 +25,8 @@ #include <linux/init.h> #include <linux/sfi.h> #include <linux/io.h> -#include <linux/irqdomain.h> +#include <asm/irqdomain.h> #include <asm/io_apic.h> #include <asm/mpspec.h> #include <asm/setup.h> @@ -71,9 +71,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table) #endif /* 
CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC -static struct irq_domain_ops sfi_ioapic_irqdomain_ops = { - .map = mp_irqdomain_map, -}; static int __init sfi_parse_ioapic(struct sfi_table_header *table) { @@ -82,7 +79,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table) int i, num; struct ioapic_domain_cfg cfg = { .type = IOAPIC_DOMAIN_STRICT, - .ops = &sfi_ioapic_irqdomain_ops, + .ops = &mp_ioapic_irqdomain_ops, }; sb = (struct sfi_table_simple *)table; diff --git a/kernel/arch/x86/platform/uv/uv_irq.c b/kernel/arch/x86/platform/uv/uv_irq.c index 0ce673645..e1c24631a 100644 --- a/kernel/arch/x86/platform/uv/uv_irq.c +++ b/kernel/arch/x86/platform/uv/uv_irq.c @@ -13,22 +13,37 @@ #include <linux/slab.h> #include <linux/irq.h> +#include <asm/irqdomain.h> #include <asm/apic.h> #include <asm/uv/uv_irq.h> #include <asm/uv/uv_hub.h> /* MMR offset and pnode of hub sourcing interrupts for a given irq */ -struct uv_irq_2_mmr_pnode{ - struct rb_node list; +struct uv_irq_2_mmr_pnode { unsigned long offset; int pnode; - int irq; }; -static DEFINE_SPINLOCK(uv_irq_lock); -static struct rb_root uv_irq_root; +static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = cfg->dest_apicid; -static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); + uv_write_global_mmr64(info->pnode, info->offset, mmr_value); +} static void uv_noop(struct irq_data *data) { } @@ -37,6 +52,23 @@ static void uv_ack_apic(struct irq_data *data) ack_APIC_irq(); } +static int +uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_data *parent = data->parent_data; + struct irq_cfg *cfg = irqd_cfg(data); + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0) { + uv_program_mmr(cfg, data->chip_data); + send_cleanup_vector(cfg); + } + + return ret; +} + static struct irq_chip uv_irq_chip = { .name = "UV-CORE", .irq_mask = uv_noop, @@ -45,189 +77,99 @@ static struct irq_chip uv_irq_chip = { .irq_set_affinity = uv_set_irq_affinity, }; -/* - * Add offset and pnode information of the hub sourcing interrupts to the - * rb tree for a specific irq. 
- */ -static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) +static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { - struct rb_node **link = &uv_irq_root.rb_node; - struct rb_node *parent = NULL; - struct uv_irq_2_mmr_pnode *n; - struct uv_irq_2_mmr_pnode *e; - unsigned long irqflags; - - n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, - uv_blade_to_memory_nid(blade)); - if (!n) + struct uv_irq_2_mmr_pnode *chip_data; + struct irq_alloc_info *info = arg; + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + int ret; + + if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_UV) + return -EINVAL; + + chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL, + irq_data_get_node(irq_data)); + if (!chip_data) return -ENOMEM; - n->irq = irq; - n->offset = offset; - n->pnode = uv_blade_to_pnode(blade); - spin_lock_irqsave(&uv_irq_lock, irqflags); - /* Find the right place in the rbtree: */ - while (*link) { - parent = *link; - e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); - - if (unlikely(irq == e->irq)) { - /* irq entry exists */ - e->pnode = uv_blade_to_pnode(blade); - e->offset = offset; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - kfree(n); - return 0; - } - - if (irq < e->irq) - link = &(*link)->rb_left; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret >= 0) { + if (info->uv_limit == UV_AFFINITY_CPU) + irq_set_status_flags(virq, IRQ_NO_BALANCING); else - link = &(*link)->rb_right; + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + + chip_data->pnode = uv_blade_to_pnode(info->uv_blade); + chip_data->offset = info->uv_offset; + irq_domain_set_info(domain, virq, virq, &uv_irq_chip, chip_data, + handle_percpu_irq, NULL, info->uv_name); + } else { + kfree(chip_data); } - /* Insert the node into the rbtree. */ - rb_link_node(&n->list, parent, link); - rb_insert_color(&n->list, &uv_irq_root); - - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; + return ret; } -/* Retrieve offset and pnode information from the rb tree for a specific irq */ -int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) +static void uv_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - - if (e->irq == irq) { - *offset = e->offset; - *pnode = e->pnode; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; - } - - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return -1; + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + BUG_ON(nr_irqs != 1); + kfree(irq_data->chip_data); + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_clear_status_flags(virq, IRQ_NO_BALANCING); + irq_domain_free_irqs_top(domain, virq, nr_irqs); } /* * Re-target the irq to the specified CPU and enable the specified MMR located * on the specified blade to allow the sending of MSIs to the specified CPU. 
*/ -static int -arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset, int limit) +static void uv_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) { - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg = irq_cfg(irq); - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode, err; - unsigned int dest; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest); - if (err != 0) - return err; - - if (limit == UV_AFFINITY_CPU) - irq_set_status_flags(irq, IRQ_NO_BALANCING); - else - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - - irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; + uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); } /* * Disable the specified MMR located on the specified blade so that MSIs are * longer allowed to be sent. */ -static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) +static void uv_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) { unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->mask = 1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data); } -static int -uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = irqd_cfg(data); - unsigned int dest; - unsigned long mmr_value, mmr_offset; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - if (apic_set_affinity(data, mask, &dest)) - return -1; - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - /* Get previously stored MMR and pnode of hub sourcing interrupts */ - if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode)) - return -1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +static const struct irq_domain_ops uv_domain_ops = { + .alloc = uv_domain_alloc, + .free = uv_domain_free, + .activate = uv_domain_activate, + .deactivate = uv_domain_deactivate, +}; - if (cfg->move_in_progress) - send_cleanup_vector(cfg); +static struct irq_domain *uv_get_irq_domain(void) +{ + static struct irq_domain *uv_domain; + static DEFINE_MUTEX(uv_lock); + + mutex_lock(&uv_lock); + if (uv_domain == NULL) { + uv_domain = irq_domain_add_tree(NULL, &uv_domain_ops, NULL); + if (uv_domain) + uv_domain->parent = x86_vector_domain; + } + mutex_unlock(&uv_lock); - return IRQ_SET_MASK_OK_NOCOPY; + return uv_domain; } /* @@ -238,19 +180,21 @@ 
uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { - int ret, irq = irq_alloc_hwirq(uv_blade_to_memory_nid(mmr_blade)); + struct irq_alloc_info info; + struct irq_domain *domain = uv_get_irq_domain(); - if (!irq) - return -EBUSY; + if (!domain) + return -ENOMEM; - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, - limit); - if (ret == irq) - uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); - else - irq_free_hwirq(irq); + init_irq_alloc_info(&info, cpumask_of(cpu)); + info.type = X86_IRQ_ALLOC_TYPE_UV; + info.uv_limit = limit; + info.uv_blade = mmr_blade; + info.uv_offset = mmr_offset; + info.uv_name = irq_name; - return ret; + return irq_domain_alloc_irqs(domain, 1, + uv_blade_to_memory_nid(mmr_blade), &info); } EXPORT_SYMBOL_GPL(uv_setup_irq); @@ -263,26 +207,6 @@ EXPORT_SYMBOL_GPL(uv_setup_irq); */ void uv_teardown_irq(unsigned int irq) { - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - if (e->irq == irq) { - arch_disable_uv_irq(e->pnode, e->offset); - rb_erase(n, &uv_irq_root); - kfree(e); - break; - } - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - irq_free_hwirq(irq); + irq_domain_free_irqs(irq, 1); } EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/kernel/arch/x86/platform/uv/uv_nmi.c b/kernel/arch/x86/platform/uv/uv_nmi.c index 7488cafab..327f21c3b 100644 --- a/kernel/arch/x86/platform/uv/uv_nmi.c +++ b/kernel/arch/x86/platform/uv/uv_nmi.c @@ -104,7 +104,7 @@ static int param_set_local64(const char *val, const struct kernel_param *kp) return 0; } -static struct kernel_param_ops param_ops_local64 = { +static const struct kernel_param_ops param_ops_local64 = { .get = param_get_local64, .set = param_set_local64, }; @@ -376,38 +376,42 @@ static void uv_nmi_wait(int master) atomic_read(&uv_nmi_cpus_in_nmi), num_online_cpus()); } +/* Dump Instruction Pointer header */ static void uv_nmi_dump_cpu_ip_hdr(void) { - printk(KERN_DEFAULT - "\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n", + pr_info("\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n", "CPU", "PID", "COMMAND", "IP"); } +/* Dump Instruction Pointer info */ static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs) { - printk(KERN_DEFAULT "UV: %4d %6d %-32.32s ", - cpu, current->pid, current->comm); - + pr_info("UV: %4d %6d %-32.32s ", cpu, current->pid, current->comm); printk_address(regs->ip); } -/* Dump this cpu's state */ +/* + * Dump this CPU's state. If action was set to "kdump" and the crash_kexec + * failed, then we provide "dump" as an alternate action. Action "dump" now + * also includes the show "ips" (instruction pointers) action whereas the + * action "ips" only displays instruction pointers for the non-idle CPU's. + * This is an abbreviated form of the "ps" command. + */ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) { const char *dots = " ................................. 
"; - if (uv_nmi_action_is("ips")) { - if (cpu == 0) - uv_nmi_dump_cpu_ip_hdr(); + if (cpu == 0) + uv_nmi_dump_cpu_ip_hdr(); - if (current->pid != 0) - uv_nmi_dump_cpu_ip(cpu, regs); + if (current->pid != 0 || !uv_nmi_action_is("ips")) + uv_nmi_dump_cpu_ip(cpu, regs); - } else if (uv_nmi_action_is("dump")) { - printk(KERN_DEFAULT - "UV:%sNMI process trace for CPU %d\n", dots, cpu); + if (uv_nmi_action_is("dump")) { + pr_info("UV:%sNMI process trace for CPU %d\n", dots, cpu); show_regs(regs); } + this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE); } @@ -469,8 +473,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) uv_nmi_trigger_dump(tcpu); } if (ignored) - printk(KERN_DEFAULT "UV: %d CPUs ignored NMI\n", - ignored); + pr_alert("UV: %d CPUs ignored NMI\n", ignored); console_loglevel = saved_console_loglevel; pr_alert("UV: process trace complete\n"); @@ -492,8 +495,9 @@ static void uv_nmi_touch_watchdogs(void) touch_nmi_watchdog(); } -#if defined(CONFIG_KEXEC) static atomic_t uv_nmi_kexec_failed; + +#if defined(CONFIG_KEXEC_CORE) static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) { /* Call crash to dump system state */ @@ -502,10 +506,9 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) crash_kexec(regs); pr_emerg("UV: crash_kexec unexpectedly returned, "); + atomic_set(&uv_nmi_kexec_failed, 1); if (!kexec_crash_image) { pr_cont("crash kernel not loaded\n"); - atomic_set(&uv_nmi_kexec_failed, 1); - uv_nmi_sync_exit(1); return; } pr_cont("kexec busy, stalling cpus while waiting\n"); @@ -514,18 +517,16 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) /* If crash exec fails the slaves should return, otherwise stall */ while (atomic_read(&uv_nmi_kexec_failed) == 0) mdelay(10); - - /* Crash kernel most likely not loaded, return in an orderly fashion */ - uv_nmi_sync_exit(0); } -#else /* !CONFIG_KEXEC */ +#else /* !CONFIG_KEXEC_CORE */ static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) { if (master) pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n"); + atomic_set(&uv_nmi_kexec_failed, 1); } -#endif /* !CONFIG_KEXEC */ +#endif /* !CONFIG_KEXEC_CORE */ #ifdef CONFIG_KGDB #ifdef CONFIG_KGDB_KDB @@ -613,9 +614,14 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) master = (atomic_read(&uv_nmi_cpu) == cpu); /* If NMI action is "kdump", then attempt to do it */ - if (uv_nmi_action_is("kdump")) + if (uv_nmi_action_is("kdump")) { uv_nmi_kdump(cpu, master, regs); + /* Unexpected return, revert action to "dump" */ + if (master) + strncpy(uv_nmi_action, "dump", strlen(uv_nmi_action)); + } + /* Pause as all cpus enter the NMI handler */ uv_nmi_wait(master); @@ -640,6 +646,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) atomic_set(&uv_nmi_cpus_in_nmi, -1); atomic_set(&uv_nmi_cpu, -1); atomic_set(&uv_in_nmi, 0); + atomic_set(&uv_nmi_kexec_failed, 0); } uv_nmi_touch_watchdogs(); diff --git a/kernel/arch/x86/platform/uv/uv_time.c b/kernel/arch/x86/platform/uv/uv_time.c index a718fe0d2..5e0b12262 100644 --- a/kernel/arch/x86/platform/uv/uv_time.c +++ b/kernel/arch/x86/platform/uv/uv_time.c @@ -32,8 +32,7 @@ static cycle_t uv_read_rtc(struct clocksource *cs); static int uv_rtc_next_event(unsigned long, struct clock_event_device *); -static void uv_rtc_timer_setup(enum clock_event_mode, - struct clock_event_device *); +static int uv_rtc_shutdown(struct clock_event_device *evt); static struct clocksource clocksource_uv = { .name = RTC_NAME, @@ -44,14 +43,14 @@ 
static struct clocksource clocksource_uv = { }; static struct clock_event_device clock_event_device_uv = { - .name = RTC_NAME, - .features = CLOCK_EVT_FEAT_ONESHOT, - .shift = 20, - .rating = 400, - .irq = -1, - .set_next_event = uv_rtc_next_event, - .set_mode = uv_rtc_timer_setup, - .event_handler = NULL, + .name = RTC_NAME, + .features = CLOCK_EVT_FEAT_ONESHOT, + .shift = 20, + .rating = 400, + .irq = -1, + .set_next_event = uv_rtc_next_event, + .set_state_shutdown = uv_rtc_shutdown, + .event_handler = NULL, }; static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); @@ -326,24 +325,14 @@ static int uv_rtc_next_event(unsigned long delta, } /* - * Setup the RTC timer in oneshot mode + * Shutdown the RTC timer */ -static void uv_rtc_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) +static int uv_rtc_shutdown(struct clock_event_device *evt) { int ced_cpu = cpumask_first(evt->cpumask); - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here yet */ - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - uv_rtc_unset_timer(ced_cpu, 1); - break; - } + uv_rtc_unset_timer(ced_cpu, 1); + return 0; } static void uv_rtc_interrupt(void) diff --git a/kernel/arch/x86/power/cpu.c b/kernel/arch/x86/power/cpu.c index bf9384488..9ab52791f 100644 --- a/kernel/arch/x86/power/cpu.c +++ b/kernel/arch/x86/power/cpu.c @@ -18,10 +18,9 @@ #include <asm/mtrr.h> #include <asm/page.h> #include <asm/mce.h> -#include <asm/xcr.h> #include <asm/suspend.h> +#include <asm/fpu/internal.h> #include <asm/debugreg.h> -#include <asm/fpu-internal.h> /* pcntxt_mask */ #include <asm/cpu.h> #include <asm/mmu_context.h> @@ -156,6 +155,8 @@ static void fix_processor_context(void) #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ + + fpu__resume_cpu(); } /** @@ -222,12 +223,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); #endif - /* - * restore XCR0 for xsave capable cpu's. - */ - if (cpu_has_xsave) - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); - fix_processor_context(); do_fpu_end(); diff --git a/kernel/arch/x86/power/hibernate_asm_64.S b/kernel/arch/x86/power/hibernate_asm_64.S index 3c4469a7a..e2386cb4e 100644 --- a/kernel/arch/x86/power/hibernate_asm_64.S +++ b/kernel/arch/x86/power/hibernate_asm_64.S @@ -78,9 +78,9 @@ ENTRY(restore_image) /* code below has been relocated to a safe page */ ENTRY(core_restore_code) -loop: +.Lloop: testq %rdx, %rdx - jz done + jz .Ldone /* get addresses from the pbe and copy the page */ movq pbe_address(%rdx), %rsi @@ -91,8 +91,8 @@ loop: /* progress to the next pbe */ movq pbe_next(%rdx), %rdx - jmp loop -done: + jmp .Lloop +.Ldone: /* jump to the restore_registers address from the image header */ jmpq *%rax /* diff --git a/kernel/arch/x86/ras/Kconfig b/kernel/arch/x86/ras/Kconfig new file mode 100644 index 000000000..df280da34 --- /dev/null +++ b/kernel/arch/x86/ras/Kconfig @@ -0,0 +1,9 @@ +config AMD_MCE_INJ + tristate "Simple MCE injection interface for AMD processors" + depends on RAS && EDAC_DECODE_MCE && DEBUG_FS && AMD_NB + default n + help + This is a simple debugfs interface to inject MCEs and test different + aspects of the MCE handling code. + + WARNING: Do not even assume this interface is staying stable! 
diff --git a/kernel/arch/x86/ras/Makefile b/kernel/arch/x86/ras/Makefile new file mode 100644 index 000000000..dd2c98b84 --- /dev/null +++ b/kernel/arch/x86/ras/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_AMD_MCE_INJ) += mce_amd_inj.o + diff --git a/kernel/arch/x86/ras/mce_amd_inj.c b/kernel/arch/x86/ras/mce_amd_inj.c new file mode 100644 index 000000000..55d38cfa4 --- /dev/null +++ b/kernel/arch/x86/ras/mce_amd_inj.c @@ -0,0 +1,464 @@ +/* + * A simple MCE injection facility for testing different aspects of the RAS + * code. This driver should be built as module so that it can be loaded + * on production kernels for testing purposes. + * + * This file may be distributed under the terms of the GNU General Public + * License version 2. + * + * Copyright (c) 2010-15: Borislav Petkov <bp@alien8.de> + * Advanced Micro Devices Inc. + */ + +#include <linux/kobject.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/pci.h> + +#include <asm/mce.h> +#include <asm/amd_nb.h> +#include <asm/irq_vectors.h> + +#include "../kernel/cpu/mcheck/mce-internal.h" + +/* + * Collect all the MCi_XXX settings + */ +static struct mce i_mce; +static struct dentry *dfs_inj; + +static u8 n_banks; + +#define MAX_FLAG_OPT_SIZE 3 +#define NBCFG 0x44 + +enum injection_type { + SW_INJ = 0, /* SW injection, simply decode the error */ + HW_INJ, /* Trigger a #MC */ + DFR_INT_INJ, /* Trigger Deferred error interrupt */ + THR_INT_INJ, /* Trigger threshold interrupt */ + N_INJ_TYPES, +}; + +static const char * const flags_options[] = { + [SW_INJ] = "sw", + [HW_INJ] = "hw", + [DFR_INT_INJ] = "df", + [THR_INT_INJ] = "th", + NULL +}; + +/* Set default injection to SW_INJ */ +static enum injection_type inj_type = SW_INJ; + +#define MCE_INJECT_SET(reg) \ +static int inj_##reg##_set(void *data, u64 val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + m->reg = val; \ + return 0; \ +} + +MCE_INJECT_SET(status); +MCE_INJECT_SET(misc); +MCE_INJECT_SET(addr); + +#define MCE_INJECT_GET(reg) \ +static int inj_##reg##_get(void *data, u64 *val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + *val = m->reg; \ + return 0; \ +} + +MCE_INJECT_GET(status); +MCE_INJECT_GET(misc); +MCE_INJECT_GET(addr); + +DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); + +/* + * Caller needs to be make sure this cpu doesn't disappear + * from under us, i.e.: get_cpu/put_cpu. + */ +static int toggle_hw_mce_inject(unsigned int cpu, bool enable) +{ + u32 l, h; + int err; + + err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); + if (err) { + pr_err("%s: error reading HWCR\n", __func__); + return err; + } + + enable ? 
(l |= BIT(18)) : (l &= ~BIT(18)); + + err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); + if (err) + pr_err("%s: error writing HWCR\n", __func__); + + return err; +} + +static int __set_inj(const char *buf) +{ + int i; + + for (i = 0; i < N_INJ_TYPES; i++) { + if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + inj_type = i; + return 0; + } + } + return -EINVAL; +} + +static ssize_t flags_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE]; + int n; + + n = sprintf(buf, "%s\n", flags_options[inj_type]); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static ssize_t flags_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE], *__buf; + int err; + + if (cnt > MAX_FLAG_OPT_SIZE) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt - 1] = 0; + + /* strip whitespace */ + __buf = strstrip(buf); + + err = __set_inj(__buf); + if (err) { + pr_err("%s: Invalid flags value: %s\n", __func__, __buf); + return err; + } + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations flags_fops = { + .read = flags_read, + .write = flags_write, + .llseek = generic_file_llseek, +}; + +/* + * On which CPU to inject? + */ +MCE_INJECT_GET(extcpu); + +static int inj_extcpu_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= nr_cpu_ids || !cpu_online(val)) { + pr_err("%s: Invalid CPU: %llu\n", __func__, val); + return -EINVAL; + } + m->extcpu = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); + +static void trigger_mce(void *info) +{ + asm volatile("int $18"); +} + +static void trigger_dfr_int(void *info) +{ + asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); +} + +static void trigger_thr_int(void *info) +{ + asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); +} + +static u32 get_nbc_for_node(int node_id) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 cores_per_node; + + cores_per_node = c->x86_max_cores / amd_get_nodes_per_socket(); + + return cores_per_node * node_id; +} + +static void toggle_nb_mca_mst_cpu(u16 nid) +{ + struct pci_dev *F3 = node_to_amd_nb(nid)->misc; + u32 val; + int err; + + if (!F3) + return; + + err = pci_read_config_dword(F3, NBCFG, &val); + if (err) { + pr_err("%s: Error reading F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); + return; + } + + if (val & BIT(27)) + return; + + pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", + __func__); + + val |= BIT(27); + err = pci_write_config_dword(F3, NBCFG, val); + if (err) + pr_err("%s: Error writing F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); +} + +static void do_inject(void) +{ + u64 mcg_status = 0; + unsigned int cpu = i_mce.extcpu; + u8 b = i_mce.bank; + + if (i_mce.misc) + i_mce.status |= MCI_STATUS_MISCV; + + if (inj_type == SW_INJ) { + mce_inject_log(&i_mce); + return; + } + + /* prep MCE global settings for the injection */ + mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; + + if (!(i_mce.status & MCI_STATUS_PCC)) + mcg_status |= MCG_STATUS_RIPV; + + /* + * Ensure necessary status bits for deferred errors: + * - MCx_STATUS[Deferred]: make sure it is a deferred error + * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC + */ + if (inj_type == DFR_INT_INJ) { + i_mce.status |= MCI_STATUS_DEFERRED; + i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + } + + /* + * For multi node CPUs, logging and reporting of bank 4 errors 
happens + * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for + * Fam10h and later BKDGs. + */ + if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) { + toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); + cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + } + + get_online_cpus(); + if (!cpu_online(cpu)) + goto err; + + toggle_hw_mce_inject(cpu, true); + + wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS, + (u32)mcg_status, (u32)(mcg_status >> 32)); + + wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); + + wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + + wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b), + (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + + toggle_hw_mce_inject(cpu, false); + + switch (inj_type) { + case DFR_INT_INJ: + smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); + break; + case THR_INT_INJ: + smp_call_function_single(cpu, trigger_thr_int, NULL, 0); + break; + default: + smp_call_function_single(cpu, trigger_mce, NULL, 0); + } + +err: + put_online_cpus(); + +} + +/* + * This denotes into which bank we're injecting and triggers + * the injection, at the same time. + */ +static int inj_bank_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= n_banks) { + pr_err("Non-existent MCE bank: %llu\n", val); + return -EINVAL; + } + + m->bank = val; + do_inject(); + + return 0; +} + +MCE_INJECT_GET(bank); + +DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); + +static const char readme_msg[] = +"Description of the files and their usages:\n" +"\n" +"Note1: i refers to the bank number below.\n" +"Note2: See respective BKDGs for the exact bit definitions of the files below\n" +"as they mirror the hardware registers.\n" +"\n" +"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" +"\t attributes of the error which caused the MCE.\n" +"\n" +"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" +"\t used for error thresholding purposes and its validity is indicated by\n" +"\t MCi_STATUS[MiscV].\n" +"\n" +"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" +"\t associated with the error.\n" +"\n" +"cpu:\t The CPU to inject the error on.\n" +"\n" +"bank:\t Specify the bank you want to inject the error into: the number of\n" +"\t banks in a processor varies and is family/model-specific, therefore, the\n" +"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" +"\t injection.\n" +"\n" +"flags:\t Injection type to be performed. Writing to this file will trigger a\n" +"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" +"\t for AMD processors.\n" +"\n" +"\t Allowed error injection types:\n" +"\t - \"sw\": Software error injection. Decode error to a human-readable \n" +"\t format only. Safe to use.\n" +"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" +"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" +"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" +"\t before injecting.\n" +"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" +"\t error APIC interrupt handler to handle the error if the feature is \n" +"\t is present in hardware. \n" +"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" +"\t APIC interrupt handler to handle the error. 
\n" +"\n"; + +static ssize_t +inj_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations readme_fops = { + .read = inj_readme_read, +}; + +static struct dfs_node { + char *name; + struct dentry *d; + const struct file_operations *fops; + umode_t perm; +} dfs_fls[] = { + { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, +}; + +static int __init init_mce_inject(void) +{ + int i; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + n_banks = cap & MCG_BANKCNT_MASK; + + dfs_inj = debugfs_create_dir("mce-inject", NULL); + if (!dfs_inj) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { + dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, + dfs_fls[i].perm, + dfs_inj, + &i_mce, + dfs_fls[i].fops); + + if (!dfs_fls[i].d) + goto err_dfs_add; + } + + return 0; + +err_dfs_add: + while (--i >= 0) + debugfs_remove(dfs_fls[i].d); + + debugfs_remove(dfs_inj); + dfs_inj = NULL; + + return -ENOMEM; +} + +static void __exit exit_mce_inject(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) + debugfs_remove(dfs_fls[i].d); + + memset(&dfs_fls, 0, sizeof(dfs_fls)); + + debugfs_remove(dfs_inj); + dfs_inj = NULL; +} +module_init(init_mce_inject); +module_exit(exit_mce_inject); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Borislav Petkov <bp@alien8.de>"); +MODULE_AUTHOR("AMD Inc."); +MODULE_DESCRIPTION("MCE injection facility for RAS testing"); diff --git a/kernel/arch/x86/um/Makefile b/kernel/arch/x86/um/Makefile index acb384d24..a8fecc226 100644 --- a/kernel/arch/x86/um/Makefile +++ b/kernel/arch/x86/um/Makefile @@ -26,7 +26,7 @@ else obj-y += syscalls_64.o vdso/ -subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../lib/thunk_64.o \ +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o \ ../lib/rwsem.o endif diff --git a/kernel/arch/x86/um/asm/barrier.h b/kernel/arch/x86/um/asm/barrier.h index 7e8a1a650..755481f14 100644 --- a/kernel/arch/x86/um/asm/barrier.h +++ b/kernel/arch/x86/um/asm/barrier.h @@ -39,22 +39,10 @@ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) + +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #define read_barrier_depends() do { } while (0) #define smp_read_barrier_depends() do { } while (0) -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - * - * (Could use an alternative three way for this if there was one.) 
- */ -static inline void rdtsc_barrier(void) -{ - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, - "lfence", X86_FEATURE_LFENCE_RDTSC); -} - #endif diff --git a/kernel/arch/x86/um/asm/checksum.h b/kernel/arch/x86/um/asm/checksum.h index 4b181b744..ee940185e 100644 --- a/kernel/arch/x86/um/asm/checksum.h +++ b/kernel/arch/x86/um/asm/checksum.h @@ -3,6 +3,7 @@ #include <linux/string.h> #include <linux/in6.h> +#include <linux/uaccess.h> /* * computes the checksum of a memory block at buff, length len, diff --git a/kernel/arch/x86/um/asm/elf.h b/kernel/arch/x86/um/asm/elf.h index 0a656b727..548197212 100644 --- a/kernel/arch/x86/um/asm/elf.h +++ b/kernel/arch/x86/um/asm/elf.h @@ -200,8 +200,6 @@ typedef elf_greg_t elf_gregset_t[ELF_NGREG]; typedef struct user_i387_struct elf_fpregset_t; -#define task_pt_regs(t) (&(t)->thread.regs) - struct task_struct; extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); diff --git a/kernel/arch/x86/um/asm/processor.h b/kernel/arch/x86/um/asm/processor.h index 2a206d2b1..233ee09c1 100644 --- a/kernel/arch/x86/um/asm/processor.h +++ b/kernel/arch/x86/um/asm/processor.h @@ -28,6 +28,8 @@ static inline void rep_nop(void) #define cpu_relax() rep_nop() #define cpu_relax_lowlatency() cpu_relax() +#define task_pt_regs(t) (&(t)->thread.regs) + #include <asm/processor-generic.h> #endif diff --git a/kernel/arch/x86/um/asm/segment.h b/kernel/arch/x86/um/asm/segment.h index 45183fcd1..41dd5e1f3 100644 --- a/kernel/arch/x86/um/asm/segment.h +++ b/kernel/arch/x86/um/asm/segment.h @@ -7,4 +7,12 @@ extern int host_gdt_entry_tls_min; #define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) +typedef struct { + unsigned long seg; +} mm_segment_t; + +#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) +#define KERNEL_DS MAKE_MM_SEG(~0UL) +#define USER_DS MAKE_MM_SEG(TASK_SIZE) + #endif diff --git a/kernel/arch/x86/um/asm/syscall.h b/kernel/arch/x86/um/asm/syscall.h index 9fe77b7b5..81d6562ce 100644 --- a/kernel/arch/x86/um/asm/syscall.h +++ b/kernel/arch/x86/um/asm/syscall.h @@ -3,6 +3,10 @@ #include <uapi/linux/audit.h> +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); + static inline int syscall_get_arch(void) { #ifdef CONFIG_X86_32 diff --git a/kernel/arch/x86/um/ldt.c b/kernel/arch/x86/um/ldt.c index 5c0b711d2..836a1eb5d 100644 --- a/kernel/arch/x86/um/ldt.c +++ b/kernel/arch/x86/um/ldt.c @@ -6,12 +6,16 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/uaccess.h> #include <asm/unistd.h> #include <os.h> #include <skas.h> #include <sysdep/tls.h> -extern int modify_ldt(int func, void *ptr, unsigned long bytecount); +static inline int modify_ldt (int func, void *ptr, unsigned long bytecount) +{ + return syscall(__NR_modify_ldt, func, ptr, bytecount); +} static long write_ldt_entry(struct mm_id *mm_idp, int func, struct user_desc *desc, void **addr, int done) diff --git a/kernel/arch/x86/um/mem_32.c b/kernel/arch/x86/um/mem_32.c index f40281e5d..744afdc18 100644 --- a/kernel/arch/x86/um/mem_32.c +++ b/kernel/arch/x86/um/mem_32.c @@ -7,8 +7,7 @@ */ #include <linux/mm.h> -#include <asm/page.h> -#include <asm/mman.h> +#include <asm/elf.h> static struct vm_area_struct gate_vma; diff --git a/kernel/arch/x86/um/mem_64.c b/kernel/arch/x86/um/mem_64.c index f8fecaddc..7642e2e2a 100644 --- a/kernel/arch/x86/um/mem_64.c +++ b/kernel/arch/x86/um/mem_64.c @@ -1,6 +1,5 @@ 
#include <linux/mm.h> -#include <asm/page.h> -#include <asm/mman.h> +#include <asm/elf.h> const char *arch_vma_name(struct vm_area_struct *vma) { diff --git a/kernel/arch/x86/um/ptrace_32.c b/kernel/arch/x86/um/ptrace_32.c index ce3dd4f36..a29756f2d 100644 --- a/kernel/arch/x86/um/ptrace_32.c +++ b/kernel/arch/x86/um/ptrace_32.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/sched.h> #include <asm/uaccess.h> +#include <asm/ptrace-abi.h> #include <skas.h> extern int arch_switch_tls(struct task_struct *to); diff --git a/kernel/arch/x86/um/ptrace_64.c b/kernel/arch/x86/um/ptrace_64.c index 3b52bf0b4..a629694ee 100644 --- a/kernel/arch/x86/um/ptrace_64.c +++ b/kernel/arch/x86/um/ptrace_64.c @@ -11,6 +11,7 @@ #define __FRAME_OFFSETS #include <asm/ptrace.h> #include <asm/uaccess.h> +#include <asm/ptrace-abi.h> /* * determines which flags the user has access to. diff --git a/kernel/arch/x86/um/shared/sysdep/tls.h b/kernel/arch/x86/um/shared/sysdep/tls.h index 27cce00c6..a682db13d 100644 --- a/kernel/arch/x86/um/shared/sysdep/tls.h +++ b/kernel/arch/x86/um/shared/sysdep/tls.h @@ -1,7 +1,7 @@ #ifndef _SYSDEP_TLS_H #define _SYSDEP_TLS_H -# ifndef __KERNEL__ +#ifdef __UM_HOST__ /* Change name to avoid conflicts with the original one from <asm/ldt.h>, which * may be named user_desc (but in 2.4 and in header matching its API was named @@ -22,11 +22,11 @@ typedef struct um_dup_user_desc { #endif } user_desc_t; -# else /* __KERNEL__ */ +#else /* __UM_HOST__ */ typedef struct user_desc user_desc_t; -# endif /* __KERNEL__ */ +#endif /* __UM_HOST__ */ extern int os_set_thread_area(user_desc_t *info, int pid); extern int os_get_thread_area(user_desc_t *info, int pid); diff --git a/kernel/arch/x86/um/signal.c b/kernel/arch/x86/um/signal.c index 592491d1d..14fcd01ed 100644 --- a/kernel/arch/x86/um/signal.c +++ b/kernel/arch/x86/um/signal.c @@ -211,7 +211,7 @@ static int copy_sc_from_user(struct pt_regs *regs, if (err) return 1; - err = convert_fxsr_from_user(&fpx, sc.fpstate); + err = convert_fxsr_from_user(&fpx, (void *)sc.fpstate); if (err) return 1; @@ -227,7 +227,7 @@ static int copy_sc_from_user(struct pt_regs *regs, { struct user_i387_struct fp; - err = copy_from_user(&fp, sc.fpstate, + err = copy_from_user(&fp, (void *)sc.fpstate, sizeof(struct user_i387_struct)); if (err) return 1; @@ -291,7 +291,7 @@ static int copy_sc_to_user(struct sigcontext __user *to, #endif #undef PUTREG sc.oldmask = mask; - sc.fpstate = to_fp; + sc.fpstate = (unsigned long)to_fp; err = copy_to_user(to, &sc, sizeof(struct sigcontext)); if (err) @@ -468,12 +468,10 @@ long sys_sigreturn(void) struct sigframe __user *frame = (struct sigframe __user *)(sp - 8); sigset_t set; struct sigcontext __user *sc = &frame->sc; - unsigned long __user *oldmask = &sc->oldmask; - unsigned long __user *extramask = frame->extramask; int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); - if (copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) || - copy_from_user(&set.sig[1], extramask, sig_size)) + if (copy_from_user(&set.sig[0], &sc->oldmask, sizeof(set.sig[0])) || + copy_from_user(&set.sig[1], frame->extramask, sig_size)) goto segfault; set_current_blocked(&set); @@ -505,6 +503,7 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, { struct rt_sigframe __user *frame; int err = 0, sig = ksig->sig; + unsigned long fp_to; frame = (struct rt_sigframe __user *) round_down(stack_top - sizeof(struct rt_sigframe), 16); @@ -526,7 +525,10 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, err 
|= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs)); err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, set->sig[0]); - err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); + + fp_to = (unsigned long)&frame->fpstate; + + err |= __put_user(fp_to, &frame->uc.uc_mcontext.fpstate); if (sizeof(*set) == 16) { err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); @@ -541,7 +543,8 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, */ /* x86-64 should always use SA_RESTORER. */ if (ksig->ka.sa.sa_flags & SA_RESTORER) - err |= __put_user(ksig->ka.sa.sa_restorer, &frame->pretcode); + err |= __put_user((void *)ksig->ka.sa.sa_restorer, + &frame->pretcode); else /* could use a vstub here */ return err; diff --git a/kernel/arch/x86/um/stub_32.S b/kernel/arch/x86/um/stub_32.S index b972649d3..98816804e 100644 --- a/kernel/arch/x86/um/stub_32.S +++ b/kernel/arch/x86/um/stub_32.S @@ -1,6 +1,5 @@ #include <as-layout.h> - .globl syscall_stub .section .__syscall_stub, "ax" .globl batch_syscall_stub diff --git a/kernel/arch/x86/um/stub_64.S b/kernel/arch/x86/um/stub_64.S index 7160b2017..ba914b3b8 100644 --- a/kernel/arch/x86/um/stub_64.S +++ b/kernel/arch/x86/um/stub_64.S @@ -1,25 +1,9 @@ #include <as-layout.h> - .globl syscall_stub .section .__syscall_stub, "ax" -syscall_stub: - syscall - /* We don't have 64-bit constants, so this constructs the address - * we need. - */ - movq $(STUB_DATA >> 32), %rbx - salq $32, %rbx - movq $(STUB_DATA & 0xffffffff), %rcx - or %rcx, %rbx - movq %rax, (%rbx) - int3 - .globl batch_syscall_stub batch_syscall_stub: - mov $(STUB_DATA >> 32), %rbx - sal $32, %rbx - mov $(STUB_DATA & 0xffffffff), %rax - or %rax, %rbx + mov $(STUB_DATA), %rbx /* load pointer to first operation */ mov %rbx, %rsp add $0x10, %rsp diff --git a/kernel/arch/x86/um/sys_call_table_32.c b/kernel/arch/x86/um/sys_call_table_32.c index bd16d6c37..439c0994b 100644 --- a/kernel/arch/x86/um/sys_call_table_32.c +++ b/kernel/arch/x86/um/sys_call_table_32.c @@ -7,6 +7,7 @@ #include <linux/sys.h> #include <linux/cache.h> #include <generated/user_constants.h> +#include <asm/syscall.h> #define __NO_STUBS @@ -24,15 +25,13 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include <asm/syscalls_32.h> #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, -typedef asmlinkage void (*sys_call_ptr_t)(void); - -extern asmlinkage void sys_ni_syscall(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { /* diff --git a/kernel/arch/x86/um/sys_call_table_64.c b/kernel/arch/x86/um/sys_call_table_64.c index a75d87004..b74ea6c2c 100644 --- a/kernel/arch/x86/um/sys_call_table_64.c +++ b/kernel/arch/x86/um/sys_call_table_64.c @@ -7,6 +7,7 @@ #include <linux/sys.h> #include <linux/cache.h> #include <generated/user_constants.h> +#include <asm/syscall.h> #define __NO_STUBS @@ -37,15 +38,13 @@ #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) #define __SYSCALL_X32(nr, sym, compat) /* Not supported */ -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_64(nr, sym, 
compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include <asm/syscalls_64.h> #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, -typedef void (*sys_call_ptr_t)(void); - -extern void sys_ni_syscall(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { /* diff --git a/kernel/arch/x86/um/syscalls_64.c b/kernel/arch/x86/um/syscalls_64.c index adb08eb5c..e65522753 100644 --- a/kernel/arch/x86/um/syscalls_64.c +++ b/kernel/arch/x86/um/syscalls_64.c @@ -6,6 +6,7 @@ */ #include <linux/sched.h> +#include <linux/uaccess.h> #include <asm/prctl.h> /* XXX This should get the constants from libc */ #include <os.h> diff --git a/kernel/arch/x86/um/tls_32.c b/kernel/arch/x86/um/tls_32.c index 80ffa5b99..48e38584d 100644 --- a/kernel/arch/x86/um/tls_32.c +++ b/kernel/arch/x86/um/tls_32.c @@ -7,6 +7,7 @@ #include <linux/sched.h> #include <linux/syscalls.h> #include <asm/uaccess.h> +#include <asm/ptrace-abi.h> #include <os.h> #include <skas.h> #include <sysdep/tls.h> diff --git a/kernel/arch/x86/um/tls_64.c b/kernel/arch/x86/um/tls_64.c index d22363cb8..3ad714373 100644 --- a/kernel/arch/x86/um/tls_64.c +++ b/kernel/arch/x86/um/tls_64.c @@ -1,4 +1,5 @@ #include <linux/sched.h> +#include <asm/ptrace-abi.h> void clear_flushed_tls(struct task_struct *task) { diff --git a/kernel/arch/x86/um/vdso/vma.c b/kernel/arch/x86/um/vdso/vma.c index 916cda4cd..237c6831e 100644 --- a/kernel/arch/x86/um/vdso/vma.c +++ b/kernel/arch/x86/um/vdso/vma.c @@ -10,6 +10,7 @@ #include <linux/sched.h> #include <linux/mm.h> #include <asm/page.h> +#include <asm/elf.h> #include <linux/init.h> static unsigned int __read_mostly vdso_enabled = 1; diff --git a/kernel/arch/x86/vdso/vdso32/int80.S b/kernel/arch/x86/vdso/vdso32/int80.S deleted file mode 100644 index b15b7c01a..000000000 --- a/kernel/arch/x86/vdso/vdso32/int80.S +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Code for the vDSO. This version uses the old int $0x80 method. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - int $0x80 - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. 
- */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 - .previous diff --git a/kernel/arch/x86/vdso/vdso32/syscall.S b/kernel/arch/x86/vdso/vdso32/syscall.S deleted file mode 100644 index 6b286bb52..000000000 --- a/kernel/arch/x86/vdso/vdso32/syscall.S +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Code for the vDSO. This version uses the syscall instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#define SYSCALL_ENTER_KERNEL syscall -#include "sigreturn.S" - -#include <asm/segment.h> - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ebp -.Lpush_ebp: - movl %ecx, %ebp - syscall - movl %ebp, %ecx - popl %ebp -.Lpop_ebp: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE -.LSTARTCIE: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIE: - - .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ -.LSTARTFDE1: - .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 8 - .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ - .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 4 - .align 4 -.LENDFDE1: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 - .previous diff --git a/kernel/arch/x86/vdso/vdso32/sysenter.S b/kernel/arch/x86/vdso/vdso32/sysenter.S deleted file mode 100644 index e354bceee..000000000 --- a/kernel/arch/x86/vdso/vdso32/sysenter.S +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Code for the vDSO. This version uses the sysenter instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - -/* - * The caller puts arg2 in %ecx, which gets pushed. The kernel will use - * %ecx itself for arg2. The pushing is because the sysexit instruction - * (found in entry.S) requires that we clobber %ecx with the desired %esp. - * User code might expect that %ecx is unclobbered though, as it would be - * for returning via the iret instruction, so we must push and pop. - * - * The caller puts arg3 in %edx, which the sysexit instruction requires - * for %eip. Thus, exactly as for arg2, we must push and pop. - * - * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter - * instruction clobbers %esp, the user's %esp won't even survive entry - * into the kernel. 
We store %esp in %ebp. Code in entry.S must fetch - * arg6 from the stack. - * - * You can not use this vsyscall for the clone() syscall because the - * three words on the parent stack do not get copied to the child. - */ - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ecx -.Lpush_ecx: - push %edx -.Lpush_edx: - push %ebp -.Lenter_kernel: - movl %esp,%ebp - sysenter - - /* 7: align return point with nop's to make disassembly easier */ - .space 7,0x90 - - /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - int $0x80 - /* 16: System call normal return point is here! */ -VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ - pop %ebp -.Lpop_ebp: - pop %edx -.Lpop_edx: - pop %ecx -.Lpop_ecx: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x10 /* RA at offset 16 now */ - .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ - /* Finally the epilogue. */ - .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x04 /* RA at offset 4 now */ - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Emit a symbol with the size of this .eh_frame data, - * to verify it matches the other versions. 
- */ -VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) diff --git a/kernel/arch/x86/xen/Kconfig b/kernel/arch/x86/xen/Kconfig index 484145368..c7b15f3e2 100644 --- a/kernel/arch/x86/xen/Kconfig +++ b/kernel/arch/x86/xen/Kconfig @@ -7,6 +7,7 @@ config XEN depends on PARAVIRT select PARAVIRT_CLOCK select XEN_HAVE_PVMMU + select XEN_HAVE_VPMU depends on X86_64 || (X86_32 && X86_PAE) depends on X86_LOCAL_APIC && X86_TSC help @@ -23,14 +24,18 @@ config XEN_PVHVM def_bool y depends on XEN && PCI && X86_LOCAL_APIC -config XEN_MAX_DOMAIN_MEMORY - int - default 500 if X86_64 - default 64 if X86_32 - depends on XEN - help - This only affects the sizing of some bss arrays, the unused - portions of which are freed. +config XEN_512GB + bool "Limit Xen pv-domain memory to 512GB" + depends on XEN && X86_64 + default y + help + Limit paravirtualized user domains to 512GB of RAM. + + The Xen tools and crash dump analysis tools might not support + pv-domains with more than 512 GB of RAM. This option controls the + default setting of the kernel to use only up to 512 GB or more. + It is always possible to change the default via specifying the + boot parameter "xen_512gb_limit". config XEN_SAVE_RESTORE bool diff --git a/kernel/arch/x86/xen/Makefile b/kernel/arch/x86/xen/Makefile index 4b6e29ac0..e47e52787 100644 --- a/kernel/arch/x86/xen/Makefile +++ b/kernel/arch/x86/xen/Makefile @@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o apic.o + p2m.o apic.o pmu.o obj-$(CONFIG_EVENT_TRACING) += trace.o diff --git a/kernel/arch/x86/xen/apic.c b/kernel/arch/x86/xen/apic.c index 70e060ad8..acda713ab 100644 --- a/kernel/arch/x86/xen/apic.c +++ b/kernel/arch/x86/xen/apic.c @@ -7,6 +7,7 @@ #include <xen/xen.h> #include <xen/interface/physdev.h> #include "xen-ops.h" +#include "pmu.h" #include "smp.h" static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) @@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg) static void xen_apic_write(u32 reg, u32 val) { + if (reg == APIC_LVTPC) { + (void)pmu_apic_update(reg); + return; + } + /* Warn to see if there's any stray references */ WARN(1,"register: %x, value: %x\n", reg, val); } diff --git a/kernel/arch/x86/xen/enlighten.c b/kernel/arch/x86/xen/enlighten.c index a671e8372..b7de78bdc 100644 --- a/kernel/arch/x86/xen/enlighten.c +++ b/kernel/arch/x86/xen/enlighten.c @@ -33,6 +33,10 @@ #include <linux/memblock.h> #include <linux/edd.h> +#ifdef CONFIG_KEXEC_CORE +#include <linux/kexec.h> +#endif + #include <xen/xen.h> #include <xen/events.h> #include <xen/interface/xen.h> @@ -71,6 +75,7 @@ #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/pat.h> +#include <asm/cpu.h> #ifdef CONFIG_ACPI #include <linux/acpi.h> @@ -84,6 +89,7 @@ #include "mmu.h" #include "smp.h" #include "multicalls.h" +#include "pmu.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -1010,8 +1016,7 @@ static void xen_write_cr0(unsigned long cr0) static void xen_write_cr4(unsigned long cr4) { - cr4 &= ~X86_CR4_PGE; - cr4 &= ~X86_CR4_PSE; + cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE); native_write_cr4(cr4); } @@ -1030,6 +1035,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) { u64 val; + if (pmu_msr_read(msr, &val, err)) + return val; + val = native_read_msr_safe(msr, err); switch (msr) { case MSR_IA32_APICBASE: @@ -1074,9 +1082,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) /* Fast syscall setup is 
all done in hypercalls, so these are all ignored. Stub them out here to stop Xen console noise. */ + break; default: - ret = native_write_msr_safe(msr, low, high); + if (!pmu_msr_write(msr, low, high, &ret)) + ret = native_write_msr_safe(msr, low, high); } return ret; @@ -1182,7 +1192,7 @@ static const struct pv_info xen_info __initconst = { #ifdef CONFIG_X86_64 .extra_user_64bit_cs = FLAT_USER_CS64, #endif - + .features = 0, .name = "Xen", }; @@ -1215,16 +1225,14 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_msr = xen_read_msr_safe, .write_msr = xen_write_msr_safe, - .read_tsc = native_read_tsc, - .read_pmc = native_read_pmc, - - .read_tscp = native_read_tscp, + .read_pmc = xen_read_pmc, .iret = xen_iret, - .irq_enable_sysexit = xen_sysexit, #ifdef CONFIG_X86_64 .usergs_sysret32 = xen_sysret32, .usergs_sysret64 = xen_sysret64, +#else + .irq_enable_sysexit = xen_sysexit, #endif .load_tr_desc = paravirt_nop, @@ -1266,6 +1274,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = { static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); @@ -1463,7 +1475,7 @@ static void xen_pvh_set_cr_flags(int cpu) return; /* * For BSP, PSE PGE are set in probe_page_size_mask(), for APs - * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init. + * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu(). */ if (cpu_has_pse) cr4_set_bits_and_update_boot(X86_CR4_PSE); @@ -1507,6 +1519,7 @@ asmlinkage __visible void __init xen_start_kernel(void) { struct physdev_set_iopl set_iopl; unsigned long initrd_start = 0; + u64 pat; int rc; if (!xen_start_info) @@ -1522,6 +1535,8 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; + if (xen_initial_domain()) + pv_info.features |= PV_SUPPORTED_RTC; pv_init_ops = xen_init_ops; pv_apic_ops = xen_apic_ops; if (!xen_pvh_domain()) { @@ -1608,14 +1623,16 @@ asmlinkage __visible void __init xen_start_kernel(void) early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); - xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, + xen_start_info->nr_pages); + xen_reserve_special_pages(); /* * Modify the cache mode translation tables to match Xen's PAT * configuration. 
*/ - - pat_init_cache_modes(); + rdmsrl(MSR_IA32_CR_PAT, pat); + pat_init_cache_modes(pat); /* keep using Xen gdt for now; no urgent need to change it */ @@ -1798,6 +1815,21 @@ static struct notifier_block xen_hvm_cpu_notifier = { .notifier_call = xen_hvm_cpu_notify, }; +#ifdef CONFIG_KEXEC_CORE +static void xen_hvm_shutdown(void) +{ + native_machine_shutdown(); + if (kexec_in_progress) + xen_reboot(SHUTDOWN_soft_reset); +} + +static void xen_hvm_crash_shutdown(struct pt_regs *regs) +{ + native_machine_crash_shutdown(regs); + xen_reboot(SHUTDOWN_soft_reset); +} +#endif + static void __init xen_hvm_guest_init(void) { if (xen_pv_domain()) @@ -1817,6 +1849,10 @@ static void __init xen_hvm_guest_init(void) x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); +#ifdef CONFIG_KEXEC_CORE + machine_ops.shutdown = xen_hvm_shutdown; + machine_ops.crash_shutdown = xen_hvm_crash_shutdown; +#endif } #endif @@ -1852,8 +1888,10 @@ EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); static void xen_set_cpu_features(struct cpuinfo_x86 *c) { - if (xen_pv_domain()) + if (xen_pv_domain()) { clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + set_cpu_cap(c, X86_FEATURE_XENPV); + } } const struct hypervisor_x86 x86_hyper_xen = { @@ -1866,3 +1904,17 @@ const struct hypervisor_x86 x86_hyper_xen = { .set_cpu_features = xen_set_cpu_features, }; EXPORT_SYMBOL(x86_hyper_xen); + +#ifdef CONFIG_HOTPLUG_CPU +void xen_arch_register_cpu(int num) +{ + arch_register_cpu(num); +} +EXPORT_SYMBOL(xen_arch_register_cpu); + +void xen_arch_unregister_cpu(int num) +{ + arch_unregister_cpu(num); +} +EXPORT_SYMBOL(xen_arch_unregister_cpu); +#endif diff --git a/kernel/arch/x86/xen/grant-table.c b/kernel/arch/x86/xen/grant-table.c index 1580e7a5a..e079500b1 100644 --- a/kernel/arch/x86/xen/grant-table.c +++ b/kernel/arch/x86/xen/grant-table.c @@ -133,7 +133,7 @@ static int __init xlated_setup_gnttab_pages(void) kfree(pages); return -ENOMEM; } - rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */); + rc = alloc_xenballooned_pages(nr_grant_frames, pages); if (rc) { pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, nr_grant_frames, rc); diff --git a/kernel/arch/x86/xen/mmu.c b/kernel/arch/x86/xen/mmu.c index dd151b204..cb5e266a8 100644 --- a/kernel/arch/x86/xen/mmu.c +++ b/kernel/arch/x86/xen/mmu.c @@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ +static phys_addr_t xen_pt_base, xen_pt_size __initdata; /* * Just beyond the highest usermode address. STACK_TOP_MAX has a @@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); +static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr, xen_mc_flush(); } +/* + * Make a page range writeable and free it. 
+ */ +static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) +{ + void *vaddr = __va(paddr); + void *vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) + make_lowmem_page_readwrite(vaddr); + + memblock_free(paddr, size); +} + +static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) +{ + unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; + + if (unpin) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); + ClearPagePinned(virt_to_page(__va(pa))); + xen_free_ro_pages(pa, PAGE_SIZE); +} + +/* + * Since it is well isolated we can (and since it is perhaps large we should) + * also free the page tables mapping the initial P->M table. + */ +static void __init xen_cleanmfnmap(unsigned long vaddr) +{ + unsigned long va = vaddr & PMD_MASK; + unsigned long pa; + pgd_t *pgd = pgd_offset_k(va); + pud_t *pud_page = pud_offset(pgd, 0); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned int i; + bool unpin; + + unpin = (vaddr == 2 * PGDIR_SIZE); + set_pgd(pgd, __pgd(0)); + do { + pud = pud_page + pud_index(va); + if (pud_none(*pud)) { + va += PUD_SIZE; + } else if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PUD_SIZE); + va += PUD_SIZE; + } else { + pmd = pmd_offset(pud, va); + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PMD_SIZE); + } else if (!pmd_none(*pmd)) { + pte = pte_offset_kernel(pmd, va); + set_pmd(pmd, __pmd(0)); + for (i = 0; i < PTRS_PER_PTE; ++i) { + if (pte_none(pte[i])) + break; + pa = pte_pfn(pte[i]) << PAGE_SHIFT; + xen_free_ro_pages(pa, PAGE_SIZE); + } + xen_cleanmfnmap_free_pgtbl(pte, unpin); + } + va += PMD_SIZE; + if (pmd_index(va)) + continue; + set_pud(pud, __pud(0)); + xen_cleanmfnmap_free_pgtbl(pmd, unpin); + } + + } while (pud_index(va) || pmd_index(va)); + xen_cleanmfnmap_free_pgtbl(pud_page, unpin); +} + static void __init xen_pagetable_p2m_free(void) { unsigned long size; @@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void) /* using __ka address and sticking INVALID_P2M_ENTRY! */ memset((void *)xen_start_info->mfn_list, 0xff, size); - /* We should be in __ka space. */ - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); addr = xen_start_info->mfn_list; - /* We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or xen_start_info->shared_info - * they are in going to crash. Fortunatly we have already revectored - * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + /* + * We could be in __ka space. + * We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or + * xen_start_info->shared_info they are in going to crash. Fortunatly + * we have already revectored in xen_setup_kernel_pagetable and in + * xen_setup_shared_info. 
+ */ size = roundup(size, PMD_SIZE); - xen_cleanhighmap(addr, addr + size); - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(xen_start_info->mfn_list), size); + if (addr >= __START_KERNEL_map) { + xen_cleanhighmap(addr, addr + size); + size = PAGE_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + memblock_free(__pa(addr), size); + } else { + xen_cleanmfnmap(addr); + } +} + +static void __init xen_pagetable_cleanhighmap(void) +{ + unsigned long size; + unsigned long addr; /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of @@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void) #ifdef CONFIG_X86_64 xen_pagetable_p2m_free(); + + xen_pagetable_cleanhighmap(); #endif /* And revector! Bye bye old array */ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; @@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { + unsigned long pfn; + + if (xen_feature(XENFEAT_writable_page_tables) || + xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->mfn_list >= __START_KERNEL_map) + return pte; + + /* + * Pages belonging to the initial p2m list mapped outside the default + * address range must be mapped read-only. This region contains the + * page tables for mapping the p2m list, too, and page tables MUST be + * mapped read-only. + */ + pfn = pte_pfn(pte); + if (pfn >= xen_start_info->first_p2m_pfn && + pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) + pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW); + return pte; } #endif /* CONFIG_X86_64 */ @@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) native_set_pte(ptep, pte); } -static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} - /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) @@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * mappings. Considering that on Xen after the kernel mappings we * have the mappings of some pages that don't exist in pfn space, we * set max_pfn_mapped to the last real pfn mapped. */ - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + if (xen_start_info->mfn_list < __START_KERNEL_map) + max_pfn_mapped = xen_start_info->first_p2m_pfn; + else + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); pt_end = pt_base + xen_start_info->nr_pt_frames; @@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Graft it onto L4[511][510] */ copy_page(level2_kernel_pgt, l2); + /* Copy the initial P->M table mappings if necessary. 
*/ + i = pgd_index(xen_start_info->mfn_list); + if (i && i < pgd_index(__START_KERNEL_map)) + init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); @@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) check_pt_base(&pt_base, &pt_end, addr[i]); /* Our (by three pages) smaller Xen pagetable that we are using */ - memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + xen_pt_base = PFN_PHYS(pt_base); + xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; + memblock_reserve(xen_pt_base, xen_pt_size); + /* Revector the xen_start_info */ xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); } + +/* + * Read a value from a physical address. + */ +static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) +{ + unsigned long *vaddr; + unsigned long val; + + vaddr = early_memremap_ro(addr, sizeof(val)); + val = *vaddr; + early_memunmap(vaddr, sizeof(val)); + return val; +} + +/* + * Translate a virtual address to a physical one without relying on mapped + * page tables. + */ +static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) +{ + phys_addr_t pa; + pgd_t pgd; + pud_t pud; + pmd_t pmd; + pte_t pte; + + pa = read_cr3(); + pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * + sizeof(pgd))); + if (!pgd_present(pgd)) + return 0; + + pa = pgd_val(pgd) & PTE_PFN_MASK; + pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * + sizeof(pud))); + if (!pud_present(pud)) + return 0; + pa = pud_pfn(pud) << PAGE_SHIFT; + if (pud_large(pud)) + return pa + (vaddr & ~PUD_MASK); + + pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * + sizeof(pmd))); + if (!pmd_present(pmd)) + return 0; + pa = pmd_pfn(pmd) << PAGE_SHIFT; + if (pmd_large(pmd)) + return pa + (vaddr & ~PMD_MASK); + + pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * + sizeof(pte))); + if (!pte_present(pte)) + return 0; + pa = pte_pfn(pte) << PAGE_SHIFT; + + return pa | (vaddr & ~PAGE_MASK); +} + +/* + * Find a new area for the hypervisor supplied p2m list and relocate the p2m to + * this area. + */ +void __init xen_relocate_p2m(void) +{ + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; + unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; + int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; + pte_t *pt; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + unsigned long *new_p2m; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; + n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; + n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; + n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; + n_frames = n_pte + n_pt + n_pmd + n_pud; + + new_area = xen_find_free_area(PFN_PHYS(n_frames)); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); + BUG(); + } + + /* + * Setup the page tables for addressing the new p2m list. + * We have asked the hypervisor to map the p2m list at the user address + * PUD_SIZE. It may have done so, or it may have used a kernel space + * address depending on the Xen version. + * To avoid any possible virtual address collision, just use + * 2 * PUD_SIZE for the new area. 
+ */ + pud_phys = new_area; + pmd_phys = pud_phys + PFN_PHYS(n_pud); + pt_phys = pmd_phys + PFN_PHYS(n_pmd); + p2m_pfn = PFN_DOWN(pt_phys) + n_pt; + + pgd = __va(read_cr3()); + new_p2m = (unsigned long *)(2 * PGDIR_SIZE); + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; + } + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; + } + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; + } + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; + } + + /* Now copy the old p2m info to the new area. */ + memcpy(new_p2m, xen_p2m_addr, size); + xen_p2m_addr = new_p2m; + + /* Release the old p2m list and set new list info. */ + p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); + BUG_ON(!p2m_pfn); + p2m_pfn_end = p2m_pfn + PFN_DOWN(size); + + if (xen_start_info->mfn_list < __START_KERNEL_map) { + pfn = xen_start_info->first_p2m_pfn; + pfn_end = xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames; + set_pgd(pgd + 1, __pgd(0)); + } else { + pfn = p2m_pfn; + pfn_end = p2m_pfn_end; + } + + memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + while (pfn < pfn_end) { + if (pfn == p2m_pfn) { + pfn = p2m_pfn_end; + continue; + } + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + pfn++; + } + + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; + xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); + xen_start_info->nr_p2m_frames = n_frames; +} + #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); @@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } +/* + * For 32 bit domains xen_start_info->pt_base is the pgd address which might be + * not the first page table in the page table pool. + * Iterate through the initial page tables to find the real page table base. 
+ */ +static phys_addr_t xen_find_pt_base(pmd_t *pmd) +{ + phys_addr_t pt_base, paddr; + unsigned pmdidx; + + pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); + + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) + if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { + paddr = m2p(pmd[pmdidx].pmd); + pt_base = min(pt_base, paddr); + } + + return pt_base; +} + void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + + xen_pt_base = xen_find_pt_base(kernel_pmd); + xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; + initial_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + - xen_start_info->nr_pt_frames * PAGE_SIZE + - 512*1024); + max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); copy_page(initial_kernel_pmd, kernel_pmd); xen_map_identity_early(initial_kernel_pmd, max_pfn); @@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) PFN_DOWN(__pa(initial_page_table))); xen_write_cr3(__pa(initial_page_table)); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + memblock_reserve(xen_pt_base, xen_pt_size); } #endif /* CONFIG_X86_64 */ +void __init xen_reserve_special_pages(void) +{ + phys_addr_t paddr; + + memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + if (xen_start_info->store_mfn) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } + if (!xen_initial_domain()) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } +} + +void __init xen_pt_check_e820(void) +{ + if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { + xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); + BUG(); + } +} + static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) @@ -2148,14 +2495,9 @@ void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; - /* Optimization - we can use the HVM one but it has no idea which - * VCPUs are descheduled - which means that it will needlessly IPI - * them. Xen knows so let it do the job. 
- */ - if (xen_feature(XENFEAT_auto_translated_physmap)) { - pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; + if (xen_feature(XENFEAT_auto_translated_physmap)) return; - } + pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); @@ -2465,9 +2807,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, return 0; } -static int do_remap_mfn(struct vm_area_struct *vma, +static int do_remap_gfn(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t *mfn, int nr, + xen_pfn_t *gfn, int nr, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) @@ -2483,14 +2825,14 @@ static int do_remap_mfn(struct vm_area_struct *vma, if (xen_feature(XENFEAT_auto_translated_physmap)) { #ifdef CONFIG_XEN_PVH /* We need to update the local page tables and the xen HAP */ - return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr, + return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, prot, domid, pages); #else return -EINVAL; #endif } - rmd.mfn = mfn; + rmd.mfn = gfn; rmd.prot = prot; /* We use the err_ptr to indicate if there we are doing a contigious * mapping or a discontigious mapping. */ @@ -2518,8 +2860,8 @@ static int do_remap_mfn(struct vm_area_struct *vma, batch_left, &done, domid); /* - * @err_ptr may be the same buffer as @mfn, so - * only clear it after each chunk of @mfn is + * @err_ptr may be the same buffer as @gfn, so + * only clear it after each chunk of @gfn is * used. */ if (err_ptr) { @@ -2541,6 +2883,7 @@ static int do_remap_mfn(struct vm_area_struct *vma, addr += range; if (err_ptr) err_ptr += batch; + cond_resched(); } out: @@ -2549,19 +2892,19 @@ out: return err < 0 ? err : mapped; } -int xen_remap_domain_mfn_range(struct vm_area_struct *vma, +int xen_remap_domain_gfn_range(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t mfn, int nr, + xen_pfn_t gfn, int nr, pgprot_t prot, unsigned domid, struct page **pages) { - return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages); + return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages); } -EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); +EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range); -int xen_remap_domain_mfn_array(struct vm_area_struct *vma, +int xen_remap_domain_gfn_array(struct vm_area_struct *vma, unsigned long addr, - xen_pfn_t *mfn, int nr, + xen_pfn_t *gfn, int nr, int *err_ptr, pgprot_t prot, unsigned domid, struct page **pages) { @@ -2570,13 +2913,13 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma, * cause of "wrong memory was mapped in". 
*/ BUG_ON(err_ptr == NULL); - return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages); + return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages); } -EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array); +EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); /* Returns: 0 success */ -int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, +int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, int numpgs, struct page **pages) { if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) @@ -2588,4 +2931,4 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, return -EINVAL; #endif } -EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range); +EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); diff --git a/kernel/arch/x86/xen/p2m.c b/kernel/arch/x86/xen/p2m.c index b47124d4c..cab9f766b 100644 --- a/kernel/arch/x86/xen/p2m.c +++ b/kernel/arch/x86/xen/p2m.c @@ -67,6 +67,7 @@ #include <linux/seq_file.h> #include <linux/bootmem.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include <asm/cache.h> #include <asm/setup.h> @@ -78,10 +79,14 @@ #include <xen/balloon.h> #include <xen/grant_table.h> -#include "p2m.h" #include "multicalls.h" #include "xen-ops.h" +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) unsigned long *xen_p2m_addr __read_mostly; @@ -107,6 +112,15 @@ static unsigned long *p2m_identity; static pte_t *p2m_missing_pte; static pte_t *p2m_identity_pte; +/* + * Hint at last populated PFN. + * + * Used to set HYPERVISOR_shared_info->arch.max_pfn so the toolstack + * can avoid scanning the whole P2M (which may be sized to account for + * hotplugged memory). 
+ */ +static unsigned long xen_p2m_last_pfn; + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -198,7 +212,8 @@ void __ref xen_build_mfn_list_list(void) unsigned int level, topidx, mididx; unsigned long *mid_mfn_p; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) return; /* Pre-initialize p2m_top_mfn to be completely missing */ @@ -259,9 +274,16 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn); - HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; + if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL; + else + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn); + HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; + HYPERVISOR_shared_info->arch.p2m_generation = 0; + HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr; + HYPERVISOR_shared_info->arch.p2m_cr3 = + xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -393,6 +415,8 @@ void __init xen_vmalloc_p2m_tree(void) static struct vm_struct vm; unsigned long p2m_limit; + xen_p2m_last_pfn = xen_max_p2m_pfn; + p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE; vm.flags = VM_ALLOC; vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit), @@ -477,8 +501,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) ptechk = lookup_address(vaddr, &level); if (ptechk == pte_pg) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pmd(pmdp, __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); + wmb(); /* Tools are synchronizing via p2m_generation. */ + HYPERVISOR_shared_info->arch.p2m_generation++; pte_newpg[i] = NULL; } @@ -502,9 +530,9 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) * the new pages are installed with cmpxchg; if we lose the race then * simply free the page we allocated and use the one that's there. 
*/ -static bool alloc_p2m(unsigned long pfn) +int xen_alloc_p2m_entry(unsigned long pfn) { - unsigned topidx, mididx; + unsigned topidx; unsigned long *top_mfn_p, *mid_mfn; pte_t *ptep, *pte_pg; unsigned int level; @@ -512,8 +540,8 @@ static bool alloc_p2m(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); @@ -523,10 +551,11 @@ static bool alloc_p2m(unsigned long pfn) /* PMD level is missing, allocate a new one */ ptep = alloc_p2m_pmd(addr, pte_pg); if (!ptep) - return false; + return -ENOMEM; } - if (p2m_top_mfn) { + if (p2m_top_mfn && pfn < MAX_P2M_PFN) { + topidx = p2m_top_index(pfn); top_mfn_p = &p2m_top_mfn[topidx]; mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); @@ -540,7 +569,7 @@ static bool alloc_p2m(unsigned long pfn) mid_mfn = alloc_p2m_page(); if (!mid_mfn) - return false; + return -ENOMEM; p2m_mid_mfn_init(mid_mfn, p2m_missing); @@ -566,7 +595,7 @@ static bool alloc_p2m(unsigned long pfn) p2m = alloc_p2m_page(); if (!p2m) - return false; + return -ENOMEM; if (p2m_pfn == PFN_DOWN(__pa(p2m_missing))) p2m_init(p2m); @@ -576,10 +605,14 @@ static bool alloc_p2m(unsigned long pfn) spin_lock_irqsave(&p2m_update_lock, flags); if (pte_pfn(*ptep) == p2m_pfn) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pte(ptep, pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL)); + wmb(); /* Tools are synchronizing via p2m_generation. */ + HYPERVISOR_shared_info->arch.p2m_generation++; if (mid_mfn) - mid_mfn[mididx] = virt_to_mfn(p2m); + mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m); p2m = NULL; } @@ -589,8 +622,15 @@ static bool alloc_p2m(unsigned long pfn) free_p2m_page(p2m); } - return true; + /* Expanded the p2m? */ + if (pfn > xen_p2m_last_pfn) { + xen_p2m_last_pfn = pfn; + HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn; + } + + return 0; } +EXPORT_SYMBOL(xen_alloc_p2m_entry); unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) @@ -629,6 +669,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } + /* + * The interface requires atomic updates on p2m elements. + * xen_safe_write_ulong() is using __put_user which does an atomic + * store via asm(). 
+ */ if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn))) return true; @@ -647,7 +692,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - if (!alloc_p2m(pfn)) + int ret; + + ret = xen_alloc_p2m_entry(pfn); + if (ret < 0) return false; return __set_phys_to_machine(pfn, mfn); diff --git a/kernel/arch/x86/xen/p2m.h b/kernel/arch/x86/xen/p2m.h deleted file mode 100644 index ad8aee24a..000000000 --- a/kernel/arch/x86/xen/p2m.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _XEN_P2M_H -#define _XEN_P2M_H - -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - -#define MAX_REMAP_RANGES 10 - -extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); - -#endif /* _XEN_P2M_H */ diff --git a/kernel/arch/x86/xen/platform-pci-unplug.c b/kernel/arch/x86/xen/platform-pci-unplug.c index a8261716d..9586ff328 100644 --- a/kernel/arch/x86/xen/platform-pci-unplug.c +++ b/kernel/arch/x86/xen/platform-pci-unplug.c @@ -68,7 +68,7 @@ static int check_platform_magic(void) return 0; } -bool xen_has_pv_devices() +bool xen_has_pv_devices(void) { if (!xen_domain()) return false; diff --git a/kernel/arch/x86/xen/pmu.c b/kernel/arch/x86/xen/pmu.c new file mode 100644 index 000000000..724a08740 --- /dev/null +++ b/kernel/arch/x86/xen/pmu.c @@ -0,0 +1,570 @@ +#include <linux/types.h> +#include <linux/interrupt.h> + +#include <asm/xen/hypercall.h> +#include <xen/page.h> +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> + +#include "xen-ops.h" +#include "pmu.h" + +/* x86_pmu.handle_irq definition */ +#include "../kernel/cpu/perf_event.h" + +#define XENPMU_IRQ_PROCESSING 1 +struct xenpmu { + /* Shared page between hypervisor and domain */ + struct xen_pmu_data *xenpmu_data; + + uint8_t flags; +}; +static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared); +#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data) +#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags) + +/* Macro for computing address of a PMU MSR bank */ +#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \ + (uintptr_t)ctxt->field)) + +/* AMD PMU */ +#define F15H_NUM_COUNTERS 6 +#define F10H_NUM_COUNTERS 4 + +static __read_mostly uint32_t amd_counters_base; +static __read_mostly uint32_t amd_ctrls_base; +static __read_mostly int amd_msr_step; +static __read_mostly int k7_counters_mirrored; +static __read_mostly int amd_num_counters; + +/* Intel PMU */ +#define MSR_TYPE_COUNTER 0 +#define MSR_TYPE_CTRL 1 +#define MSR_TYPE_GLOBAL 2 +#define MSR_TYPE_ARCH_COUNTER 3 +#define MSR_TYPE_ARCH_CTRL 4 + +/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */ +#define PMU_GENERAL_NR_SHIFT 8 +#define PMU_GENERAL_NR_BITS 8 +#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \ + << PMU_GENERAL_NR_SHIFT) + +/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */ +#define PMU_FIXED_NR_SHIFT 0 +#define PMU_FIXED_NR_BITS 5 +#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \ + << PMU_FIXED_NR_SHIFT) + +/* Alias registers (0x4c1) for full-width writes to PMCs */ +#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0)) + +#define INTEL_PMC_TYPE_SHIFT 30 + +static 
__read_mostly int intel_num_arch_counters, intel_num_fixed_counters; + + +static void xen_pmu_arch_init(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + + switch (boot_cpu_data.x86) { + case 0x15: + amd_num_counters = F15H_NUM_COUNTERS; + amd_counters_base = MSR_F15H_PERF_CTR; + amd_ctrls_base = MSR_F15H_PERF_CTL; + amd_msr_step = 2; + k7_counters_mirrored = 1; + break; + case 0x10: + case 0x12: + case 0x14: + case 0x16: + default: + amd_num_counters = F10H_NUM_COUNTERS; + amd_counters_base = MSR_K7_PERFCTR0; + amd_ctrls_base = MSR_K7_EVNTSEL0; + amd_msr_step = 1; + k7_counters_mirrored = 0; + break; + } + } else { + uint32_t eax, ebx, ecx, edx; + + cpuid(0xa, &eax, &ebx, &ecx, &edx); + + intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >> + PMU_GENERAL_NR_SHIFT; + intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >> + PMU_FIXED_NR_SHIFT; + } +} + +static inline uint32_t get_fam15h_addr(u32 addr) +{ + switch (addr) { + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0); + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0); + default: + break; + } + + return addr; +} + +static inline bool is_amd_pmu_msr(unsigned int msr) +{ + if ((msr >= MSR_F15H_PERF_CTL && + msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) || + (msr >= MSR_K7_EVNTSEL0 && + msr < MSR_K7_PERFCTR0 + amd_num_counters)) + return true; + + return false; +} + +static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) +{ + u32 msr_index_pmc; + + switch (msr_index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_IA32_DS_AREA: + case MSR_IA32_PEBS_ENABLE: + *type = MSR_TYPE_CTRL; + return true; + + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *type = MSR_TYPE_GLOBAL; + return true; + + default: + + if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) && + (msr_index < MSR_CORE_PERF_FIXED_CTR0 + + intel_num_fixed_counters)) { + *index = msr_index - MSR_CORE_PERF_FIXED_CTR0; + *type = MSR_TYPE_COUNTER; + return true; + } + + if ((msr_index >= MSR_P6_EVNTSEL0) && + (msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters)) { + *index = msr_index - MSR_P6_EVNTSEL0; + *type = MSR_TYPE_ARCH_CTRL; + return true; + } + + msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK; + if ((msr_index_pmc >= MSR_IA32_PERFCTR0) && + (msr_index_pmc < MSR_IA32_PERFCTR0 + + intel_num_arch_counters)) { + *type = MSR_TYPE_ARCH_COUNTER; + *index = msr_index_pmc - MSR_IA32_PERFCTR0; + return true; + } + return false; + } +} + +static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type, + int index, bool is_read) +{ + uint64_t *reg = NULL; + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fix_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + ctxt = &xenpmu_data->pmu.c.intel; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + reg = &ctxt->global_ovf_ctrl; + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + reg = &ctxt->global_status; + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + reg = &ctxt->global_ctrl; + break; + case MSR_CORE_PERF_FIXED_CTR_CTRL: + reg = &ctxt->fixed_ctrl; + break; + default: + switch (type) { + case MSR_TYPE_COUNTER: + fix_counters = field_offset(ctxt, fixed_counters); + 
reg = &fix_counters[index]; + break; + case MSR_TYPE_ARCH_COUNTER: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].counter; + break; + case MSR_TYPE_ARCH_CTRL: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].control; + break; + default: + return false; + } + } + + if (reg) { + if (is_read) + *val = *reg; + else { + *reg = *val; + + if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL) + ctxt->global_status &= (~(*val)); + } + return true; + } + + return false; +} + +static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) +{ + uint64_t *reg = NULL; + int i, off = 0; + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs, *ctrl_regs; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + if (k7_counters_mirrored && + ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3))) + msr = get_fam15h_addr(msr); + + ctxt = &xenpmu_data->pmu.c.amd; + for (i = 0; i < amd_num_counters; i++) { + if (msr == amd_ctrls_base + off) { + ctrl_regs = field_offset(ctxt, ctrls); + reg = &ctrl_regs[i]; + break; + } else if (msr == amd_counters_base + off) { + counter_regs = field_offset(ctxt, counters); + reg = &counter_regs[i]; + break; + } + off += amd_msr_step; + } + + if (reg) { + if (is_read) + *val = *reg; + else + *reg = *val; + + return true; + } + return false; +} + +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + if (!xen_amd_pmu_emulate(msr, val, 1)) + *val = native_read_msr_safe(msr, err); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + if (!xen_intel_pmu_emulate(msr, val, type, index, 1)) + *val = native_read_msr_safe(msr, err); + return true; + } + } + + return false; +} + +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) +{ + uint64_t val = ((uint64_t)high << 32) | low; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + if (!xen_amd_pmu_emulate(msr, &val, 0)) + *err = native_write_msr_safe(msr, low, high); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + if (!xen_intel_pmu_emulate(msr, &val, type, index, 0)) + *err = native_write_msr_safe(msr, low, high); + return true; + } + } + + return false; +} + +static unsigned long long xen_amd_read_pmc(int counter) +{ + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { + uint32_t msr; + int err; + + msr = amd_counters_base + (counter * amd_msr_step); + return native_read_msr_safe(msr, &err); + } + + ctxt = &xenpmu_data->pmu.c.amd; + counter_regs = field_offset(ctxt, counters); + return counter_regs[counter]; +} + +static unsigned long long xen_intel_read_pmc(int counter) +{ + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fixed_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { + uint32_t msr; + int err; + + if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) + msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff); + else + msr = MSR_IA32_PERFCTR0 + counter; + + 
return native_read_msr_safe(msr, &err); + } + + ctxt = &xenpmu_data->pmu.c.intel; + if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) { + fixed_counters = field_offset(ctxt, fixed_counters); + return fixed_counters[counter & 0xffff]; + } + + arch_cntr_pair = field_offset(ctxt, arch_counters); + return arch_cntr_pair[counter].counter; +} + +unsigned long long xen_read_pmc(int counter) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return xen_amd_read_pmc(counter); + else + return xen_intel_read_pmc(counter); +} + +int pmu_apic_update(uint32_t val) +{ + int ret; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return -EINVAL; + } + + xenpmu_data->pmu.l.lapic_lvtpc = val; + + if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING) + return 0; + + ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL); + + return ret; +} + +/* perf callbacks */ +static int xen_is_in_guest(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF)) + return 0; + + return 1; +} + +static int xen_is_user_mode(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) + return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER); + else + return !!(xenpmu_data->pmu.r.regs.cpl & 3); +} + +static unsigned long xen_get_guest_ip(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + return xenpmu_data->pmu.r.regs.ip; +} + +static struct perf_guest_info_callbacks xen_guest_cbs = { + .is_in_guest = xen_is_in_guest, + .is_user_mode = xen_is_user_mode, + .get_guest_ip = xen_get_guest_ip, +}; + +/* Convert registers from Xen's format to Linux' */ +static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, + struct pt_regs *regs, uint64_t pmu_flags) +{ + regs->ip = xen_regs->ip; + regs->cs = xen_regs->cs; + regs->sp = xen_regs->sp; + + if (pmu_flags & PMU_SAMPLE_PV) { + if (pmu_flags & PMU_SAMPLE_USER) + regs->cs |= 3; + else + regs->cs &= ~3; + } else { + if (xen_regs->cpl) + regs->cs |= 3; + else + regs->cs &= ~3; + } +} + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) +{ + int err, ret = IRQ_NONE; + struct pt_regs regs; + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return ret; + } + + this_cpu_ptr(&xenpmu_shared)->flags = + xenpmu_flags | XENPMU_IRQ_PROCESSING; + xen_convert_regs(&xenpmu_data->pmu.r.regs, ®s, + xenpmu_data->pmu.pmu_flags); + if (x86_pmu.handle_irq(®s)) + ret = IRQ_HANDLED; + + /* Write out cached context to HW */ + err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL); + this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags; + if (err) { + pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err); + return IRQ_NONE; + } + + return ret; +} + +bool is_xen_pmu(int cpu) +{ + return (get_xenpmu_data() != NULL); +} + +void xen_pmu_init(int cpu) +{ + int err; + struct xen_pmu_params xp; + unsigned long pfn; + struct xen_pmu_data *xenpmu_data; + + BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE); + + if (xen_hvm_domain()) + 
return; + + xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL); + if (!xenpmu_data) { + pr_err("VPMU init: No memory\n"); + return; + } + pfn = virt_to_pfn(xenpmu_data); + + xp.val = pfn_to_mfn(pfn); + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp); + if (err) + goto fail; + + per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data; + per_cpu(xenpmu_shared, cpu).flags = 0; + + if (cpu == 0) { + perf_register_guest_info_callbacks(&xen_guest_cbs); + xen_pmu_arch_init(); + } + + return; + +fail: + pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n", + cpu, err); + free_pages((unsigned long)xenpmu_data, 0); +} + +void xen_pmu_finish(int cpu) +{ + struct xen_pmu_params xp; + + if (xen_hvm_domain()) + return; + + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + + (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp); + + free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0); + per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL; +} diff --git a/kernel/arch/x86/xen/pmu.h b/kernel/arch/x86/xen/pmu.h new file mode 100644 index 000000000..af5f0ad94 --- /dev/null +++ b/kernel/arch/x86/xen/pmu.h @@ -0,0 +1,15 @@ +#ifndef __XEN_PMU_H +#define __XEN_PMU_H + +#include <xen/interface/xenpmu.h> + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); +void xen_pmu_init(int cpu); +void xen_pmu_finish(int cpu); +bool is_xen_pmu(int cpu); +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); +int pmu_apic_update(uint32_t reg); +unsigned long long xen_read_pmc(int counter); + +#endif /* __XEN_PMU_H */ diff --git a/kernel/arch/x86/xen/setup.c b/kernel/arch/x86/xen/setup.c index 55f388ef4..7ab29518a 100644 --- a/kernel/arch/x86/xen/setup.c +++ b/kernel/arch/x86/xen/setup.c @@ -27,17 +27,23 @@ #include <xen/interface/memory.h> #include <xen/interface/physdev.h> #include <xen/features.h> +#include <xen/hvc-console.h> #include "xen-ops.h" #include "vdso.h" -#include "p2m.h" #include "mmu.h" +#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) + /* Amount of extra memory space we add to the e820 ranges */ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; +/* E820 map used during setting up memory. */ +static struct e820entry xen_e820_map[E820MAX] __initdata; +static u32 xen_e820_map_entries __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. 
* The physical page behind this address is remapped as needed to different @@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; */ #define EXTRA_MEM_RATIO (10) -static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) +static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); + +static void __init xen_parse_512gb(void) +{ + bool val = false; + char *arg; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit"); + if (!arg) + return; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit="); + if (!arg) + val = true; + else if (strtobool(arg + strlen("xen_512gb_limit="), &val)) + return; + + xen_512gb_limit = val; +} + +static void __init xen_add_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; + /* + * No need to check for zero size, should happen rarely and will only + * write a new entry regarded to be unused due to zero size. + */ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { /* Add new region. */ - if (xen_extra_mem[i].size == 0) { - xen_extra_mem[i].start = start; - xen_extra_mem[i].size = size; + if (xen_extra_mem[i].n_pfns == 0) { + xen_extra_mem[i].start_pfn = start_pfn; + xen_extra_mem[i].n_pfns = n_pfns; break; } /* Append to existing region. */ - if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { - xen_extra_mem[i].size += size; + if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == + start_pfn) { + xen_extra_mem[i].n_pfns += n_pfns; break; } } if (i == XEN_EXTRA_MEM_MAX_REGIONS) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - memblock_reserve(start, size); + memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } -static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) +static void __init xen_del_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; - phys_addr_t start_r, size_r; + unsigned long start_r, size_r; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - start_r = xen_extra_mem[i].start; - size_r = xen_extra_mem[i].size; + start_r = xen_extra_mem[i].start_pfn; + size_r = xen_extra_mem[i].n_pfns; /* Start of region. */ - if (start_r == start) { - BUG_ON(size > size_r); - xen_extra_mem[i].start += size; - xen_extra_mem[i].size -= size; + if (start_r == start_pfn) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].start_pfn += n_pfns; + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* End of region. */ - if (start_r + size_r == start + size) { - BUG_ON(size > size_r); - xen_extra_mem[i].size -= size; + if (start_r + size_r == start_pfn + n_pfns) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* Mid of region. */ - if (start > start_r && start < start_r + size_r) { - BUG_ON(start + size > start_r + size_r); - xen_extra_mem[i].size = start - start_r; + if (start_pfn > start_r && start_pfn < start_r + size_r) { + BUG_ON(start_pfn + n_pfns > start_r + size_r); + xen_extra_mem[i].n_pfns = start_pfn - start_r; /* Calling memblock_reserve() again is okay. 
*/ - xen_add_extra_mem(start + size, start_r + size_r - - (start + size)); + xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r - + (start_pfn + n_pfns)); break; } } - memblock_free(start, size); + memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } /* @@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) unsigned long __ref xen_chk_extra_mem(unsigned long pfn) { int i; - phys_addr_t addr = PFN_PHYS(pfn); for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (addr >= xen_extra_mem[i].start && - addr < xen_extra_mem[i].start + xen_extra_mem[i].size) + if (pfn >= xen_extra_mem[i].start_pfn && + pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns) return INVALID_P2M_ENTRY; } @@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void) int i; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (!xen_extra_mem[i].size) + if (!xen_extra_mem[i].n_pfns) continue; - pfn_s = PFN_DOWN(xen_extra_mem[i].start); - pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); + pfn_s = xen_extra_mem[i].start_pfn; + pfn_e = pfn_s + xen_extra_mem[i].n_pfns; for (pfn = pfn_s; pfn < pfn_e; pfn++) set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } @@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void) * This function updates min_pfn with the pfn found and returns * the size of that range or zero if not found. */ -static unsigned long __init xen_find_pfn_range( - const struct e820entry *list, size_t map_size, - unsigned long *min_pfn) +static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) { - const struct e820entry *entry; + const struct e820entry *entry = xen_e820_map; unsigned int i; unsigned long done = 0; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; @@ -182,7 +212,7 @@ static unsigned long __init xen_find_pfn_range( e_pfn = PFN_DOWN(entry->addr + entry->size); /* We only care about E820 after this */ - if (e_pfn < *min_pfn) + if (e_pfn <= *min_pfn) continue; s_pfn = PFN_UP(entry->addr); @@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long *released) + unsigned long end_pfn, unsigned long nr_pages) { unsigned long pfn, end; int ret; @@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); if (ret == 1) { - (*released)++; + xen_released_pages++; if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) break; } else @@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. 
*/ static unsigned long __init xen_set_identity_and_remap_chunk( - const struct e820entry *list, size_t map_size, unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, - unsigned long *released, unsigned long *remapped) + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; @@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( if (cur_pfn + size > nr_pages) size = nr_pages - cur_pfn; - remap_range_size = xen_find_pfn_range(list, map_size, - &remap_pfn); + remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warning("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages, released); + cur_pfn + left, nr_pages); break; } /* Adjust size to fit in current e820 RAM region */ @@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk( /* Update variables to reflect new mappings. */ i += size; remap_pfn += size; - *remapped += size; } /* @@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( return remap_pfn; } -static void __init xen_set_identity_and_remap( - const struct e820entry *list, size_t map_size, unsigned long nr_pages, - unsigned long *released, unsigned long *remapped) +static void __init xen_set_identity_and_remap(unsigned long nr_pages) { phys_addr_t start = 0; unsigned long last_pfn = nr_pages; - const struct e820entry *entry; - unsigned long num_released = 0; - unsigned long num_remapped = 0; + const struct e820entry *entry = xen_e820_map; int i; /* @@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap( * example) the DMI tables in a reserved region that begins on * a non-page boundary. 
*/ - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { phys_addr_t end = entry->addr + entry->size; - if (entry->type == E820_RAM || i == map_size - 1) { + if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap( if (start_pfn < end_pfn) last_pfn = xen_set_identity_and_remap_chunk( - list, map_size, start_pfn, - end_pfn, nr_pages, last_pfn, - &num_released, &num_remapped); + start_pfn, end_pfn, nr_pages, + last_pfn); start = end; } } - *released = num_released; - *remapped = num_remapped; - - pr_info("Released %ld page(s)\n", num_released); + pr_info("Released %ld page(s)\n", xen_released_pages); } /* @@ -494,7 +513,7 @@ void __init xen_remap_memory(void) } else if (pfn_s + len == xen_remap_buf.target_pfn) { len += xen_remap_buf.size; } else { - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); pfn_s = xen_remap_buf.target_pfn; len = xen_remap_buf.size; } @@ -504,18 +523,35 @@ void __init xen_remap_memory(void) } if (pfn_s != ~0UL && len) - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); } +static unsigned long __init xen_get_pages_limit(void) +{ + unsigned long limit; + +#ifdef CONFIG_X86_32 + limit = GB(64) / PAGE_SIZE; +#else + limit = MAXMEM / PAGE_SIZE; + if (!xen_initial_domain() && xen_512gb_limit) + limit = GB(512) / PAGE_SIZE; +#endif + return limit; +} + static unsigned long __init xen_get_max_pages(void) { - unsigned long max_pages = MAX_DOMAIN_PAGES; + unsigned long max_pages, limit; domid_t domid = DOMID_SELF; - int ret; + long ret; + + limit = xen_get_pages_limit(); + max_pages = limit; /* * For the initial domain we use the maximum reservation as @@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void) max_pages = ret; } - return min(max_pages, MAX_DOMAIN_PAGES); + return min(max_pages, limit); } static void __init xen_align_and_add_e820_region(phys_addr_t start, @@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start, e820_add_region(start, end - start, type); } -static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size) +static void __init xen_ignore_unusable(void) { - struct e820entry *entry; + struct e820entry *entry = xen_e820_map; unsigned int i; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { if (entry->type == E820_UNUSABLE) entry->type = E820_RAM; } } +static unsigned long __init xen_count_remap_pages(unsigned long max_pfn) +{ + unsigned long extra = 0; + unsigned long start_pfn, end_pfn; + const struct e820entry *entry = xen_e820_map; + int i; + + end_pfn = 0; + for (i = 0; i < xen_e820_map_entries; i++, entry++) { + start_pfn = PFN_DOWN(entry->addr); + /* Adjacent regions on non-page boundaries handling! */ + end_pfn = min(end_pfn, start_pfn); + + if (start_pfn >= max_pfn) + return extra + max_pfn - end_pfn; + + /* Add any holes in map to result. 
*/ + extra += start_pfn - end_pfn; + + end_pfn = PFN_UP(entry->addr + entry->size); + end_pfn = min(end_pfn, max_pfn); + + if (entry->type != E820_RAM) + extra += end_pfn - start_pfn; + } + + return extra; +} + +bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +{ + struct e820entry *entry; + unsigned mapcnt; + phys_addr_t end; + + if (!size) + return false; + + end = start + size; + entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) { + if (entry->type == E820_RAM && entry->addr <= start && + (entry->addr + entry->size) >= end) + return false; + + entry++; + } + + return true; +} + +/* + * Find a free area in physical memory not yet reserved and compliant with + * E820 map. + * Used to relocate pre-allocated areas like initrd or p2m list which are in + * conflict with the to be used E820 map. + * In case no area is found, return 0. Otherwise return the physical address + * of the area which is already reserved for convenience. + */ +phys_addr_t __init xen_find_free_area(phys_addr_t size) +{ + unsigned mapcnt; + phys_addr_t addr, start; + struct e820entry *entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) { + if (entry->type != E820_RAM || entry->size < size) + continue; + start = entry->addr; + for (addr = start; addr < start + size; addr += PAGE_SIZE) { + if (!memblock_is_reserved(addr)) + continue; + start = addr + PAGE_SIZE; + if (start + size > entry->addr + entry->size) + break; + } + if (addr >= start + size) { + memblock_reserve(start, size); + return start; + } + } + + return 0; +} + +/* + * Like memcpy, but with physical addresses for dest and src. + */ +static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src, + phys_addr_t n) +{ + phys_addr_t dest_off, src_off, dest_len, src_len, len; + void *from, *to; + + while (n) { + dest_off = dest & ~PAGE_MASK; + src_off = src & ~PAGE_MASK; + dest_len = n; + if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off) + dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off; + src_len = n; + if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off) + src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off; + len = min(dest_len, src_len); + to = early_memremap(dest - dest_off, dest_len + dest_off); + from = early_memremap(src - src_off, src_len + src_off); + memcpy(to, from, len); + early_memunmap(to, dest_len + dest_off); + early_memunmap(from, src_len + src_off); + n -= len; + dest += len; + src += len; + } +} + +/* + * Reserve Xen mfn_list. + */ +static void __init xen_reserve_xen_mfnlist(void) +{ + phys_addr_t start, size; + + if (xen_start_info->mfn_list >= __START_KERNEL_map) { + start = __pa(xen_start_info->mfn_list); + size = PFN_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + } else { + start = PFN_PHYS(xen_start_info->first_p2m_pfn); + size = PFN_PHYS(xen_start_info->nr_p2m_frames); + } + + if (!xen_is_e820_reserved(start, size)) { + memblock_reserve(start, size); + return; + } + +#ifdef CONFIG_X86_32 + /* + * Relocating the p2m on 32 bit system to an arbitrary virtual address + * is not supported, so just give up. + */ + xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n"); + BUG(); +#else + xen_relocate_p2m(); +#endif +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. 
**/ char * __init xen_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - - unsigned long max_pfn = xen_start_info->nr_pages; - phys_addr_t mem_end; + unsigned long max_pfn, pfn_s, n_pfns; + phys_addr_t mem_end, addr, size, chunk_size; + u32 type; int rc; struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; - unsigned long remapped_pages; int i; int op; - max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + xen_parse_512gb(); + max_pfn = xen_get_pages_limit(); + max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? XENMEM_machine_memory_map : @@ -590,15 +775,16 @@ char * __init xen_memory_setup(void) if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; - map[0].addr = 0ULL; - map[0].size = mem_end; + xen_e820_map[0].addr = 0ULL; + xen_e820_map[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ - map[0].size += 8ULL << 20; - map[0].type = E820_RAM; + xen_e820_map[0].size += 8ULL << 20; + xen_e820_map[0].type = E820_RAM; rc = 0; } BUG_ON(rc); BUG_ON(memmap.nr_entries == 0); + xen_e820_map_entries = memmap.nr_entries; /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE @@ -609,24 +795,19 @@ char * __init xen_memory_setup(void) * a patch in the future. */ if (xen_initial_domain()) - xen_ignore_unusable(map, memmap.nr_entries); + xen_ignore_unusable(); /* Make sure the Xen-supplied memory map is well-ordered. */ - sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); + sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), + &xen_e820_map_entries); max_pages = xen_get_max_pages(); - if (max_pages > max_pfn) - extra_pages += max_pages - max_pfn; - /* - * Set identity map on non-RAM pages and prepare remapping the - * underlying RAM. - */ - xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, - &xen_released_pages, &remapped_pages); + /* How many extra pages do we need due to remapping? */ + max_pages += xen_count_remap_pages(max_pfn); - extra_pages += xen_released_pages; - extra_pages += remapped_pages; + if (max_pages > max_pfn) + extra_pages += max_pages - max_pfn; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO @@ -635,46 +816,57 @@ char * __init xen_memory_setup(void) * is limited to the max size of lowmem, so that it doesn't * get completely filled. * + * Make sure we have no memory above max_pages, as this area + * isn't handled by the p2m management. + * * In principle there could be a problem in lowmem systems if * the initial memory is also very large with respect to * lowmem, but we won't try to deal with that here. 
*/ - extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), - extra_pages); + extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages, max_pages - max_pfn); i = 0; - while (i < memmap.nr_entries) { - phys_addr_t addr = map[i].addr; - phys_addr_t size = map[i].size; - u32 type = map[i].type; + addr = xen_e820_map[0].addr; + size = xen_e820_map[0].size; + while (i < xen_e820_map_entries) { + bool discard = false; + + chunk_size = size; + type = xen_e820_map[i].type; if (type == E820_RAM) { if (addr < mem_end) { - size = min(size, mem_end - addr); + chunk_size = min(size, mem_end - addr); } else if (extra_pages) { - size = min(size, PFN_PHYS(extra_pages)); - extra_pages -= PFN_DOWN(size); - xen_add_extra_mem(addr, size); - xen_max_p2m_pfn = PFN_DOWN(addr + size); + chunk_size = min(size, PFN_PHYS(extra_pages)); + pfn_s = PFN_UP(addr); + n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s; + extra_pages -= n_pfns; + xen_add_extra_mem(pfn_s, n_pfns); + xen_max_p2m_pfn = pfn_s + n_pfns; } else - type = E820_UNUSABLE; + discard = true; } - xen_align_and_add_e820_region(addr, size, type); + if (!discard) + xen_align_and_add_e820_region(addr, chunk_size, type); - map[i].addr += size; - map[i].size -= size; - if (map[i].size == 0) + addr += chunk_size; + size -= chunk_size; + if (size == 0) { i++; + if (i < xen_e820_map_entries) { + addr = xen_e820_map[i].addr; + size = xen_e820_map[i].size; + } + } } /* * Set the rest as identity mapped, in case PCI BARs are * located here. - * - * PFNs above MAX_P2M_PFN are considered identity mapped as - * well. */ - set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); + set_phys_range_identity(addr / PAGE_SIZE, ~0ul); /* * In domU, the ISA region is normal, usable memory, but we @@ -684,34 +876,53 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + /* - * Reserve Xen bits: - * - mfn_list - * - xen_start_info - * See comment above "struct start_info" in <xen/interface/xen.h> - * We tried to make the the memblock_reserve more selective so - * that it would be clear what region is reserved. Sadly we ran - * in the problem wherein on a 64-bit hypervisor with a 32-bit - * initial domain, the pt_base has the cr3 value which is not - * neccessarily where the pagetable starts! As Jan put it: " - * Actually, the adjustment turns out to be correct: The page - * tables for a 32-on-64 dom0 get allocated in the order "first L1", - * "first L2", "first L3", so the offset to the page table base is - * indeed 2. When reading xen/include/public/xen.h's comment - * very strictly, this is not a violation (since there nothing is said - * that the first thing in the page table space is pointed to by - * pt_base; I admit that this seems to be implied though, namely - * do I think that it is implied that the page table space is the - * range [pt_base, pt_base + nt_pt_frames), whereas that - * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), - * which - without a priori knowledge - the kernel would have - * difficulty to figure out)." - so lets just fall back to the - * easy way and reserve the whole region. + * Check whether the kernel itself conflicts with the target E820 map. + * Failing now is better than running into weird problems later due + * to relocating (and even reusing) pages with kernel text or data. 
*/ - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + if (xen_is_e820_reserved(__pa_symbol(_text), + __pa_symbol(__bss_stop) - __pa_symbol(_text))) { + xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); + BUG(); + } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + /* + * Check for a conflict of the hypervisor supplied page tables with + * the target E820 map. + */ + xen_pt_check_e820(); + + xen_reserve_xen_mfnlist(); + + /* Check for a conflict of the initrd with the target E820 map. */ + if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image, + boot_params.hdr.ramdisk_size)) { + phys_addr_t new_area, start, size; + + new_area = xen_find_free_area(boot_params.hdr.ramdisk_size); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n"); + BUG(); + } + + start = boot_params.hdr.ramdisk_image; + size = boot_params.hdr.ramdisk_size; + xen_phys_memcpy(new_area, start, size); + pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n", + start, start + size, new_area, new_area + size); + memblock_free(start, size); + boot_params.hdr.ramdisk_image = new_area; + boot_params.ext_ramdisk_image = new_area >> 32; + } + + /* + * Set identity map on non-RAM pages and prepare remapping the + * underlying RAM. + */ + xen_set_identity_and_remap(max_pfn); return "Xen"; } @@ -721,26 +932,30 @@ char * __init xen_memory_setup(void) */ char * __init xen_auto_xlated_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - struct xen_memory_map memmap; int i; int rc; memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); if (rc < 0) panic("No memory map (%d)\n", rc); - sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries); + xen_e820_map_entries = memmap.nr_entries; + + sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), + &xen_e820_map_entries); - for (i = 0; i < memmap.nr_entries; i++) - e820_add_region(map[i].addr, map[i].size, map[i].type); + for (i = 0; i < xen_e820_map_entries; i++) + e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size, + xen_e820_map[i].type); - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + /* Remove p2m info, it is not needed. */ + xen_start_info->mfn_list = 0; + xen_start_info->first_p2m_pfn = 0; + xen_start_info->nr_p2m_frames = 0; return "Xen"; } @@ -753,17 +968,8 @@ char * __init xen_auto_xlated_memory_setup(void) static void __init fiddle_vdso(void) { #ifdef CONFIG_X86_32 - /* - * This could be called before selected_vdso32 is initialized, so - * just fiddle with both possible images. vdso_image_32_syscall - * can't be selected, since it only exists on 64-bit systems. 
- */ - u32 *mask; - mask = vdso_image_32_int80.data + - vdso_image_32_int80.sym_VDSO32_NOTE_MASK; - *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; - mask = vdso_image_32_sysenter.data + - vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK; + u32 *mask = vdso_image_32.data + + vdso_image_32.sym_VDSO32_NOTE_MASK; *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; #endif } diff --git a/kernel/arch/x86/xen/smp.c b/kernel/arch/x86/xen/smp.c index 864843844..3f4ebf026 100644 --- a/kernel/arch/x86/xen/smp.c +++ b/kernel/arch/x86/xen/smp.c @@ -26,6 +26,7 @@ #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> @@ -38,6 +39,7 @@ #include "xen-ops.h" #include "mmu.h" #include "smp.h" +#include "pmu.h" cpumask_var_t xen_cpu_initialized_map; @@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); @@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu) kfree(per_cpu(xen_irq_work, cpu).name); per_cpu(xen_irq_work, cpu).name = NULL; } + + if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); + per_cpu(xen_pmu_irq, cpu).irq = -1; + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; + } }; static int xen_smp_intr_init(unsigned int cpu) { int rc; - char *resched_name, *callfunc_name, *debug_name; + char *resched_name, *callfunc_name, *debug_name, *pmu_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_irq_work, cpu).irq = rc; per_cpu(xen_irq_work, cpu).name = callfunc_name; + if (is_xen_pmu(cpu)) { + pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, + xen_pmu_irq_handler, + IRQF_PERCPU|IRQF_NOBALANCING, + pmu_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_pmu_irq, cpu).irq = rc; + per_cpu(xen_pmu_irq, cpu).name = pmu_name; + } + return 0; fail: @@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); + xen_pmu_init(0); + if (xen_smp_intr_init(0)) BUG(); @@ -429,7 +453,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) } #endif ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); - ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) BUG(); @@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) if (rc) return rc; + xen_pmu_init(cpu); + rc = xen_smp_intr_init(cpu); if (rc) return rc; @@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu) xen_smp_intr_free(cpu); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); + xen_pmu_finish(cpu); } } diff --git a/kernel/arch/x86/xen/spinlock.c b/kernel/arch/x86/xen/spinlock.c index 956374c1e..9e2ba5c6e 100644 --- a/kernel/arch/x86/xen/spinlock.c +++ 
b/kernel/arch/x86/xen/spinlock.c @@ -17,6 +17,56 @@ #include "xen-ops.h" #include "debugfs.h" +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(char *, irq_name); +static bool xen_pvspin = true; + +#ifdef CONFIG_QUEUED_SPINLOCKS + +#include <asm/qspinlock.h> + +static void xen_qlock_kick(int cpu) +{ + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); +} + +/* + * Halt the current CPU & release it back to the host + */ +static void xen_qlock_wait(u8 *byte, u8 val) +{ + int irq = __this_cpu_read(lock_kicker_irq); + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return; + + /* clear pending */ + xen_clear_irq_pending(irq); + barrier(); + + /* + * We check the byte value after clearing pending IRQ to make sure + * that we won't miss a wakeup event because of the clearing. + * + * The sync_clear_bit() call in xen_clear_irq_pending() is atomic. + * So it is effectively a memory barrier for x86. + */ + if (READ_ONCE(*byte) != val) + return; + + /* + * If an interrupt happens here, it will leave the wakeup irq + * pending, which will cause xen_poll_irq() to return + * immediately. + */ + + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); +} + +#else /* CONFIG_QUEUED_SPINLOCKS */ + enum xen_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -100,12 +150,9 @@ struct xen_lock_waiting { __ticket_t want; }; -static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); static cpumask_t waiting_cpus; -static bool xen_pvspin = true; __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { int irq = __this_cpu_read(lock_kicker_irq); @@ -217,6 +264,7 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) } } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ static irqreturn_t dummy_handler(int irq, void *dev_id) { @@ -280,8 +328,16 @@ void __init xen_init_spinlocks(void) return; } printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); +#ifdef CONFIG_QUEUED_SPINLOCKS + __pv_init_lock_hash(); + pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_lock_ops.wait = xen_qlock_wait; + pv_lock_ops.kick = xen_qlock_kick; +#else pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; +#endif } /* @@ -310,7 +366,7 @@ static __init int xen_parse_nopvspin(char *arg) } early_param("xen_nopvspin", xen_parse_nopvspin); -#ifdef CONFIG_XEN_DEBUG_FS +#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS) static struct dentry *d_spin_debug; diff --git a/kernel/arch/x86/xen/suspend.c b/kernel/arch/x86/xen/suspend.c index 53b4c0811..7f664c416 100644 --- a/kernel/arch/x86/xen/suspend.c +++ b/kernel/arch/x86/xen/suspend.c @@ -1,6 +1,7 @@ #include <linux/types.h> #include <linux/tick.h> +#include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/grant_table.h> #include <xen/events.h> @@ -11,6 +12,7 @@ #include "xen-ops.h" #include "mmu.h" +#include "pmu.h" static void xen_pv_pre_suspend(void) { @@ -32,7 +34,8 @@ static void xen_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_init_shared_info(); + if (!suspend_cancelled) + xen_hvm_init_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { @@ -67,16 +70,16 @@ static void xen_pv_post_suspend(int 
suspend_cancelled) void xen_arch_pre_suspend(void) { - if (xen_pv_domain()) - xen_pv_pre_suspend(); + if (xen_pv_domain()) + xen_pv_pre_suspend(); } void xen_arch_post_suspend(int cancelled) { - if (xen_pv_domain()) - xen_pv_post_suspend(cancelled); - else - xen_hvm_post_suspend(cancelled); + if (xen_pv_domain()) + xen_pv_post_suspend(cancelled); + else + xen_hvm_post_suspend(cancelled); } static void xen_vcpu_notify_restore(void *data) @@ -95,10 +98,20 @@ static void xen_vcpu_notify_suspend(void *data) void xen_arch_resume(void) { + int cpu; + on_each_cpu(xen_vcpu_notify_restore, NULL, 1); + + for_each_online_cpu(cpu) + xen_pmu_init(cpu); } void xen_arch_suspend(void) { + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); + on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); } diff --git a/kernel/arch/x86/xen/time.c b/kernel/arch/x86/xen/time.c index 55da33b1d..f1ba6a092 100644 --- a/kernel/arch/x86/xen/time.c +++ b/kernel/arch/x86/xen/time.c @@ -274,30 +274,18 @@ static s64 get_abs_timeout(unsigned long delta) return xen_clocksource_read() + delta; } -static void xen_timerop_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int xen_timerop_shutdown(struct clock_event_device *evt) { - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - /* unsupported */ - WARN_ON(1); - break; - - case CLOCK_EVT_MODE_ONESHOT: - case CLOCK_EVT_MODE_RESUME: - break; - - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - HYPERVISOR_set_timer_op(0); /* cancel timeout */ - break; - } + /* cancel timeout */ + HYPERVISOR_set_timer_op(0); + + return 0; } static int xen_timerop_set_next_event(unsigned long delta, struct clock_event_device *evt) { - WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + WARN_ON(!clockevent_state_oneshot(evt)); if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0) BUG(); @@ -310,46 +298,39 @@ static int xen_timerop_set_next_event(unsigned long delta, } static const struct clock_event_device xen_timerop_clockevent = { - .name = "xen", - .features = CLOCK_EVT_FEAT_ONESHOT, + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, - .max_delta_ns = 0xffffffff, - .min_delta_ns = TIMER_SLOP, + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, - .mult = 1, - .shift = 0, - .rating = 500, + .mult = 1, + .shift = 0, + .rating = 500, - .set_mode = xen_timerop_set_mode, - .set_next_event = xen_timerop_set_next_event, + .set_state_shutdown = xen_timerop_shutdown, + .set_next_event = xen_timerop_set_next_event, }; +static int xen_vcpuop_shutdown(struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || + HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + + return 0; +} -static void xen_vcpuop_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static int xen_vcpuop_set_oneshot(struct clock_event_device *evt) { int cpu = smp_processor_id(); - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - WARN_ON(1); /* unsupported */ - break; - - case CLOCK_EVT_MODE_ONESHOT: - if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) - BUG(); - break; + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || - HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) - BUG(); - break; - case CLOCK_EVT_MODE_RESUME: - break; - } + return 0; } static int xen_vcpuop_set_next_event(unsigned long delta, @@ 
diff --git a/kernel/arch/x86/xen/time.c b/kernel/arch/x86/xen/time.c
index 55da33b1d..f1ba6a092 100644
--- a/kernel/arch/x86/xen/time.c
+++ b/kernel/arch/x86/xen/time.c
@@ -274,30 +274,18 @@ static s64 get_abs_timeout(unsigned long delta)
         return xen_clocksource_read() + delta;
 }
 
-static void xen_timerop_set_mode(enum clock_event_mode mode,
-                                 struct clock_event_device *evt)
+static int xen_timerop_shutdown(struct clock_event_device *evt)
 {
-        switch (mode) {
-        case CLOCK_EVT_MODE_PERIODIC:
-                /* unsupported */
-                WARN_ON(1);
-                break;
-
-        case CLOCK_EVT_MODE_ONESHOT:
-        case CLOCK_EVT_MODE_RESUME:
-                break;
-
-        case CLOCK_EVT_MODE_UNUSED:
-        case CLOCK_EVT_MODE_SHUTDOWN:
-                HYPERVISOR_set_timer_op(0); /* cancel timeout */
-                break;
-        }
+        /* cancel timeout */
+        HYPERVISOR_set_timer_op(0);
+
+        return 0;
 }
 
 static int xen_timerop_set_next_event(unsigned long delta,
                                       struct clock_event_device *evt)
 {
-        WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+        WARN_ON(!clockevent_state_oneshot(evt));
 
         if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
                 BUG();
@@ -310,46 +298,39 @@ static int xen_timerop_set_next_event(unsigned long delta,
 }
 
 static const struct clock_event_device xen_timerop_clockevent = {
-        .name = "xen",
-        .features = CLOCK_EVT_FEAT_ONESHOT,
+        .name = "xen",
+        .features = CLOCK_EVT_FEAT_ONESHOT,
 
-        .max_delta_ns = 0xffffffff,
-        .min_delta_ns = TIMER_SLOP,
+        .max_delta_ns = 0xffffffff,
+        .min_delta_ns = TIMER_SLOP,
 
-        .mult = 1,
-        .shift = 0,
-        .rating = 500,
+        .mult = 1,
+        .shift = 0,
+        .rating = 500,
 
-        .set_mode = xen_timerop_set_mode,
-        .set_next_event = xen_timerop_set_next_event,
+        .set_state_shutdown = xen_timerop_shutdown,
+        .set_next_event = xen_timerop_set_next_event,
 };
 
+static int xen_vcpuop_shutdown(struct clock_event_device *evt)
+{
+        int cpu = smp_processor_id();
+        if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
+            HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+                BUG();
+
+        return 0;
+}
 
-static void xen_vcpuop_set_mode(enum clock_event_mode mode,
-                                struct clock_event_device *evt)
+static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
 {
         int cpu = smp_processor_id();
 
-        switch (mode) {
-        case CLOCK_EVT_MODE_PERIODIC:
-                WARN_ON(1); /* unsupported */
-                break;
-
-        case CLOCK_EVT_MODE_ONESHOT:
-                if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
-                        BUG();
-                break;
+        if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+                BUG();
 
-        case CLOCK_EVT_MODE_UNUSED:
-        case CLOCK_EVT_MODE_SHUTDOWN:
-                if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
-                    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
-                        BUG();
-                break;
-        case CLOCK_EVT_MODE_RESUME:
-                break;
-        }
+        return 0;
 }
 
 static int xen_vcpuop_set_next_event(unsigned long delta,
@@ -359,7 +340,7 @@ static int xen_vcpuop_set_next_event(unsigned long delta,
         struct vcpu_set_singleshot_timer single;
         int ret;
 
-        WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+        WARN_ON(!clockevent_state_oneshot(evt));
 
         single.timeout_abs_ns = get_abs_timeout(delta);
         single.flags = VCPU_SSHOTTMR_future;
@@ -382,7 +363,8 @@ static const struct clock_event_device xen_vcpuop_clockevent = {
         .shift = 0,
         .rating = 500,
 
-        .set_mode = xen_vcpuop_set_mode,
+        .set_state_shutdown = xen_vcpuop_shutdown,
+        .set_state_oneshot = xen_vcpuop_set_oneshot,
         .set_next_event = xen_vcpuop_set_next_event,
 };
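The time.c hunks above move both Xen clock_event_devices from the single set_mode() callback (a switch over enum clock_event_mode) to the newer per-state callbacks, which return int so the core can detect failures and which simply omit set_state_periodic instead of WARN_ON()-ing on an unsupported mode. A generic skeleton of that interface is sketched below; the demo_* device and handlers are illustrative, not the Xen implementations.

#include <linux/bug.h>
#include <linux/clockchips.h>

static int demo_shutdown(struct clock_event_device *evt)
{
        /* stop the underlying timer hardware; 0 means success */
        return 0;
}

static int demo_set_oneshot(struct clock_event_device *evt)
{
        /* switch the timer to one-shot operation */
        return 0;
}

static int demo_set_next_event(unsigned long delta, struct clock_event_device *evt)
{
        /* the state is queried with helpers instead of reading evt->mode */
        WARN_ON(!clockevent_state_oneshot(evt));
        return 0;
}

static struct clock_event_device demo_clockevent = {
        .name                   = "demo",
        .features               = CLOCK_EVT_FEAT_ONESHOT,
        .rating                 = 100,
        .set_state_shutdown     = demo_shutdown,
        .set_state_oneshot      = demo_set_oneshot,
        .set_next_event         = demo_set_next_event,
};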
diff --git a/kernel/arch/x86/xen/xen-asm_64.S b/kernel/arch/x86/xen/xen-asm_64.S
index 985fc3ee0..f22667abf 100644
--- a/kernel/arch/x86/xen/xen-asm_64.S
+++ b/kernel/arch/x86/xen/xen-asm_64.S
@@ -15,6 +15,8 @@
 #include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
 
 #include <xen/interface/xen.h>
 
@@ -47,29 +49,13 @@ ENTRY(xen_iret)
 ENDPATCH(xen_iret)
 RELOC(xen_iret, 1b+1)
 
-/*
- * sysexit is not used for 64-bit processes, so it's only ever used to
- * return to 32-bit compat userspace.
- */
-ENTRY(xen_sysexit)
-        pushq $__USER32_DS
-        pushq %rcx
-        pushq $X86_EFLAGS_IF
-        pushq $__USER32_CS
-        pushq %rdx
-
-        pushq $0
-1:      jmp hypercall_iret
-ENDPATCH(xen_sysexit)
-RELOC(xen_sysexit, 1b+1)
-
 ENTRY(xen_sysret64)
         /*
          * We're already on the usermode stack at this point, but
          * still with the kernel gs, so we can easily switch back
          */
         movq %rsp, PER_CPU_VAR(rsp_scratch)
-        movq PER_CPU_VAR(kernel_stack), %rsp
+        movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
         pushq $__USER_DS
         pushq PER_CPU_VAR(rsp_scratch)
@@ -88,7 +74,7 @@ ENTRY(xen_sysret32)
          * still with the kernel gs, so we can easily switch back
          */
         movq %rsp, PER_CPU_VAR(rsp_scratch)
-        movq PER_CPU_VAR(kernel_stack), %rsp
+        movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
         pushq $__USER32_DS
         pushq PER_CPU_VAR(rsp_scratch)
@@ -128,7 +114,7 @@ RELOC(xen_sysret32, 1b+1)
 /* Normal 64-bit system call target */
 ENTRY(xen_syscall_target)
         undo_xen_syscall
-        jmp system_call_after_swapgs
+        jmp entry_SYSCALL_64_after_swapgs
 ENDPROC(xen_syscall_target)
 
 #ifdef CONFIG_IA32_EMULATION
@@ -136,13 +122,13 @@ ENDPROC(xen_syscall_target)
 /* 32-bit compat syscall target */
 ENTRY(xen_syscall32_target)
         undo_xen_syscall
-        jmp ia32_cstar_target
+        jmp entry_SYSCALL_compat
 ENDPROC(xen_syscall32_target)
 
 /* 32-bit compat sysenter target */
 ENTRY(xen_sysenter_target)
         undo_xen_syscall
-        jmp ia32_sysenter_target
+        jmp entry_SYSENTER_compat
 ENDPROC(xen_sysenter_target)
 
 #else /* !CONFIG_IA32_EMULATION */
diff --git a/kernel/arch/x86/xen/xen-head.S b/kernel/arch/x86/xen/xen-head.S
index 8afdfccf6..b65f59a35 100644
--- a/kernel/arch/x86/xen/xen-head.S
+++ b/kernel/arch/x86/xen/xen-head.S
@@ -104,6 +104,8 @@ ENTRY(hypercall_page)
         ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
 #else
         ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
+        /* Map the p2m table to a 512GB-aligned user address. */
+        ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE)
 #endif
         ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
         ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
diff --git a/kernel/arch/x86/xen/xen-ops.h b/kernel/arch/x86/xen/xen-ops.h
index bef30cbb5..1399423f3 100644
--- a/kernel/arch/x86/xen/xen-ops.h
+++ b/kernel/arch/x86/xen/xen-ops.h
@@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void);
 void xen_setup_machphys_mapping(void);
 void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_reserve_top(void);
+void __init xen_reserve_special_pages(void);
+void __init xen_pt_check_e820(void);
 
 void xen_mm_pin_all(void);
 void xen_mm_unpin_all(void);
+#ifdef CONFIG_X86_64
+void __init xen_relocate_p2m(void);
+#endif
 
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
 unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
 void __init xen_inv_extra_mem(void);
 void __init xen_remap_memory(void);
+phys_addr_t __init xen_find_free_area(phys_addr_t size);
 char * __init xen_memory_setup(void);
 char * xen_auto_xlated_memory_setup(void);
 void __init xen_arch_setup(void);
@@ -132,7 +139,9 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
+#ifdef CONFIG_X86_32
 __visible void xen_sysexit(void);
+#endif
 __visible void xen_sysret32(void);
 __visible void xen_sysret64(void);
 __visible void xen_adjust_exception_frame(void);
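The "512GB-aligned" wording in the xen-head.S comment above follows from PGDIR_SIZE: with 4-level paging on 64-bit x86, PGDIR_SHIFT is 39, so one top-level page-table entry spans 2^39 bytes. A stand-alone arithmetic check (plain user-space C, not kernel code):

/* Check of the "512GB-aligned" claim: PGDIR_SIZE = 1UL << PGDIR_SHIFT,
 * and PGDIR_SHIFT is 39 on x86-64 with 4-level paging. */
#include <stdio.h>

int main(void)
{
        unsigned long pgdir_size = 1UL << 39;   /* PGDIR_SIZE on x86-64 */

        printf("PGDIR_SIZE = %lu GiB\n", pgdir_size >> 30);   /* prints 512 */
        return 0;
}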