From e09b41010ba33a20a87472ee821fa407a5b8da36 Mon Sep 17 00:00:00 2001
From: José Pekkarinen
Date: Mon, 11 Apr 2016 10:41:07 +0300
Subject: These changes are the raw update to linux-4.4.6-rt14. Kernel sources
 are taken from kernel.org, and the rt patch from the rt wiki download page.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During the rebasing, the following patch collided:
Force tick interrupt and get rid of softirq magic (I70131fb85).
The colliding changes have been dropped because their logic was already
present in the source.

Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen
---
 kernel/arch/x86/include/asm/Kbuild | 2 +-
 kernel/arch/x86/include/asm/acpi.h | 23 +
 kernel/arch/x86/include/asm/alternative-asm.h | 18 +
 kernel/arch/x86/include/asm/alternative.h | 6 +
 kernel/arch/x86/include/asm/amd_nb.h | 13 +-
 kernel/arch/x86/include/asm/apic.h | 118 ++--
 kernel/arch/x86/include/asm/arch_hweight.h | 13 +-
 kernel/arch/x86/include/asm/asm.h | 25 +
 kernel/arch/x86/include/asm/atomic.h | 59 +-
 kernel/arch/x86/include/asm/atomic64_32.h | 14 +
 kernel/arch/x86/include/asm/atomic64_64.h | 27 +-
 kernel/arch/x86/include/asm/barrier.h | 23 +-
 kernel/arch/x86/include/asm/boot.h | 2 +-
 kernel/arch/x86/include/asm/cacheflush.h | 9 +-
 kernel/arch/x86/include/asm/calling.h | 247 --------
 kernel/arch/x86/include/asm/cmpxchg.h | 2 -
 kernel/arch/x86/include/asm/context_tracking.h | 10 -
 kernel/arch/x86/include/asm/cpufeature.h | 11 +-
 kernel/arch/x86/include/asm/crypto/glue_helper.h | 2 +-
 kernel/arch/x86/include/asm/delay.h | 1 +
 kernel/arch/x86/include/asm/dma-mapping.h | 70 +--
 kernel/arch/x86/include/asm/dwarf2.h | 106 +---
 kernel/arch/x86/include/asm/edac.h | 2 +-
 kernel/arch/x86/include/asm/efi.h | 15 +-
 kernel/arch/x86/include/asm/elf.h | 27 +-
 kernel/arch/x86/include/asm/entry_arch.h | 5 +
 kernel/arch/x86/include/asm/espfix.h | 2 +-
 kernel/arch/x86/include/asm/fpu-internal.h | 626 ------------------------------
 kernel/arch/x86/include/asm/fpu/api.h | 48 ++
 kernel/arch/x86/include/asm/fpu/internal.h | 694 +++++++++++++++++++++
 kernel/arch/x86/include/asm/fpu/regset.h | 21 +
 kernel/arch/x86/include/asm/fpu/signal.h | 33 +
 kernel/arch/x86/include/asm/fpu/types.h | 355 +++++++++++
 kernel/arch/x86/include/asm/fpu/xstate.h | 51 ++
 kernel/arch/x86/include/asm/frame.h | 7 +-
 kernel/arch/x86/include/asm/ftrace.h | 4 +-
 kernel/arch/x86/include/asm/hardirq.h | 4 +
 kernel/arch/x86/include/asm/highmem.h | 1 -
 kernel/arch/x86/include/asm/hpet.h | 22 +-
 kernel/arch/x86/include/asm/hugetlb.h | 12 -
 kernel/arch/x86/include/asm/hw_irq.h | 146 +++--
 kernel/arch/x86/include/asm/i387.h | 108 ----
 kernel/arch/x86/include/asm/i8259.h | 1 +
 kernel/arch/x86/include/asm/ia32.h | 13 +-
 kernel/arch/x86/include/asm/intel_pmc_ipc.h | 55 ++
 kernel/arch/x86/include/asm/io.h | 11 +-
 kernel/arch/x86/include/asm/io_apic.h | 114 +---
 kernel/arch/x86/include/asm/iosf_mbi.h | 8 +-
 kernel/arch/x86/include/asm/irq.h | 11 +-
 kernel/arch/x86/include/asm/irq_remapping.h | 80 +--
 kernel/arch/x86/include/asm/irq_vectors.h | 61 +-
 kernel/arch/x86/include/asm/irqdomain.h | 63 ++
 kernel/arch/x86/include/asm/jump_label.h | 23 +-
 kernel/arch/x86/include/asm/kasan.h | 3 +
 kernel/arch/x86/include/asm/kdebug.h | 6 -
 kernel/arch/x86/include/asm/kvm_emulate.h | 19 +-
 kernel/arch/x86/include/asm/kvm_host.h | 204 ++++--
 kernel/arch/x86/include/asm/livepatch.h | 1 +
 kernel/arch/x86/include/asm/math_emu.h | 6 +-
 kernel/arch/x86/include/asm/mce.h | 36 +-
 kernel/arch/x86/include/asm/microcode.h | 26 +-
 kernel/arch/x86/include/asm/microcode_amd.h | 7 +-
 kernel/arch/x86/include/asm/microcode_intel.h | 23 +-
 kernel/arch/x86/include/asm/mmu.h | 2 +
 kernel/arch/x86/include/asm/mmu_context.h | 75 ++-
 kernel/arch/x86/include/asm/mpx.h | 74 +--
 kernel/arch/x86/include/asm/mshyperv.h | 5 +
 kernel/arch/x86/include/asm/msi.h | 7 +
 kernel/arch/x86/include/asm/msr-index.h | 694 +++++++++++++++++++++
 kernel/arch/x86/include/asm/msr.h | 82 ++-
 kernel/arch/x86/include/asm/mtrr.h | 15 +-
 kernel/arch/x86/include/asm/mwait.h | 45 ++
 kernel/arch/x86/include/asm/numachip/numachip.h | 1 +
 .../arch/x86/include/asm/numachip/numachip_csr.h | 153 ++---
 kernel/arch/x86/include/asm/page_64_types.h | 3 -
 kernel/arch/x86/include/asm/page_types.h | 13 +-
 kernel/arch/x86/include/asm/paravirt.h | 75 +--
 kernel/arch/x86/include/asm/paravirt_types.h | 25 +-
 kernel/arch/x86/include/asm/pat.h | 9 +-
 kernel/arch/x86/include/asm/pci.h | 14 +-
 kernel/arch/x86/include/asm/pci_x86.h | 2 -
 kernel/arch/x86/include/asm/perf_event.h | 7 +
 kernel/arch/x86/include/asm/pgtable.h | 47 +-
 kernel/arch/x86/include/asm/pgtable_types.h | 43 +-
 kernel/arch/x86/include/asm/pmc_atom.h | 29 +
 kernel/arch/x86/include/asm/pmem.h | 153 +++++
 kernel/arch/x86/include/asm/preempt.h | 19 +-
 kernel/arch/x86/include/asm/processor.h | 189 +----
 kernel/arch/x86/include/asm/proto.h | 10 +-
 kernel/arch/x86/include/asm/ptrace.h | 1 -
 kernel/arch/x86/include/asm/pvclock-abi.h | 2 +
 kernel/arch/x86/include/asm/pvclock.h | 11 +-
 kernel/arch/x86/include/asm/qrwlock.h | 10 -
 kernel/arch/x86/include/asm/qspinlock.h | 66 ++
 kernel/arch/x86/include/asm/qspinlock_paravirt.h | 6 +
 kernel/arch/x86/include/asm/serial.h | 2 +-
 kernel/arch/x86/include/asm/setup.h | 7 +
 kernel/arch/x86/include/asm/sigcontext.h | 75 +--
 kernel/arch/x86/include/asm/sigframe.h | 18 +-
 kernel/arch/x86/include/asm/signal.h | 6 +-
 kernel/arch/x86/include/asm/simd.h | 2 +-
 kernel/arch/x86/include/asm/smp.h | 10 -
 kernel/arch/x86/include/asm/special_insns.h | 38 ++
 kernel/arch/x86/include/asm/spinlock.h | 5 +
 kernel/arch/x86/include/asm/spinlock_types.h | 4 +
 kernel/arch/x86/include/asm/stackprotector.h | 7 +-
 kernel/arch/x86/include/asm/string_64.h | 5 +-
 kernel/arch/x86/include/asm/suspend_32.h | 2 +-
 kernel/arch/x86/include/asm/suspend_64.h | 2 +-
 kernel/arch/x86/include/asm/syscall.h | 14 +-
 kernel/arch/x86/include/asm/syscalls.h | 1 +
 kernel/arch/x86/include/asm/thread_info.h | 42 +-
 kernel/arch/x86/include/asm/tlbflush.h | 6 +
 kernel/arch/x86/include/asm/topology.h | 4 +-
 kernel/arch/x86/include/asm/trace/irq_vectors.h | 6 +
 kernel/arch/x86/include/asm/trace/mpx.h | 133 ++++
 kernel/arch/x86/include/asm/traps.h | 7 +-
 kernel/arch/x86/include/asm/tsc.h | 19 +-
 kernel/arch/x86/include/asm/uaccess.h | 14 +-
 kernel/arch/x86/include/asm/uaccess_32.h | 4 +
 kernel/arch/x86/include/asm/user.h | 12 +-
 kernel/arch/x86/include/asm/uv/uv_hub.h | 2 +-
 kernel/arch/x86/include/asm/vdso.h | 10 +-
 kernel/arch/x86/include/asm/vm86.h | 57 +-
 kernel/arch/x86/include/asm/vmx.h | 53 +-
 kernel/arch/x86/include/asm/x86_init.h | 22 -
 kernel/arch/x86/include/asm/xcr.h | 49 --
 kernel/arch/x86/include/asm/xen/events.h | 11 +
 kernel/arch/x86/include/asm/xen/hypercall.h | 10 +-
 kernel/arch/x86/include/asm/xen/hypervisor.h | 5 +
 kernel/arch/x86/include/asm/xen/interface.h | 219 ++++-
 kernel/arch/x86/include/asm/xen/page.h | 53 +-
 kernel/arch/x86/include/asm/xor.h | 2 +-
 kernel/arch/x86/include/asm/xor_32.h | 2 +-
 kernel/arch/x86/include/asm/xor_avx.h | 2 +-
 kernel/arch/x86/include/asm/xsave.h | 257 --------
 136 files
changed, 4086 insertions(+), 2755 deletions(-) delete mode 100644 kernel/arch/x86/include/asm/calling.h delete mode 100644 kernel/arch/x86/include/asm/context_tracking.h delete mode 100644 kernel/arch/x86/include/asm/fpu-internal.h create mode 100644 kernel/arch/x86/include/asm/fpu/api.h create mode 100644 kernel/arch/x86/include/asm/fpu/internal.h create mode 100644 kernel/arch/x86/include/asm/fpu/regset.h create mode 100644 kernel/arch/x86/include/asm/fpu/signal.h create mode 100644 kernel/arch/x86/include/asm/fpu/types.h create mode 100644 kernel/arch/x86/include/asm/fpu/xstate.h delete mode 100644 kernel/arch/x86/include/asm/i387.h create mode 100644 kernel/arch/x86/include/asm/intel_pmc_ipc.h create mode 100644 kernel/arch/x86/include/asm/irqdomain.h create mode 100644 kernel/arch/x86/include/asm/msi.h create mode 100644 kernel/arch/x86/include/asm/msr-index.h create mode 100644 kernel/arch/x86/include/asm/pmem.h create mode 100644 kernel/arch/x86/include/asm/qspinlock.h create mode 100644 kernel/arch/x86/include/asm/qspinlock_paravirt.h create mode 100644 kernel/arch/x86/include/asm/trace/mpx.h delete mode 100644 kernel/arch/x86/include/asm/xcr.h delete mode 100644 kernel/arch/x86/include/asm/xsave.h (limited to 'kernel/arch/x86/include/asm') diff --git a/kernel/arch/x86/include/asm/Kbuild b/kernel/arch/x86/include/asm/Kbuild index d55a210a4..aeac434c9 100644 --- a/kernel/arch/x86/include/asm/Kbuild +++ b/kernel/arch/x86/include/asm/Kbuild @@ -9,4 +9,4 @@ generic-y += cputime.h generic-y += dma-contiguous.h generic-y += early_ioremap.h generic-y += mcs_spinlock.h -generic-y += scatterlist.h +generic-y += mm-arch-hooks.h diff --git a/kernel/arch/x86/include/asm/acpi.h b/kernel/arch/x86/include/asm/acpi.h index 3a45668f6..94c18ebfd 100644 --- a/kernel/arch/x86/include/asm/acpi.h +++ b/kernel/arch/x86/include/asm/acpi.h @@ -32,6 +32,10 @@ #include #include +#ifdef CONFIG_ACPI_APEI +# include +#endif + #ifdef CONFIG_ACPI extern int acpi_lapic; extern int acpi_ioapic; @@ -147,4 +151,23 @@ extern int x86_acpi_numa_init(void); #define acpi_unlazy_tlb(x) leave_mm(x) +#ifdef CONFIG_ACPI_APEI +static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) +{ + /* + * We currently have no way to look up the EFI memory map + * attributes for a region in a consistent way, because the + * memmap is discarded after efi_free_boot_services(). So if + * you call efi_mem_attributes() during boot and at runtime, + * you could theoretically see different attributes. + * + * Since we are yet to see any x86 platforms that require + * anything other than PAGE_KERNEL (some arm64 platforms + * require the equivalent of PAGE_KERNEL_NOCACHE), return that + * until we know differently. + */ + return PAGE_KERNEL; +} +#endif + #endif /* _ASM_X86_ACPI_H */ diff --git a/kernel/arch/x86/include/asm/alternative-asm.h b/kernel/arch/x86/include/asm/alternative-asm.h index bdf02eeee..e7636bac7 100644 --- a/kernel/arch/x86/include/asm/alternative-asm.h +++ b/kernel/arch/x86/include/asm/alternative-asm.h @@ -18,6 +18,12 @@ .endm #endif +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ .macro altinstruction_entry orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . @@ -27,6 +33,12 @@ .byte \pad_len .endm +/* + * Define an alternative between two instructions. 
If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. ".skip" directive takes care of proper instruction padding + * in case @newinstr is longer than @oldinstr. + */ .macro ALTERNATIVE oldinstr, newinstr, feature 140: \oldinstr @@ -55,6 +67,12 @@ */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + +/* + * Same as ALTERNATIVE macro above but for two alternatives. If CPU + * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has + * @feature2, it replaces @oldinstr with @feature2. + */ .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 140: \oldinstr diff --git a/kernel/arch/x86/include/asm/alternative.h b/kernel/arch/x86/include/asm/alternative.h index ba32af062..7bfc85bbb 100644 --- a/kernel/arch/x86/include/asm/alternative.h +++ b/kernel/arch/x86/include/asm/alternative.h @@ -52,6 +52,12 @@ struct alt_instr { u8 padlen; /* length of build-time padding */ } __packed; +/* + * Debug flag that can be tested to see whether alternative + * instructions were patched in already: + */ +extern int alternatives_patched; + extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); diff --git a/kernel/arch/x86/include/asm/amd_nb.h b/kernel/arch/x86/include/asm/amd_nb.h index aaac3b2fb..3c56ef1ae 100644 --- a/kernel/arch/x86/include/asm/amd_nb.h +++ b/kernel/arch/x86/include/asm/amd_nb.h @@ -81,7 +81,7 @@ static inline struct amd_northbridge *node_to_amd_nb(int node) return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; } -static inline u16 amd_get_node_id(struct pci_dev *pdev) +static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev) { struct pci_dev *misc; int i; @@ -98,11 +98,22 @@ static inline u16 amd_get_node_id(struct pci_dev *pdev) return 0; } +static inline bool amd_gart_present(void) +{ + /* GART present only on Fam15h, upto model 0fh */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) + return true; + + return false; +} + #else #define amd_nb_num(x) 0 #define amd_nb_has_feature(x) false #define node_to_amd_nb(x) NULL +#define amd_gart_present(x) false #endif diff --git a/kernel/arch/x86/include/asm/apic.h b/kernel/arch/x86/include/asm/apic.h index 976b86a32..a30316bf8 100644 --- a/kernel/arch/x86/include/asm/apic.h +++ b/kernel/arch/x86/include/asm/apic.h @@ -115,6 +115,59 @@ static inline bool apic_is_x2apic_enabled(void) return msr & X2APIC_ENABLE; } +extern void enable_IR_x2apic(void); + +extern int get_physical_broadcast(void); + +extern int lapic_get_maxlvt(void); +extern void clear_local_APIC(void); +extern void disconnect_bsp_APIC(int virt_wire_setup); +extern void disable_local_APIC(void); +extern void lapic_shutdown(void); +extern void sync_Arb_IDs(void); +extern void init_bsp_APIC(void); +extern void setup_local_APIC(void); +extern void init_apic_mappings(void); +void register_lapic_address(unsigned long address); +extern void setup_boot_APIC_clock(void); +extern void setup_secondary_APIC_clock(void); +extern int APIC_init_uniprocessor(void); + +#ifdef CONFIG_X86_64 +static inline int apic_force_enable(unsigned long addr) +{ + return -1; +} +#else +extern int apic_force_enable(unsigned long addr); +#endif + +extern int apic_bsp_setup(bool upmode); +extern void apic_ap_setup(void); + +/* + * On 32bit this is mach-xxx local + */ +#ifdef CONFIG_X86_64 +extern int apic_is_clustered_box(void); +#else +static inline int 
apic_is_clustered_box(void) +{ + return 0; +} +#endif + +extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); + +#else /* !CONFIG_X86_LOCAL_APIC */ +static inline void lapic_shutdown(void) { } +#define local_apic_timer_c2_ok 1 +static inline void init_apic_mappings(void) { } +static inline void disable_local_APIC(void) { } +# define setup_boot_APIC_clock x86_init_noop +# define setup_secondary_APIC_clock x86_init_noop +#endif /* !CONFIG_X86_LOCAL_APIC */ + #ifdef CONFIG_X86_X2APIC /* * Make previous memory operations globally visible before @@ -186,67 +239,14 @@ static inline int x2apic_enabled(void) } #define x2apic_supported() (cpu_has_x2apic) -#else +#else /* !CONFIG_X86_X2APIC */ static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } static inline int x2apic_enabled(void) { return 0; } #define x2apic_mode (0) #define x2apic_supported() (0) -#endif - -extern void enable_IR_x2apic(void); - -extern int get_physical_broadcast(void); - -extern int lapic_get_maxlvt(void); -extern void clear_local_APIC(void); -extern void disconnect_bsp_APIC(int virt_wire_setup); -extern void disable_local_APIC(void); -extern void lapic_shutdown(void); -extern void sync_Arb_IDs(void); -extern void init_bsp_APIC(void); -extern void setup_local_APIC(void); -extern void init_apic_mappings(void); -void register_lapic_address(unsigned long address); -extern void setup_boot_APIC_clock(void); -extern void setup_secondary_APIC_clock(void); -extern int APIC_init_uniprocessor(void); - -#ifdef CONFIG_X86_64 -static inline int apic_force_enable(unsigned long addr) -{ - return -1; -} -#else -extern int apic_force_enable(unsigned long addr); -#endif - -extern int apic_bsp_setup(bool upmode); -extern void apic_ap_setup(void); - -/* - * On 32bit this is mach-xxx local - */ -#ifdef CONFIG_X86_64 -extern int apic_is_clustered_box(void); -#else -static inline int apic_is_clustered_box(void) -{ - return 0; -} -#endif - -extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); - -#else /* !CONFIG_X86_LOCAL_APIC */ -static inline void lapic_shutdown(void) { } -#define local_apic_timer_c2_ok 1 -static inline void init_apic_mappings(void) { } -static inline void disable_local_APIC(void) { } -# define setup_boot_APIC_clock x86_init_noop -# define setup_secondary_APIC_clock x86_init_noop -#endif /* !CONFIG_X86_LOCAL_APIC */ +#endif /* !CONFIG_X86_X2APIC */ #ifdef CONFIG_X86_64 #define SET_APIC_ID(x) (apic->set_apic_id(x)) @@ -313,7 +313,6 @@ struct apic { /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); - bool wait_for_init_deassert; void (*inquire_remote_apic)(int apicid); /* apic ops */ @@ -378,7 +377,6 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[]; * APIC functionality to boot other CPUs - only used on SMP: */ #ifdef CONFIG_SMP -extern atomic_t init_deasserted; extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); #endif @@ -644,6 +642,12 @@ static inline void entering_ack_irq(void) entering_irq(); } +static inline void ipi_entering_ack_irq(void) +{ + ack_APIC_irq(); + irq_enter(); +} + static inline void exiting_irq(void) { irq_exit(); diff --git a/kernel/arch/x86/include/asm/arch_hweight.h b/kernel/arch/x86/include/asm/arch_hweight.h index 9686c3d9f..259a7c1ef 100644 --- a/kernel/arch/x86/include/asm/arch_hweight.h +++ b/kernel/arch/x86/include/asm/arch_hweight.h @@ -21,7 +21,7 @@ * ARCH_HWEIGHT_CFLAGS in for the respective * compiler switches. 
*/ -static inline unsigned int __arch_hweight32(unsigned int w) +static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res = 0; @@ -42,20 +42,23 @@ static inline unsigned int __arch_hweight8(unsigned int w) return __arch_hweight32(w & 0xff); } +#ifdef CONFIG_X86_32 static inline unsigned long __arch_hweight64(__u64 w) { - unsigned long res = 0; - -#ifdef CONFIG_X86_32 return __arch_hweight32((u32)w) + __arch_hweight32((u32)(w >> 32)); +} #else +static __always_inline unsigned long __arch_hweight64(__u64 w) +{ + unsigned long res = 0; + asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT) : "="REG_OUT (res) : REG_IN (w)); -#endif /* CONFIG_X86_32 */ return res; } +#endif /* CONFIG_X86_32 */ #endif diff --git a/kernel/arch/x86/include/asm/asm.h b/kernel/arch/x86/include/asm/asm.h index 7730c1c5c..189679aba 100644 --- a/kernel/arch/x86/include/asm/asm.h +++ b/kernel/arch/x86/include/asm/asm.h @@ -63,6 +63,31 @@ _ASM_ALIGN ; \ _ASM_PTR (entry); \ .popsection + +.macro ALIGN_DESTINATION + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE(100b,103b) + _ASM_EXTABLE(101b,103b) + .endm + #else # define _ASM_EXTABLE(from,to) \ " .pushsection \"__ex_table\",\"a\"\n" \ diff --git a/kernel/arch/x86/include/asm/atomic.h b/kernel/arch/x86/include/asm/atomic.h index 5e5cd123f..ae5fb83e6 100644 --- a/kernel/arch/x86/include/asm/atomic.h +++ b/kernel/arch/x86/include/asm/atomic.h @@ -22,9 +22,9 @@ * * Atomically reads the value of @v. */ -static inline int atomic_read(const atomic_t *v) +static __always_inline int atomic_read(const atomic_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** @@ -34,9 +34,9 @@ static inline int atomic_read(const atomic_t *v) * * Atomically sets the value of @v to @i. */ -static inline void atomic_set(atomic_t *v, int i) +static __always_inline void atomic_set(atomic_t *v, int i) { - v->counter = i; + WRITE_ONCE(v->counter, i); } /** @@ -46,7 +46,7 @@ static inline void atomic_set(atomic_t *v, int i) * * Atomically adds @i to @v. */ -static inline void atomic_add(int i, atomic_t *v) +static __always_inline void atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) @@ -60,7 +60,7 @@ static inline void atomic_add(int i, atomic_t *v) * * Atomically subtracts @i from @v. */ -static inline void atomic_sub(int i, atomic_t *v) +static __always_inline void atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) @@ -76,7 +76,7 @@ static inline void atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ -static inline int atomic_sub_and_test(int i, atomic_t *v) +static __always_inline int atomic_sub_and_test(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e"); } @@ -87,7 +87,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v) * * Atomically increments @v by 1. */ -static inline void atomic_inc(atomic_t *v) +static __always_inline void atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter)); @@ -99,7 +99,7 @@ static inline void atomic_inc(atomic_t *v) * * Atomically decrements @v by 1. 
*/ -static inline void atomic_dec(atomic_t *v) +static __always_inline void atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter)); @@ -113,7 +113,7 @@ static inline void atomic_dec(atomic_t *v) * returns true if the result is 0, or false for all other * cases. */ -static inline int atomic_dec_and_test(atomic_t *v) +static __always_inline int atomic_dec_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); } @@ -126,7 +126,7 @@ static inline int atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static inline int atomic_inc_and_test(atomic_t *v) +static __always_inline int atomic_inc_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); } @@ -140,7 +140,7 @@ static inline int atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int atomic_add_negative(int i, atomic_t *v) +static __always_inline int atomic_add_negative(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s"); } @@ -152,7 +152,7 @@ static inline int atomic_add_negative(int i, atomic_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static inline int atomic_add_return(int i, atomic_t *v) +static __always_inline int atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } @@ -164,7 +164,7 @@ static inline int atomic_add_return(int i, atomic_t *v) * * Atomically subtracts @i from @v and returns @v - @i */ -static inline int atomic_sub_return(int i, atomic_t *v) +static __always_inline int atomic_sub_return(int i, atomic_t *v) { return atomic_add_return(-i, v); } @@ -172,7 +172,7 @@ static inline int atomic_sub_return(int i, atomic_t *v) #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) -static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { return cmpxchg(&v->counter, old, new); } @@ -182,6 +182,21 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } +#define ATOMIC_OP(op) \ +static inline void atomic_##op(int i, atomic_t *v) \ +{ \ + asm volatile(LOCK_PREFIX #op"l %1,%0" \ + : "+m" (v->counter) \ + : "ir" (i) \ + : "memory"); \ +} + +ATOMIC_OP(and) +ATOMIC_OP(or) +ATOMIC_OP(xor) + +#undef ATOMIC_OP + /** * __atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t @@ -191,7 +206,7 @@ static inline int atomic_xchg(atomic_t *v, int new) * Atomically adds @a to @v, so long as @v was not already @u. * Returns the old value of @v. 
*/ -static inline int __atomic_add_unless(atomic_t *v, int a, int u) +static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); @@ -213,22 +228,12 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) * Atomically adds 1 to @v * Returns the new value of @u */ -static inline short int atomic_inc_short(short int *v) +static __always_inline short int atomic_inc_short(short int *v) { asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); return *v; } -/* These are x86-specific, used by some header files */ -#define atomic_clear_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "andl %0,%1" \ - : : "r" (~(mask)), "m" (*(addr)) : "memory") - -#define atomic_set_mask(mask, addr) \ - asm volatile(LOCK_PREFIX "orl %0,%1" \ - : : "r" ((unsigned)(mask)), "m" (*(addr)) \ - : "memory") - #ifdef CONFIG_X86_32 # include #else diff --git a/kernel/arch/x86/include/asm/atomic64_32.h b/kernel/arch/x86/include/asm/atomic64_32.h index b154de75c..a11c30b77 100644 --- a/kernel/arch/x86/include/asm/atomic64_32.h +++ b/kernel/arch/x86/include/asm/atomic64_32.h @@ -313,4 +313,18 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 +#define ATOMIC64_OP(op, c_op) \ +static inline void atomic64_##op(long long i, atomic64_t *v) \ +{ \ + long long old, c = 0; \ + while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ + c = old; \ +} + +ATOMIC64_OP(and, &) +ATOMIC64_OP(or, |) +ATOMIC64_OP(xor, ^) + +#undef ATOMIC64_OP + #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/kernel/arch/x86/include/asm/atomic64_64.h b/kernel/arch/x86/include/asm/atomic64_64.h index f8d273e18..037351022 100644 --- a/kernel/arch/x86/include/asm/atomic64_64.h +++ b/kernel/arch/x86/include/asm/atomic64_64.h @@ -18,7 +18,7 @@ */ static inline long atomic64_read(const atomic64_t *v) { - return ACCESS_ONCE((v)->counter); + return READ_ONCE((v)->counter); } /** @@ -30,7 +30,7 @@ static inline long atomic64_read(const atomic64_t *v) */ static inline void atomic64_set(atomic64_t *v, long i) { - v->counter = i; + WRITE_ONCE(v->counter, i); } /** @@ -40,7 +40,7 @@ static inline void atomic64_set(atomic64_t *v, long i) * * Atomically adds @i to @v. */ -static inline void atomic64_add(long i, atomic64_t *v) +static __always_inline void atomic64_add(long i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) @@ -81,7 +81,7 @@ static inline int atomic64_sub_and_test(long i, atomic64_t *v) * * Atomically increments @v by 1. */ -static inline void atomic64_inc(atomic64_t *v) +static __always_inline void atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) @@ -94,7 +94,7 @@ static inline void atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. 
*/ -static inline void atomic64_dec(atomic64_t *v) +static __always_inline void atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) @@ -148,7 +148,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static inline long atomic64_add_return(long i, atomic64_t *v) +static __always_inline long atomic64_add_return(long i, atomic64_t *v) { return i + xadd(&v->counter, i); } @@ -220,4 +220,19 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } +#define ATOMIC64_OP(op) \ +static inline void atomic64_##op(long i, atomic64_t *v) \ +{ \ + asm volatile(LOCK_PREFIX #op"q %1,%0" \ + : "+m" (v->counter) \ + : "er" (i) \ + : "memory"); \ +} + +ATOMIC64_OP(and) +ATOMIC64_OP(or) +ATOMIC64_OP(xor) + +#undef ATOMIC64_OP + #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/kernel/arch/x86/include/asm/barrier.h b/kernel/arch/x86/include/asm/barrier.h index 959e45b81..0681d2532 100644 --- a/kernel/arch/x86/include/asm/barrier.h +++ b/kernel/arch/x86/include/asm/barrier.h @@ -35,12 +35,12 @@ #define smp_mb() mb() #define smp_rmb() dma_rmb() #define smp_wmb() barrier() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #endif /* SMP */ #define read_barrier_depends() do { } while (0) @@ -57,12 +57,12 @@ do { \ compiletime_assert_atomic_type(*p); \ smp_mb(); \ - ACCESS_ONCE(*p) = (v); \ + WRITE_ONCE(*p, v); \ } while (0) #define smp_load_acquire(p) \ ({ \ - typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ smp_mb(); \ ___p1; \ @@ -74,12 +74,12 @@ do { \ do { \ compiletime_assert_atomic_type(*p); \ barrier(); \ - ACCESS_ONCE(*p) = (v); \ + WRITE_ONCE(*p, v); \ } while (0) #define smp_load_acquire(p) \ ({ \ - typeof(*p) ___p1 = ACCESS_ONCE(*p); \ + typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ barrier(); \ ___p1; \ @@ -91,15 +91,4 @@ do { \ #define smp_mb__before_atomic() barrier() #define smp_mb__after_atomic() barrier() -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - */ -static __always_inline void rdtsc_barrier(void) -{ - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, - "lfence", X86_FEATURE_LFENCE_RDTSC); -} - #endif /* _ASM_X86_BARRIER_H */ diff --git a/kernel/arch/x86/include/asm/boot.h b/kernel/arch/x86/include/asm/boot.h index 4fa687a47..6b8d6e8cd 100644 --- a/kernel/arch/x86/include/asm/boot.h +++ b/kernel/arch/x86/include/asm/boot.h @@ -27,7 +27,7 @@ #define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ -#define BOOT_HEAP_SIZE 0x8000 +#define BOOT_HEAP_SIZE 0x10000 #endif /* !CONFIG_KERNEL_BZIP2 */ diff --git a/kernel/arch/x86/include/asm/cacheflush.h b/kernel/arch/x86/include/asm/cacheflush.h index 47c8e32f6..e63aa38e8 100644 --- a/kernel/arch/x86/include/asm/cacheflush.h +++ b/kernel/arch/x86/include/asm/cacheflush.h @@ -4,11 +4,12 @@ /* Caches aren't brain-dead on the intel. */ #include #include +#include /* * The set_memory_* API can be used to change various attributes of a virtual * address range. 
The attributes include: - * Cachability : UnCached, WriteCombining, WriteBack + * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack * Executability : eXeutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent @@ -35,9 +36,11 @@ int _set_memory_uc(unsigned long addr, int numpages); int _set_memory_wc(unsigned long addr, int numpages); +int _set_memory_wt(unsigned long addr, int numpages); int _set_memory_wb(unsigned long addr, int numpages); int set_memory_uc(unsigned long addr, int numpages); int set_memory_wc(unsigned long addr, int numpages); +int set_memory_wt(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); @@ -48,10 +51,12 @@ int set_memory_4k(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); +int set_memory_array_wt(unsigned long *addr, int addrinarray); int set_memory_array_wb(unsigned long *addr, int addrinarray); int set_pages_array_uc(struct page **pages, int addrinarray); int set_pages_array_wc(struct page **pages, int addrinarray); +int set_pages_array_wt(struct page **pages, int addrinarray); int set_pages_array_wb(struct page **pages, int addrinarray); /* @@ -84,6 +89,8 @@ int set_pages_rw(struct page *page, int numpages); void clflush_cache_range(void *addr, unsigned int size); +#define mmio_flush_range(addr, size) clflush_cache_range(addr, size) + #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; diff --git a/kernel/arch/x86/include/asm/calling.h b/kernel/arch/x86/include/asm/calling.h deleted file mode 100644 index 1c8b50edb..000000000 --- a/kernel/arch/x86/include/asm/calling.h +++ /dev/null @@ -1,247 +0,0 @@ -/* - - x86 function call convention, 64-bit: - ------------------------------------- - arguments | callee-saved | extra caller-saved | return - [callee-clobbered] | | [callee-clobbered] | - --------------------------------------------------------------------------- - rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**] - - ( rsp is obviously invariant across normal function calls. (gcc can 'merge' - functions when it sees tail-call optimization possibilities) rflags is - clobbered. Leftover arguments are passed over the stack frame.) - - [*] In the frame-pointers case rbp is fixed to the stack frame. - - [**] for struct return values wider than 64 bits the return convention is a - bit more complex: up to 128 bits width we return small structures - straight in rax, rdx. For structures larger than that (3 words or - larger) the caller puts a pointer to an on-stack return struct - [allocated in the caller's stack frame] into the first argument - i.e. - into rdi. All other arguments shift up by one in this case. - Fortunately this case is rare in the kernel. - -For 32-bit we have the following conventions - kernel is built with --mregparm=3 and -freg-struct-return: - - x86 function calling convention, 32-bit: - ---------------------------------------- - arguments | callee-saved | extra caller-saved | return - [callee-clobbered] | | [callee-clobbered] | - ------------------------------------------------------------------------- - eax edx ecx | ebx edi esi ebp [*] | | eax, edx [**] - - ( here too esp is obviously invariant across normal function calls. eflags - is clobbered. Leftover arguments are passed over the stack frame. 
) - - [*] In the frame-pointers case ebp is fixed to the stack frame. - - [**] We build with -freg-struct-return, which on 32-bit means similar - semantics as on 64-bit: edx can be used for a second return value - (i.e. covering integer and structure sizes up to 64 bits) - after that - it gets more complex and more expensive: 3-word or larger struct returns - get done in the caller's frame and the pointer to the return struct goes - into regparm0, i.e. eax - the other arguments shift up and the - function's register parameters degenerate to regparm=2 in essence. - -*/ - -#include - -#ifdef CONFIG_X86_64 - -/* - * 64-bit system call stack frame layout defines and helpers, - * for assembly code: - */ - -/* The layout forms the "struct pt_regs" on the stack: */ -/* - * C ABI says these regs are callee-preserved. They aren't saved on kernel entry - * unless syscall needs a complete, fully filled "struct pt_regs". - */ -#define R15 0*8 -#define R14 1*8 -#define R13 2*8 -#define R12 3*8 -#define RBP 4*8 -#define RBX 5*8 -/* These regs are callee-clobbered. Always saved on kernel entry. */ -#define R11 6*8 -#define R10 7*8 -#define R9 8*8 -#define R8 9*8 -#define RAX 10*8 -#define RCX 11*8 -#define RDX 12*8 -#define RSI 13*8 -#define RDI 14*8 -/* - * On syscall entry, this is syscall#. On CPU exception, this is error code. - * On hw interrupt, it's IRQ number: - */ -#define ORIG_RAX 15*8 -/* Return frame for iretq */ -#define RIP 16*8 -#define CS 17*8 -#define EFLAGS 18*8 -#define RSP 19*8 -#define SS 20*8 - -#define SIZEOF_PTREGS 21*8 - - .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 - subq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET 15*8+\addskip - .endm - - .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 - .if \r11 - movq_cfi r11, 6*8+\offset - .endif - .if \r8910 - movq_cfi r10, 7*8+\offset - movq_cfi r9, 8*8+\offset - movq_cfi r8, 9*8+\offset - .endif - .if \rax - movq_cfi rax, 10*8+\offset - .endif - .if \rcx - movq_cfi rcx, 11*8+\offset - .endif - movq_cfi rdx, 12*8+\offset - movq_cfi rsi, 13*8+\offset - movq_cfi rdi, 14*8+\offset - .endm - .macro SAVE_C_REGS offset=0 - SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 - .endm - .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 - SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 - .endm - .macro SAVE_C_REGS_EXCEPT_R891011 - SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 - .endm - .macro SAVE_C_REGS_EXCEPT_RCX_R891011 - SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 - .endm - .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 - SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 - .endm - - .macro SAVE_EXTRA_REGS offset=0 - movq_cfi r15, 0*8+\offset - movq_cfi r14, 1*8+\offset - movq_cfi r13, 2*8+\offset - movq_cfi r12, 3*8+\offset - movq_cfi rbp, 4*8+\offset - movq_cfi rbx, 5*8+\offset - .endm - .macro SAVE_EXTRA_REGS_RBP offset=0 - movq_cfi rbp, 4*8+\offset - .endm - - .macro RESTORE_EXTRA_REGS offset=0 - movq_cfi_restore 0*8+\offset, r15 - movq_cfi_restore 1*8+\offset, r14 - movq_cfi_restore 2*8+\offset, r13 - movq_cfi_restore 3*8+\offset, r12 - movq_cfi_restore 4*8+\offset, rbp - movq_cfi_restore 5*8+\offset, rbx - .endm - - .macro ZERO_EXTRA_REGS - xorl %r15d, %r15d - xorl %r14d, %r14d - xorl %r13d, %r13d - xorl %r12d, %r12d - xorl %ebp, %ebp - xorl %ebx, %ebx - .endm - - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 - .if \rstor_r11 - movq_cfi_restore 6*8, r11 - .endif - .if \rstor_r8910 - movq_cfi_restore 7*8, r10 - movq_cfi_restore 8*8, r9 - movq_cfi_restore 9*8, r8 - .endif - .if \rstor_rax - movq_cfi_restore 10*8, rax - .endif - .if \rstor_rcx - 
movq_cfi_restore 11*8, rcx - .endif - .if \rstor_rdx - movq_cfi_restore 12*8, rdx - .endif - movq_cfi_restore 13*8, rsi - movq_cfi_restore 14*8, rdi - .endm - .macro RESTORE_C_REGS - RESTORE_C_REGS_HELPER 1,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RAX - RESTORE_C_REGS_HELPER 0,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX - RESTORE_C_REGS_HELPER 1,0,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_R11 - RESTORE_C_REGS_HELPER 1,1,0,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX_R11 - RESTORE_C_REGS_HELPER 1,0,0,1,1 - .endm - .macro RESTORE_RSI_RDI - RESTORE_C_REGS_HELPER 0,0,0,0,0 - .endm - .macro RESTORE_RSI_RDI_RDX - RESTORE_C_REGS_HELPER 0,0,0,0,1 - .endm - - .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 - addq $15*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) - .endm - - .macro icebp - .byte 0xf1 - .endm - -#else /* CONFIG_X86_64 */ - -/* - * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These - * are different from the entry_32.S versions in not changing the segment - * registers. So only suitable for in kernel use, not when transitioning - * from or to user space. The resulting stack frame is not a standard - * pt_regs frame. The main use case is calling C code from assembler - * when all the registers need to be preserved. - */ - - .macro SAVE_ALL - pushl_cfi_reg eax - pushl_cfi_reg ebp - pushl_cfi_reg edi - pushl_cfi_reg esi - pushl_cfi_reg edx - pushl_cfi_reg ecx - pushl_cfi_reg ebx - .endm - - .macro RESTORE_ALL - popl_cfi_reg ebx - popl_cfi_reg ecx - popl_cfi_reg edx - popl_cfi_reg esi - popl_cfi_reg edi - popl_cfi_reg ebp - popl_cfi_reg eax - .endm - -#endif /* CONFIG_X86_64 */ - diff --git a/kernel/arch/x86/include/asm/cmpxchg.h b/kernel/arch/x86/include/asm/cmpxchg.h index 99c105d78..ad19841ed 100644 --- a/kernel/arch/x86/include/asm/cmpxchg.h +++ b/kernel/arch/x86/include/asm/cmpxchg.h @@ -4,8 +4,6 @@ #include #include /* Provides LOCK_PREFIX */ -#define __HAVE_ARCH_CMPXCHG 1 - /* * Non-existant functions to indicate usage errors at link time * (or compile-time if the compiler implements __compiletime_error(). 
diff --git a/kernel/arch/x86/include/asm/context_tracking.h b/kernel/arch/x86/include/asm/context_tracking.h deleted file mode 100644 index 1fe49704b..000000000 --- a/kernel/arch/x86/include/asm/context_tracking.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _ASM_X86_CONTEXT_TRACKING_H -#define _ASM_X86_CONTEXT_TRACKING_H - -#ifdef CONFIG_CONTEXT_TRACKING -# define SCHEDULE_USER call schedule_user -#else -# define SCHEDULE_USER call schedule -#endif - -#endif diff --git a/kernel/arch/x86/include/asm/cpufeature.h b/kernel/arch/x86/include/asm/cpufeature.h index 3d6606fb9..f7ba9fbf1 100644 --- a/kernel/arch/x86/include/asm/cpufeature.h +++ b/kernel/arch/x86/include/asm/cpufeature.h @@ -12,7 +12,7 @@ #include #endif -#define NCAPINTS 13 /* N 32-bit words worth of info */ +#define NCAPINTS 14 /* N 32-bit words worth of info */ #define NBUGINTS 1 /* N 32-bit bug flags */ /* @@ -119,6 +119,7 @@ #define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ #define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ #define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ #define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ #define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ @@ -176,6 +177,7 @@ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -191,7 +193,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_HWP ( 7*32+ 10) /* "hwp" Intel HWP */ -#define X86_FEATURE_HWP_NOITFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ +#define X86_FEATURE_HWP_NOTIFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ #define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ @@ -214,6 +216,7 @@ #define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ @@ -239,6 +242,7 @@ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ #define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ @@ -252,6 +256,9 @@ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ + /* * BUG word(s) */ diff --git a/kernel/arch/x86/include/asm/crypto/glue_helper.h b/kernel/arch/x86/include/asm/crypto/glue_helper.h index 
1eef55596..03bb1065c 100644 --- a/kernel/arch/x86/include/asm/crypto/glue_helper.h +++ b/kernel/arch/x86/include/asm/crypto/glue_helper.h @@ -7,7 +7,7 @@ #include #include -#include +#include #include typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); diff --git a/kernel/arch/x86/include/asm/delay.h b/kernel/arch/x86/include/asm/delay.h index 9b3b4f275..36a760bda 100644 --- a/kernel/arch/x86/include/asm/delay.h +++ b/kernel/arch/x86/include/asm/delay.h @@ -4,5 +4,6 @@ #include void use_tsc_delay(void); +void use_mwaitx_delay(void); #endif /* _ASM_X86_DELAY_H */ diff --git a/kernel/arch/x86/include/asm/dma-mapping.h b/kernel/arch/x86/include/asm/dma-mapping.h index 808dae63e..953b7263f 100644 --- a/kernel/arch/x86/include/asm/dma-mapping.h +++ b/kernel/arch/x86/include/asm/dma-mapping.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #ifdef CONFIG_ISA @@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev) #endif } -#include - -/* Make sure we keep the same behaviour */ -static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - debug_dma_mapping_error(dev, dma_addr); - if (ops->mapping_error) - return ops->mapping_error(dev, dma_addr); - - return (dma_addr == DMA_ERROR_CODE); -} - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) +bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); +#define arch_dma_alloc_attrs arch_dma_alloc_attrs +#define HAVE_ARCH_DMA_SUPPORTED 1 extern int dma_supported(struct device *hwdev, u64 mask); -extern int dma_set_mask(struct device *dev, u64 mask); + +#include extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, @@ -125,52 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp) return gfp; } -#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) - -static inline void * -dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - void *memory; - - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - - if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) - return memory; - - if (!dev) - dev = &x86_dma_fallback_dev; - - if (!is_device_dma_capable(dev)) - return NULL; - - if (!ops->alloc) - return NULL; - - memory = ops->alloc(dev, size, dma_handle, - dma_alloc_coherent_gfp_flags(dev, gfp), attrs); - debug_dma_alloc_coherent(dev, size, *dma_handle, memory); - - return memory; -} - -#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL) - -static inline void dma_free_attrs(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus, - struct dma_attrs *attrs) -{ - struct dma_map_ops *ops = get_dma_ops(dev); - - WARN_ON(irqs_disabled()); /* for portability */ - - if (dma_release_from_coherent(dev, get_order(size), vaddr)) - return; - - debug_dma_free_coherent(dev, size, vaddr, bus); - if (ops->free) - ops->free(dev, size, vaddr, bus, attrs); -} - #endif diff --git a/kernel/arch/x86/include/asm/dwarf2.h b/kernel/arch/x86/include/asm/dwarf2.h index de1cdaf4d..b7a1ab865 100644 --- a/kernel/arch/x86/include/asm/dwarf2.h +++ b/kernel/arch/x86/include/asm/dwarf2.h @@ -36,15 +36,22 @@ #endif #if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) +#ifndef BUILD_VDSO /* * Emit CFI data in .debug_frame sections, not 
.eh_frame sections. * The latter we currently just discard since we don't do DWARF * unwinding at runtime. So only the offline DWARF information is - * useful to anyone. Note we should not use this directive if this - * file is used in the vDSO assembly, or if vmlinux.lds.S gets - * changed so it doesn't discard .eh_frame. + * useful to anyone. Note we should not use this directive if + * vmlinux.lds.S gets changed so it doesn't discard .eh_frame. */ .cfi_sections .debug_frame +#else + /* + * For the vDSO, emit both runtime unwind information and debug + * symbols for the .dbg file. + */ + .cfi_sections .eh_frame, .debug_frame +#endif #endif #else @@ -74,97 +81,4 @@ #endif -/* - * An attempt to make CFI annotations more or less - * correct and shorter. It is implied that you know - * what you're doing if you use them. - */ -#ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_64 - .macro pushq_cfi reg - pushq \reg - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro pushq_cfi_reg reg - pushq %\reg - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popq_cfi reg - popq \reg - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro popq_cfi_reg reg - popq %\reg - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE \reg - .endm - - .macro pushfq_cfi - pushfq - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro popfq_cfi - popfq - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro movq_cfi reg offset=0 - movq %\reg, \offset(%rsp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movq_cfi_restore offset reg - movq \offset(%rsp), %\reg - CFI_RESTORE \reg - .endm -#else /*!CONFIG_X86_64*/ - .macro pushl_cfi reg - pushl \reg - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro pushl_cfi_reg reg - pushl %\reg - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popl_cfi reg - popl \reg - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro popl_cfi_reg reg - popl %\reg - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE \reg - .endm - - .macro pushfl_cfi - pushfl - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro popfl_cfi - popfl - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro movl_cfi reg offset=0 - movl %\reg, \offset(%esp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movl_cfi_restore offset reg - movl \offset(%esp), %\reg - CFI_RESTORE \reg - .endm -#endif /*!CONFIG_X86_64*/ -#endif /*__ASSEMBLY__*/ - #endif /* _ASM_X86_DWARF2_H */ diff --git a/kernel/arch/x86/include/asm/edac.h b/kernel/arch/x86/include/asm/edac.h index e9b57ecc7..cf8fdf83b 100644 --- a/kernel/arch/x86/include/asm/edac.h +++ b/kernel/arch/x86/include/asm/edac.h @@ -3,7 +3,7 @@ /* ECC atomic, DMA, SMP and interrupt safe scrub function */ -static inline void atomic_scrub(void *va, u32 size) +static inline void edac_atomic_scrub(void *va, u32 size) { u32 i, *virt_addr = va; diff --git a/kernel/arch/x86/include/asm/efi.h b/kernel/arch/x86/include/asm/efi.h index 3738b138b..0010c78c4 100644 --- a/kernel/arch/x86/include/asm/efi.h +++ b/kernel/arch/x86/include/asm/efi.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H -#include +#include #include /* @@ -86,6 +86,18 @@ extern u64 asmlinkage efi_call(void *fp, ...); extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); +#ifdef CONFIG_KASAN +/* + * CONFIG_KASAN may redefine memset to __memset. __memset function is present + * only in kernel binary. Since the EFI stub linked into a separate binary it + * doesn't have __memset(). So we should use standard memset from + * arch/x86/boot/compressed/string.c. The same applies to memcpy and memmove. 
+ */ +#undef memcpy +#undef memset +#undef memmove +#endif + #endif /* CONFIG_X86_32 */ extern struct efi_scratch efi_scratch; @@ -93,6 +105,7 @@ extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); extern int __init efi_memblock_x86_reserve_range(void); extern pgd_t * __init efi_call_phys_prolog(void); extern void __init efi_call_phys_epilog(pgd_t *save_pgd); +extern void __init efi_print_memmap(void); extern void __init efi_unmap_memmap(void); extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); diff --git a/kernel/arch/x86/include/asm/elf.h b/kernel/arch/x86/include/asm/elf.h index f161c189c..1514753fd 100644 --- a/kernel/arch/x86/include/asm/elf.h +++ b/kernel/arch/x86/include/asm/elf.h @@ -78,7 +78,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #ifdef CONFIG_X86_64 extern unsigned int vdso64_enabled; #endif -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) extern unsigned int vdso32_enabled; #endif @@ -171,11 +171,11 @@ do { \ static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { - /* Commented-out registers are cleared in stub_execve */ - /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0; - regs->si = regs->di /*= regs->bp*/ = 0; + /* ax gets execve's return value. */ + /*regs->ax = */ regs->bx = regs->cx = regs->dx = 0; + regs->si = regs->di = regs->bp = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; - /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/ + regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; t->fs = t->gs = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; @@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t, #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) -void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); -#define compat_start_thread start_thread_ia32 +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp); +#define compat_start_thread compat_start_thread void set_personality_ia32(bool); #define COMPAT_SET_PERSONALITY(ex) \ @@ -328,7 +328,7 @@ else \ #define VDSO_ENTRY \ ((unsigned long)current->mm->context.vdso + \ - selected_vdso32->sym___kernel_vsyscall) + vdso_image_32.sym___kernel_vsyscall) struct linux_binprm; @@ -344,14 +344,9 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, */ static inline int mmap_is_ia32(void) { -#ifdef CONFIG_X86_32 - return 1; -#endif -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_ADDR32)) - return 1; -#endif - return 0; + return config_enabled(CONFIG_X86_32) || + (config_enabled(CONFIG_COMPAT) && + test_thread_flag(TIF_ADDR32)); } /* Do not change the values. 
See get_align_mask() */ diff --git a/kernel/arch/x86/include/asm/entry_arch.h b/kernel/arch/x86/include/asm/entry_arch.h index dc5fa6614..df002992d 100644 --- a/kernel/arch/x86/include/asm/entry_arch.h +++ b/kernel/arch/x86/include/asm/entry_arch.h @@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_HAVE_KVM BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, smp_kvm_posted_intr_ipi) +BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, + smp_kvm_posted_intr_wakeup_ipi) #endif /* @@ -50,4 +52,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif +#ifdef CONFIG_X86_MCE_AMD +BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR) +#endif #endif diff --git a/kernel/arch/x86/include/asm/espfix.h b/kernel/arch/x86/include/asm/espfix.h index 99efebb2f..ca3ce9ab9 100644 --- a/kernel/arch/x86/include/asm/espfix.h +++ b/kernel/arch/x86/include/asm/espfix.h @@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); extern void init_espfix_bsp(void); -extern void init_espfix_ap(void); +extern void init_espfix_ap(int cpu); #endif /* CONFIG_X86_64 */ diff --git a/kernel/arch/x86/include/asm/fpu-internal.h b/kernel/arch/x86/include/asm/fpu-internal.h deleted file mode 100644 index da5e96756..000000000 --- a/kernel/arch/x86/include/asm/fpu-internal.h +++ /dev/null @@ -1,626 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes , May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _FPU_INTERNAL_H -#define _FPU_INTERNAL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_X86_64 -# include -# include -struct ksignal; -int ia32_setup_rt_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -#else -# define user_i387_ia32_struct user_i387_struct -# define user32_fxsr_struct user_fxsr_struct -# define ia32_setup_frame __setup_frame -# define ia32_setup_rt_frame __setup_rt_frame -#endif - -extern unsigned int mxcsr_feature_mask; -extern void fpu_init(void); -extern void eager_fpu_init(void); - -DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); - -extern void convert_from_fxsr(struct user_i387_ia32_struct *env, - struct task_struct *tsk); -extern void convert_to_fxsr(struct task_struct *tsk, - const struct user_i387_ia32_struct *env); - -extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; - -/* - * xstateregs_active == fpregs_active. Please refer to the comment - * at the definition of fpregs_active. - */ -#define xstateregs_active fpregs_active - -#ifdef CONFIG_MATH_EMULATION -extern void finit_soft_fpu(struct i387_soft_struct *soft); -#else -static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} -#endif - -/* - * Must be run with preemption disabled: this clears the fpu_owner_task, - * on this CPU. - * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. 
- */ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) -{ - per_cpu(fpu_owner_task, cpu) = NULL; -} - -/* - * Used to indicate that the FPU state in memory is newer than the FPU - * state in registers, and the FPU state should be reloaded next time the - * task is run. Only safe on the current task, or non-running tasks. - */ -static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk) -{ - tsk->thread.fpu.last_cpu = ~0; -} - -static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) -{ - return new == this_cpu_read_stable(fpu_owner_task) && - cpu == new->thread.fpu.last_cpu; -} - -static inline int is_ia32_compat_frame(void) -{ - return config_enabled(CONFIG_IA32_EMULATION) && - test_thread_flag(TIF_IA32); -} - -static inline int is_ia32_frame(void) -{ - return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); -} - -static inline int is_x32_frame(void) -{ - return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); -} - -#define X87_FSW_ES (1 << 7) /* Exception Summary */ - -static __always_inline __pure bool use_eager_fpu(void) -{ - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); -} - -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has_safe(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has_safe(X86_FEATURE_FXSR); -} - -static inline void fx_finit(struct i387_fxsave_struct *fx) -{ - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; -} - -extern void __sanitize_i387_state(struct task_struct *); - -static inline void sanitize_i387_state(struct task_struct *tsk) -{ - if (!use_xsaveopt()) - return; - __sanitize_i387_state(tsk); -} - -#define user_insn(insn, output, input...) \ -({ \ - int err; \ - asm volatile(ASM_STAC "\n" \ - "1:" #insn "\n\t" \ - "2: " ASM_CLAC "\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define check_insn(insn, output, input...) \ -({ \ - int err; \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -static inline int fsave_user(struct i387_fsave_struct __user *fx) -{ - return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); -} - -static inline int fxsave_user(struct i387_fxsave_struct __user *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - - /* See comment in fpu_fxsave() below. */ - return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); -} - -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - - /* See comment in fpu_fxsave() below. 
*/ - return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), - "m" (*fx)); -} - -static inline int fxrstor_user(struct i387_fxsave_struct __user *fx) -{ - if (config_enabled(CONFIG_X86_32)) - return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - - /* See comment in fpu_fxsave() below. */ - return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), - "m" (*fx)); -} - -static inline int frstor_checking(struct i387_fsave_struct *fx) -{ - return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_user(struct i387_fsave_struct __user *fx) -{ - return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void fpu_fxsave(struct fpu *fpu) -{ - if (config_enabled(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) - asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave)); - else { - /* Using "rex64; fxsave %0" is broken because, if the memory - * operand uses any extended registers for addressing, a second - * REX prefix will be generated (to the assembler, rex64 - * followed by semicolon is a separate instruction), and hence - * the 64-bitness is lost. - * - * Using "fxsaveq %0" would be the ideal choice, but is only - * supported starting with gas 2.16. - * - * Using, as a workaround, the properly prefixed form below - * isn't accepted by any binutils version so far released, - * complaining that the same type of prefix is used twice if - * an extended register is needed for addressing (fix submitted - * to mainline 2005-11-21). - * - * asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave)); - * - * This, however, we can work around by forcing the compiler to - * select an addressing mode that doesn't require extended - * registers. - */ - asm volatile( "rex64/fxsave (%[fx])" - : "=m" (fpu->state->fxsave) - : [fx] "R" (&fpu->state->fxsave)); - } -} - -/* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact. - */ -static inline int fpu_save_init(struct fpu *fpu) -{ - if (use_xsave()) { - fpu_xsave(fpu); - - /* - * xsave header may indicate the init state of the FP. - */ - if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) - return 1; - } else if (use_fxsr()) { - fpu_fxsave(fpu); - } else { - asm volatile("fnsave %[fx]; fwait" - : [fx] "=m" (fpu->state->fsave)); - return 0; - } - - /* - * If exceptions are pending, we need to clear them so - * that we don't randomly get exceptions later. - * - * FIXME! Is this perhaps only true for the old-style - * irq13 case? Maybe we could leave the x87 state - * intact otherwise? - */ - if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) { - asm volatile("fnclex"); - return 0; - } - return 1; -} - -static inline int __save_init_fpu(struct task_struct *tsk) -{ - return fpu_save_init(&tsk->thread.fpu); -} - -static inline int fpu_restore_checking(struct fpu *fpu) -{ - if (use_xsave()) - return fpu_xrstor_checking(&fpu->state->xsave); - else if (use_fxsr()) - return fxrstor_checking(&fpu->state->fxsave); - else - return frstor_checking(&fpu->state->fsave); -} - -static inline int restore_fpu_checking(struct task_struct *tsk) -{ - /* - * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is - * pending. Clear the x87 state here by setting it to fixed values. - * "m" is a random variable that should be in L1. 
- */ - if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { - asm volatile( - "fnclex\n\t" - "emms\n\t" - "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (tsk->thread.fpu.has_fpu)); - } - - return fpu_restore_checking(&tsk->thread.fpu); -} - -/* - * Software FPU state helpers. Careful: these need to - * be preemption protection *and* they need to be - * properly paired with the CR0.TS changes! - */ -static inline int __thread_has_fpu(struct task_struct *tsk) -{ - return tsk->thread.fpu.has_fpu; -} - -/* Must be paired with an 'stts' after! */ -static inline void __thread_clear_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 0; - this_cpu_write(fpu_owner_task, NULL); -} - -/* Must be paired with a 'clts' before! */ -static inline void __thread_set_has_fpu(struct task_struct *tsk) -{ - tsk->thread.fpu.has_fpu = 1; - this_cpu_write(fpu_owner_task, tsk); -} - -/* - * Encapsulate the CR0.TS handling together with the - * software flag. - * - * These generally need preemption protection to work, - * do try to avoid using these on their own. - */ -static inline void __thread_fpu_end(struct task_struct *tsk) -{ - __thread_clear_has_fpu(tsk); - if (!use_eager_fpu()) - stts(); -} - -static inline void __thread_fpu_begin(struct task_struct *tsk) -{ - if (!use_eager_fpu()) - clts(); - __thread_set_has_fpu(tsk); -} - -static inline void drop_fpu(struct task_struct *tsk) -{ - /* - * Forget coprocessor state.. - */ - preempt_disable(); - tsk->thread.fpu_counter = 0; - - if (__thread_has_fpu(tsk)) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - __thread_fpu_end(tsk); - } - - clear_stopped_child_used_math(tsk); - preempt_enable(); -} - -static inline void restore_init_xstate(void) -{ - if (use_xsave()) - xrstor_state(init_xstate_buf, -1); - else - fxrstor_checking(&init_xstate_buf->i387); -} - -/* - * Reset the FPU state in the eager case and drop it in the lazy case (later use - * will reinit it). - */ -static inline void fpu_reset_state(struct task_struct *tsk) -{ - if (!use_eager_fpu()) - drop_fpu(tsk); - else - restore_init_xstate(); -} - -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state and - * sets the new state of the CR0.TS bit. This is - * done within the context of the old process. - * - * - switch_fpu_finish() restores the new state as - * necessary. - */ -typedef struct { int preload; } fpu_switch_t; - -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) -{ - fpu_switch_t fpu; - - /* - * If the task has used the math, pre-load the FPU on xsave processors - * or if the past 5 consecutive context-switches used math. - */ - fpu.preload = tsk_used_math(new) && - (use_eager_fpu() || new->thread.fpu_counter > 5); - - if (__thread_has_fpu(old)) { - if (!__save_init_fpu(old)) - task_disable_lazy_fpu_restore(old); - else - old->thread.fpu.last_cpu = cpu; - - /* But leave fpu_owner_task! */ - old->thread.fpu.has_fpu = 0; - - /* Don't change CR0.TS if we just switch! 
*/ - if (fpu.preload) { - new->thread.fpu_counter++; - __thread_set_has_fpu(new); - prefetch(new->thread.fpu.state); - } else if (!use_eager_fpu()) - stts(); - } else { - old->thread.fpu_counter = 0; - task_disable_lazy_fpu_restore(old); - if (fpu.preload) { - new->thread.fpu_counter++; - if (fpu_lazy_restore(new, cpu)) - fpu.preload = 0; - else - prefetch(new->thread.fpu.state); - __thread_fpu_begin(new); - } - } - return fpu; -} - -/* - * By the time this gets called, we've already cleared CR0.TS and - * given the process the FPU if we are going to preload the FPU - * state - all we need to do is to conditionally restore the register - * state itself. - */ -static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) -{ - if (fpu.preload) { - if (unlikely(restore_fpu_checking(new))) - fpu_reset_state(new); - } -} - -/* - * Signal frame handlers... - */ -extern int save_xstate_sig(void __user *buf, void __user *fx, int size); -extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size); - -static inline int xstate_sigframe_size(void) -{ - return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; -} - -static inline int restore_xstate_sig(void __user *buf, int ia32_frame) -{ - void __user *buf_fx = buf; - int size = xstate_sigframe_size(); - - if (ia32_frame && use_fxsr()) { - buf_fx = buf + sizeof(struct i387_fsave_struct); - size += sizeof(struct i387_fsave_struct); - } - - return __restore_xstate_sig(buf, buf_fx, size); -} - -/* - * Needs to be preemption-safe. - * - * NOTE! user_fpu_begin() must be used only immediately before restoring - * the save state. It does not do any saving/restoring on its own. In - * lazy FPU mode, it is just an optimization to avoid a #NM exception, - * the task can lose the FPU right after preempt_enable(). 
- */ -static inline void user_fpu_begin(void) -{ - preempt_disable(); - if (!user_has_fpu()) - __thread_fpu_begin(current); - preempt_enable(); -} - -static inline void __save_fpu(struct task_struct *tsk) -{ - if (use_xsave()) { - if (unlikely(system_state == SYSTEM_BOOTING)) - xsave_state_booting(&tsk->thread.fpu.state->xsave, -1); - else - xsave_state(&tsk->thread.fpu.state->xsave, -1); - } else - fpu_fxsave(&tsk->thread.fpu); -} - -/* - * i387 state interaction - */ -static inline unsigned short get_fpu_cwd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.cwd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.cwd; - } -} - -static inline unsigned short get_fpu_swd(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - return tsk->thread.fpu.state->fxsave.swd; - } else { - return (unsigned short)tsk->thread.fpu.state->fsave.swd; - } -} - -static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk) -{ - if (cpu_has_xmm) { - return tsk->thread.fpu.state->fxsave.mxcsr; - } else { - return MXCSR_DEFAULT; - } -} - -static bool fpu_allocated(struct fpu *fpu) -{ - return fpu->state != NULL; -} - -static inline int fpu_alloc(struct fpu *fpu) -{ - if (fpu_allocated(fpu)) - return 0; - fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); - if (!fpu->state) - return -ENOMEM; - WARN_ON((unsigned long)fpu->state & 15); - return 0; -} - -static inline void fpu_free(struct fpu *fpu) -{ - if (fpu->state) { - kmem_cache_free(task_xstate_cachep, fpu->state); - fpu->state = NULL; - } -} - -static inline void fpu_copy(struct task_struct *dst, struct task_struct *src) -{ - if (use_eager_fpu()) { - memset(&dst->thread.fpu.state->xsave, 0, xstate_size); - __save_fpu(dst); - } else { - struct fpu *dfpu = &dst->thread.fpu; - struct fpu *sfpu = &src->thread.fpu; - - unlazy_fpu(src); - memcpy(dfpu->state, sfpu->state, xstate_size); - } -} - -static inline unsigned long -alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, - unsigned long *size) -{ - unsigned long frame_size = xstate_sigframe_size(); - - *buf_fx = sp = round_down(sp - frame_size, 64); - if (ia32_frame && use_fxsr()) { - frame_size += sizeof(struct i387_fsave_struct); - sp -= sizeof(struct i387_fsave_struct); - } - - *size = frame_size; - return sp; -} - -#endif diff --git a/kernel/arch/x86/include/asm/fpu/api.h b/kernel/arch/x86/include/asm/fpu/api.h new file mode 100644 index 000000000..1429a7c73 --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/api.h @@ -0,0 +1,48 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + * x86-64 work by Andi Kleen 2002 + */ + +#ifndef _ASM_X86_FPU_API_H +#define _ASM_X86_FPU_API_H + +/* + * Careful: __kernel_fpu_begin/end() must be called with preempt disabled + * and they don't touch the preempt state on their own. + * If you enable preemption after __kernel_fpu_begin(), preempt notifier + * should call the __kernel_fpu_end() to prevent the kernel/user FPU + * state from getting corrupted. KVM for example uses this model. + * + * All other cases use kernel_fpu_begin/end() which disable preemption + * during kernel FPU usage. 
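/*
 * Illustrative sketch (not part of this patch): typical in-kernel SIMD usage
 * built on the entry points declared just below. Only irq_fpu_usable(),
 * kernel_fpu_begin() and kernel_fpu_end() come from this header; the
 * example_checksum_*() helpers are hypothetical. kernel_fpu_begin() disables
 * preemption for the whole region, so the code in between must not sleep.
 */
static void example_checksum(const void *buf, size_t len)
{
        if (!irq_fpu_usable()) {
                /* e.g. called from a context where the FPU may not be used */
                example_checksum_generic(buf, len);
                return;
        }

        kernel_fpu_begin();
        example_checksum_sse(buf, len);         /* may clobber XMM registers */
        kernel_fpu_end();
}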
+ */ +extern void __kernel_fpu_begin(void); +extern void __kernel_fpu_end(void); +extern void kernel_fpu_begin(void); +extern void kernel_fpu_end(void); +extern bool irq_fpu_usable(void); + +/* + * Some instructions like VIA's padlock instructions generate a spurious + * DNA fault but don't modify SSE registers. And these instructions + * get used from interrupt context as well. To prevent these kernel instructions + * in interrupt context interacting wrongly with other user/kernel fpu usage, we + * should use them only in the context of irq_ts_save/restore() + */ +extern int irq_ts_save(void); +extern void irq_ts_restore(int TS_state); + +/* + * Query the presence of one or more xfeatures. Works on any legacy CPU as well. + * + * If 'feature_name' is set then put a human-readable description of + * the feature there as well - this can be used to print error (or success) + * messages. + */ +extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); + +#endif /* _ASM_X86_FPU_API_H */ diff --git a/kernel/arch/x86/include/asm/fpu/internal.h b/kernel/arch/x86/include/asm/fpu/internal.h new file mode 100644 index 000000000..3c3550c3a --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/internal.h @@ -0,0 +1,694 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + * x86-64 work by Andi Kleen 2002 + */ + +#ifndef _ASM_X86_FPU_INTERNAL_H +#define _ASM_X86_FPU_INTERNAL_H + +#include +#include +#include + +#include +#include +#include + +/* + * High level FPU state handling functions: + */ +extern void fpu__activate_curr(struct fpu *fpu); +extern void fpu__activate_fpstate_read(struct fpu *fpu); +extern void fpu__activate_fpstate_write(struct fpu *fpu); +extern void fpu__save(struct fpu *fpu); +extern void fpu__restore(struct fpu *fpu); +extern int fpu__restore_sig(void __user *buf, int ia32_frame); +extern void fpu__drop(struct fpu *fpu); +extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); +extern void fpu__clear(struct fpu *fpu); +extern int fpu__exception_code(struct fpu *fpu, int trap_nr); +extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); + +/* + * Boot time FPU initialization functions: + */ +extern void fpu__init_cpu(void); +extern void fpu__init_system_xstate(void); +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system(struct cpuinfo_x86 *c); +extern void fpu__init_check_bugs(void); +extern void fpu__resume_cpu(void); + +/* + * Debugging facility: + */ +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +#endif + +/* + * FPU related CPU feature flag helper routines: + */ +static __always_inline __pure bool use_eager_fpu(void) +{ + return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); +} + +static __always_inline __pure bool use_xsaveopt(void) +{ + return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); +} + +static __always_inline __pure bool use_xsave(void) +{ + return static_cpu_has_safe(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return static_cpu_has_safe(X86_FEATURE_FXSR); +} + +/* + * fpstate handling functions: + */ + +extern union fpregs_state init_fpstate; + +extern void fpstate_init(union fpregs_state *state); +#ifdef CONFIG_MATH_EMULATION +extern void fpstate_init_soft(struct swregs_state *soft); +#else +static inline void fpstate_init_soft(struct swregs_state *soft) {} +#endif +static inline void 
fpstate_init_fxstate(struct fxregs_state *fx) +{ + fx->cwd = 0x37f; + fx->mxcsr = MXCSR_DEFAULT; +} +extern void fpstate_sanitize_xstate(struct fpu *fpu); + +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + asm volatile(ASM_STAC "\n" \ + "1:" #insn "\n\t" \ + "2: " ASM_CLAC "\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define check_insn(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +static inline int copy_fregs_to_user(struct fregs_state __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + + /* See comment in copy_fxregs_to_kernel() below. */ + return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); +} + +static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) +{ + int err; + + if (config_enabled(CONFIG_X86_32)) { + err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + } else { + if (config_enabled(CONFIG_AS_FXSAVEQ)) { + err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + } else { + /* See comment in copy_fxregs_to_kernel() below. */ + err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); + } + } + /* Copying from a kernel buffer to FPU registers should never fail: */ + WARN_ON_FPU(err); +} + +static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) +{ + if (config_enabled(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + + /* See comment in copy_fxregs_to_kernel() below. */ + return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), + "m" (*fx)); +} + +static inline void copy_kernel_to_fregs(struct fregs_state *fx) +{ + int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + + WARN_ON_FPU(err); +} + +static inline int copy_user_to_fregs(struct fregs_state __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void copy_fxregs_to_kernel(struct fpu *fpu) +{ + if (config_enabled(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); + else if (config_enabled(CONFIG_AS_FXSAVEQ)) + asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); + else { + /* Using "rex64; fxsave %0" is broken because, if the memory + * operand uses any extended registers for addressing, a second + * REX prefix will be generated (to the assembler, rex64 + * followed by semicolon is a separate instruction), and hence + * the 64-bitness is lost. + * + * Using "fxsaveq %0" would be the ideal choice, but is only + * supported starting with gas 2.16. 
+ * + * Using, as a workaround, the properly prefixed form below + * isn't accepted by any binutils version so far released, + * complaining that the same type of prefix is used twice if + * an extended register is needed for addressing (fix submitted + * to mainline 2005-11-21). + * + * asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave)); + * + * This, however, we can work around by forcing the compiler to + * select an addressing mode that doesn't require extended + * registers. + */ + asm volatile( "rex64/fxsave (%[fx])" + : "=m" (fpu->state.fxsave) + : [fx] "R" (&fpu->state.fxsave)); + } +} + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* xstate instruction fault handler: */ +#define xstate_fault(__err) \ + \ + ".section .fixup,\"ax\"\n" \ + \ + "3: movl $-2,%[_err]\n" \ + " jmp 2b\n" \ + \ + ".previous\n" \ + \ + _ASM_EXTABLE(1b, 3b) \ + : [_err] "=r" (__err) + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XSAVES"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + else + asm volatile("1:"XSAVE"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(system_state != SYSTEM_BOOTING); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + asm volatile("1:"XRSTORS"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + else + asm volatile("1:"XRSTOR"\n\t" + "2:\n\t" + xstate_fault(err) + : "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); + + /* We should never fault when copying from a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Save processor xstate to xsave area. + */ +static inline void copy_xregs_to_kernel(struct xregs_state *xstate) +{ + u64 mask = -1; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + WARN_ON(!alternatives_patched); + + /* + * If xsaves is enabled, xsaves replaces xsaveopt because + * it supports compact format and supervisor states in addition to + * modified optimization in xsaveopt. + * + * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave + * because xsaveopt supports modified optimization which is not + * supported by xsave. + * + * If none of xsaves and xsaveopt is enabled, use xsave. 
+ */ + alternative_input_2( + "1:"XSAVE, + XSAVEOPT, + X86_FEATURE_XSAVEOPT, + XSAVES, + X86_FEATURE_XSAVES, + [xstate] "D" (xstate), "a" (lmask), "d" (hmask) : + "memory"); + asm volatile("2:\n\t" + xstate_fault(err) + : "0" (err) + : "memory"); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Restore processor xstate from xsave area. + */ +static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + /* + * Use xrstors to restore context if it is enabled. xrstors supports + * compacted format of xsave area which is not supported by xrstor. + */ + alternative_input( + "1: " XRSTOR, + XRSTORS, + X86_FEATURE_XSAVES, + "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask) + : "memory"); + + asm volatile("2:\n" + xstate_fault(err) + : "0" (err) + : "memory"); + + /* We should never fault when copying from a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for + * backward compatibility for old applications which don't understand + * compacted format of xsave area. + */ +static inline int copy_xregs_to_user(struct xregs_state __user *buf) +{ + int err; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + err = __clear_user(&buf->header, sizeof(buf->header)); + if (unlikely(err)) + return -EFAULT; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XSAVE"\n" + "2: " ASM_CLAC "\n" + xstate_fault(err) + : "D" (buf), "a" (-1), "d" (-1), "0" (err) + : "memory"); + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) +{ + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err = 0; + + __asm__ __volatile__(ASM_STAC "\n" + "1:"XRSTOR"\n" + "2: " ASM_CLAC "\n" + xstate_fault(err) + : "D" (xstate), "a" (lmask), "d" (hmask), "0" (err) + : "memory"); /* memory required? */ + return err; +} + +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact and we can + * keep registers active. + * + * The legacy FNSAVE instruction cleared all FPU state + * unconditionally, so registers are essentially destroyed. + * Modern FPU state can be kept in registers, if there are + * no pending FP exceptions. + */ +static inline int copy_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + copy_xregs_to_kernel(&fpu->state.xsave); + return 1; + } + + if (likely(use_fxsr())) { + copy_fxregs_to_kernel(fpu); + return 1; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to mark them inactive: + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); + + return 0; +} + +static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate) +{ + if (use_xsave()) { + copy_kernel_to_xregs(&fpstate->xsave, -1); + } else { + if (use_fxsr()) + copy_kernel_to_fxregs(&fpstate->fxsave); + else + copy_kernel_to_fregs(&fpstate->fsave); + } +} + +static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) +{ + /* + * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is + * pending. Clear the x87 state here by setting it to fixed values. 
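/*
 * Illustrative sketch (not part of this patch): how a caller is expected to
 * act on the copy_fpregs_to_fpstate() return value defined above. The real
 * fpu__save() in arch/x86/kernel/fpu/core.c follows this shape but may
 * differ in detail; fpregs_deactivate() is defined further below in this
 * header, and this simplified version ignores the eager-FPU case.
 */
static void fpu_save_sketch(struct fpu *fpu)
{
        preempt_disable();
        if (fpu->fpregs_active) {
                if (!copy_fpregs_to_fpstate(fpu)) {
                        /*
                         * FNSAVE destroyed the register contents: the
                         * in-memory copy is now the only valid state, so
                         * mark the registers as no longer owned by this
                         * context.
                         */
                        fpregs_deactivate(fpu);
                }
        }
        preempt_enable();
}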
+ * "m" is a random variable that should be in L1. + */ + if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %P[addr]" /* set F?P to defined value */ + : : [addr] "m" (fpstate)); + } + + __copy_kernel_to_fpregs(fpstate); +} + +extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); + +/* + * FPU context switch related helper methods: + */ + +DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +/* + * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, + * on this CPU. + * + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. + */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; +} + +static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + + +/* + * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' + * idiom, which is then paired with the sw-flag (fpregs_active) later on: + */ + +static inline void __fpregs_activate_hw(void) +{ + if (!use_eager_fpu()) + clts(); +} + +static inline void __fpregs_deactivate_hw(void) +{ + if (!use_eager_fpu()) + stts(); +} + +/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ +static inline void __fpregs_deactivate(struct fpu *fpu) +{ + WARN_ON_FPU(!fpu->fpregs_active); + + fpu->fpregs_active = 0; + this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +/* Must be paired with a 'clts' (fpregs_activate_hw()) before! */ +static inline void __fpregs_activate(struct fpu *fpu) +{ + WARN_ON_FPU(fpu->fpregs_active); + + fpu->fpregs_active = 1; + this_cpu_write(fpu_fpregs_owner_ctx, fpu); +} + +/* + * The question "does this thread have fpu access?" + * is slightly racy, since preemption could come in + * and revoke it immediately after the test. + * + * However, even in that very unlikely scenario, + * we can just assume we have FPU access - typically + * to save the FP state - we'll just take a #NM + * fault and get the FPU access back. + */ +static inline int fpregs_active(void) +{ + return current->thread.fpu.fpregs_active; +} + +/* + * Encapsulate the CR0.TS handling together with the + * software flag. + * + * These generally need preemption protection to work, + * do try to avoid using these on their own. + */ +static inline void fpregs_activate(struct fpu *fpu) +{ + __fpregs_activate_hw(); + __fpregs_activate(fpu); +} + +static inline void fpregs_deactivate(struct fpu *fpu) +{ + __fpregs_deactivate(fpu); + __fpregs_deactivate_hw(); +} + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state and + * sets the new state of the CR0.TS bit. This is + * done within the context of the old process. + * + * - switch_fpu_finish() restores the new state as + * necessary. + */ +typedef struct { int preload; } fpu_switch_t; + +static inline fpu_switch_t +switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) +{ + fpu_switch_t fpu; + + /* + * If the task has used the math, pre-load the FPU on xsave processors + * or if the past 5 consecutive context-switches used math. 
+ */ + fpu.preload = new_fpu->fpstate_active && + (use_eager_fpu() || new_fpu->counter > 5); + + if (old_fpu->fpregs_active) { + if (!copy_fpregs_to_fpstate(old_fpu)) + old_fpu->last_cpu = -1; + else + old_fpu->last_cpu = cpu; + + /* But leave fpu_fpregs_owner_ctx! */ + old_fpu->fpregs_active = 0; + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + new_fpu->counter++; + __fpregs_activate(new_fpu); + prefetch(&new_fpu->state); + } else { + __fpregs_deactivate_hw(); + } + } else { + old_fpu->counter = 0; + old_fpu->last_cpu = -1; + if (fpu.preload) { + new_fpu->counter++; + if (fpu_want_lazy_restore(new_fpu, cpu)) + fpu.preload = 0; + else + prefetch(&new_fpu->state); + fpregs_activate(new_fpu); + } + } + return fpu; +} + +/* + * Misc helper functions: + */ + +/* + * By the time this gets called, we've already cleared CR0.TS and + * given the process the FPU if we are going to preload the FPU + * state - all we need to do is to conditionally restore the register + * state itself. + */ +static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) +{ + if (fpu_switch.preload) + copy_kernel_to_fpregs(&new_fpu->state); +} + +/* + * Needs to be preemption-safe. + * + * NOTE! user_fpu_begin() must be used only immediately before restoring + * the save state. It does not do any saving/restoring on its own. In + * lazy FPU mode, it is just an optimization to avoid a #NM exception, + * the task can lose the FPU right after preempt_enable(). + */ +static inline void user_fpu_begin(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + preempt_disable(); + if (!fpregs_active()) + fpregs_activate(fpu); + preempt_enable(); +} + +/* + * MXCSR and XCR definitions: + */ + +extern unsigned int mxcsr_feature_mask; + +#define XCR_XFEATURE_ENABLED_MASK 0x00000000 + +static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ + : : "a" (eax), "d" (edx), "c" (index)); +} + +#endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/kernel/arch/x86/include/asm/fpu/regset.h b/kernel/arch/x86/include/asm/fpu/regset.h new file mode 100644 index 000000000..39d3107ac --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/regset.h @@ -0,0 +1,21 @@ +/* + * FPU regset handling methods: + */ +#ifndef _ASM_X86_FPU_REGSET_H +#define _ASM_X86_FPU_REGSET_H + +#include + +extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + +/* + * xstateregs_active == regset_fpregs_active. Please refer to the comment + * at the definition of regset_fpregs_active. 
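/*
 * Illustrative sketch (not part of this patch): how the two-stage
 * switch_fpu_prepare()/switch_fpu_finish() protocol declared in
 * fpu/internal.h above is paired across a context switch. This function is
 * a hypothetical, simplified stand-in for the arch __switch_to() path.
 */
static struct task_struct *
switch_to_fpu_sketch(struct task_struct *prev, struct task_struct *next, int cpu)
{
        struct fpu *prev_fpu = &prev->thread.fpu;
        struct fpu *next_fpu = &next->thread.fpu;
        fpu_switch_t fpu_switch;

        /* Stage 1: runs in the context of the outgoing task, saves its state. */
        fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);

        /* ... the low-level stack and register switch happens here ... */

        /* Stage 2: restore the incoming task's FPU state if it was preloaded. */
        switch_fpu_finish(next_fpu, fpu_switch);

        return prev;
}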
+ */ +#define xstateregs_active regset_fpregs_active + +#endif /* _ASM_X86_FPU_REGSET_H */ diff --git a/kernel/arch/x86/include/asm/fpu/signal.h b/kernel/arch/x86/include/asm/fpu/signal.h new file mode 100644 index 000000000..0e970d00d --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/signal.h @@ -0,0 +1,33 @@ +/* + * x86 FPU signal frame handling methods: + */ +#ifndef _ASM_X86_FPU_SIGNAL_H +#define _ASM_X86_FPU_SIGNAL_H + +#ifdef CONFIG_X86_64 +# include +# include +struct ksignal; +int ia32_setup_rt_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct ksignal *ksig, + compat_sigset_t *set, struct pt_regs *regs); +#else +# define user_i387_ia32_struct user_i387_struct +# define user32_fxsr_struct user_fxsr_struct +# define ia32_setup_frame __setup_frame +# define ia32_setup_rt_frame __setup_rt_frame +#endif + +extern void convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk); +extern void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env); + +unsigned long +fpu__alloc_mathframe(unsigned long sp, int ia32_frame, + unsigned long *buf_fx, unsigned long *size); + +extern void fpu__init_prepare_fx_sw_frame(void); + +#endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/kernel/arch/x86/include/asm/fpu/types.h b/kernel/arch/x86/include/asm/fpu/types.h new file mode 100644 index 000000000..1c6f6ac52 --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/types.h @@ -0,0 +1,355 @@ +/* + * FPU data structures: + */ +#ifndef _ASM_X86_FPU_H +#define _ASM_X86_FPU_H + +/* + * The legacy x87 FPU state format, as saved by FSAVE and + * restored by the FRSTOR instructions: + */ +struct fregs_state { + u32 cwd; /* FPU Control Word */ + u32 swd; /* FPU Status Word */ + u32 twd; /* FPU Tag Word */ + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Pointer Offset */ + u32 fos; /* FPU Operand Pointer Selector */ + + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + + /* Software status information [not touched by FSAVE]: */ + u32 status; +}; + +/* + * The legacy fx SSE/MMX FPU state format, as saved by FXSAVE and + * restored by the FXRSTOR instructions. It's similar to the FSAVE + * format, but differs in some areas, plus has extensions at + * the end for the XMM registers. + */ +struct fxregs_state { + u16 cwd; /* Control Word */ + u16 swd; /* Status Word */ + u16 twd; /* Tag Word */ + u16 fop; /* Last Instruction Opcode */ + union { + struct { + u64 rip; /* Instruction Pointer */ + u64 rdp; /* Data Pointer */ + }; + struct { + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Offset */ + u32 fos; /* FPU Operand Selector */ + }; + }; + u32 mxcsr; /* MXCSR Register State */ + u32 mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes: */ + u32 st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes: */ + u32 xmm_space[64]; + + u32 padding[12]; + + union { + u32 padding1[12]; + u32 sw_reserved[12]; + }; + +} __attribute__((aligned(16))); + +/* Default value for fxregs_state.mxcsr: */ +#define MXCSR_DEFAULT 0x1f80 + +/* + * Software based FPU emulation state. 
This is arbitrary really, + * it matches the x87 format to make it easier to understand: + */ +struct swregs_state { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + u8 ftop; + u8 changed; + u8 lookahead; + u8 no_update; + u8 rm; + u8 alimit; + struct math_emu_info *info; + u32 entry_eip; +}; + +/* + * List of XSAVE features Linux knows about: + */ +enum xfeature { + XFEATURE_FP, + XFEATURE_SSE, + /* + * Values above here are "legacy states". + * Those below are "extended states". + */ + XFEATURE_YMM, + XFEATURE_BNDREGS, + XFEATURE_BNDCSR, + XFEATURE_OPMASK, + XFEATURE_ZMM_Hi256, + XFEATURE_Hi16_ZMM, + + XFEATURE_MAX, +}; + +#define XFEATURE_MASK_FP (1 << XFEATURE_FP) +#define XFEATURE_MASK_SSE (1 << XFEATURE_SSE) +#define XFEATURE_MASK_YMM (1 << XFEATURE_YMM) +#define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS) +#define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR) +#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) +#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) +#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) + +#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ + | XFEATURE_MASK_ZMM_Hi256 \ + | XFEATURE_MASK_Hi16_ZMM) + +#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM + +struct reg_128_bit { + u8 regbytes[128/8]; +}; +struct reg_256_bit { + u8 regbytes[256/8]; +}; +struct reg_512_bit { + u8 regbytes[512/8]; +}; + +/* + * State component 2: + * + * There are 16x 256-bit AVX registers named YMM0-YMM15. + * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) + * and are stored in 'struct fxregs_state::xmm_space[]' in the + * "legacy" area. + * + * The high 128 bits are stored here. + */ +struct ymmh_struct { + struct reg_128_bit hi_ymm[16]; +} __packed; + +/* Intel MPX support: */ + +struct mpx_bndreg { + u64 lower_bound; + u64 upper_bound; +} __packed; +/* + * State component 3 is used for the 4 128-bit bounds registers + */ +struct mpx_bndreg_state { + struct mpx_bndreg bndreg[4]; +} __packed; + +/* + * State component 4 is used for the 64-bit user-mode MPX + * configuration register BNDCFGU and the 64-bit MPX status + * register BNDSTATUS. We call the pair "BNDCSR". + */ +struct mpx_bndcsr { + u64 bndcfgu; + u64 bndstatus; +} __packed; + +/* + * The BNDCSR state is padded out to be 64-bytes in size. + */ +struct mpx_bndcsr_state { + union { + struct mpx_bndcsr bndcsr; + u8 pad_to_64_bytes[64]; + }; +} __packed; + +/* AVX-512 Components: */ + +/* + * State component 5 is used for the 8 64-bit opmask registers + * k0-k7 (opmask state). + */ +struct avx_512_opmask_state { + u64 opmask_reg[8]; +} __packed; + +/* + * State component 6 is used for the upper 256 bits of the + * registers ZMM0-ZMM15. These 16 256-bit values are denoted + * ZMM0_H-ZMM15_H (ZMM_Hi256 state). + */ +struct avx_512_zmm_uppers_state { + struct reg_256_bit zmm_upper[16]; +} __packed; + +/* + * State component 7 is used for the 16 512-bit registers + * ZMM16-ZMM31 (Hi16_ZMM state). + */ +struct avx_512_hi16_state { + struct reg_512_bit hi16_zmm[16]; +} __packed; + +struct xstate_header { + u64 xfeatures; + u64 xcomp_bv; + u64 reserved[6]; +} __attribute__((packed)); + +/* + * This is our most modern FPU state format, as saved by the XSAVE + * and restored by the XRSTOR instructions. + * + * It consists of a legacy fxregs portion, an xstate header and + * subsequent areas as defined by the xstate header. 
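/*
 * Illustrative sketch (not part of this patch): the XFEATURE_MASK_* bits
 * defined above combine into group masks that can be tested against an
 * xstate_header's 'xfeatures' word, or probed on the running CPU with
 * cpu_has_xfeatures() from fpu/api.h. The helper below is hypothetical.
 */
static inline bool xfeatures_have_avx512(u64 xfeatures)
{
        /* All three AVX-512 state components must be present together. */
        return (xfeatures & XFEATURE_MASK_AVX512) == XFEATURE_MASK_AVX512;
}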
Not all CPUs + * support all the extensions, so the size of the extended area + * can vary quite a bit between CPUs. + */ +struct xregs_state { + struct fxregs_state i387; + struct xstate_header header; + u8 extended_state_area[0]; +} __attribute__ ((packed, aligned (64))); + +/* + * This is a union of all the possible FPU state formats + * put together, so that we can pick the right one runtime. + * + * The size of the structure is determined by the largest + * member - which is the xsave area. The padding is there + * to ensure that statically-allocated task_structs (just + * the init_task today) have enough space. + */ +union fpregs_state { + struct fregs_state fsave; + struct fxregs_state fxsave; + struct swregs_state soft; + struct xregs_state xsave; + u8 __padding[PAGE_SIZE]; +}; + +/* + * Highest level per task FPU state data structure that + * contains the FPU register state plus various FPU + * state fields: + */ +struct fpu { + /* + * @last_cpu: + * + * Records the last CPU on which this context was loaded into + * FPU registers. (In the lazy-restore case we might be + * able to reuse FPU registers across multiple context switches + * this way, if no intermediate task used the FPU.) + * + * A value of -1 is used to indicate that the FPU state in context + * memory is newer than the FPU state in registers, and that the + * FPU state should be reloaded next time the task is run. + */ + unsigned int last_cpu; + + /* + * @fpstate_active: + * + * This flag indicates whether this context is active: if the task + * is not running then we can restore from this context, if the task + * is running then we should save into this context. + */ + unsigned char fpstate_active; + + /* + * @fpregs_active: + * + * This flag determines whether a given context is actively + * loaded into the FPU's registers and that those registers + * represent the task's current FPU state. + * + * Note the interaction with fpstate_active: + * + * # task does not use the FPU: + * fpstate_active == 0 + * + * # task uses the FPU and regs are active: + * fpstate_active == 1 && fpregs_active == 1 + * + * # the regs are inactive but still match fpstate: + * fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu + * + * The third state is what we use for the lazy restore optimization + * on lazy-switching CPUs. + */ + unsigned char fpregs_active; + + /* + * @counter: + * + * This counter contains the number of consecutive context switches + * during which the FPU stays used. If this is over a threshold, the + * lazy FPU restore logic becomes eager, to save the trap overhead. + * This is an unsigned char so that after 256 iterations the counter + * wraps and the context switch behavior turns lazy again; this is to + * deal with bursty apps that only use the FPU for a short time: + */ + unsigned char counter; + /* + * @state: + * + * In-memory copy of all FPU registers that we save/restore + * over context switches. If the task is using the FPU then + * the registers in the FPU are more recent than this state + * copy. If the task context-switches away then they get + * saved here and represent the FPU state. + * + * After context switches there may be a (short) time period + * during which the in-FPU hardware registers are unchanged + * and still perfectly match this state, if the tasks + * scheduled afterwards are not using the FPU. + * + * This is the 'lazy restore' window of optimization, which + * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. 
+ * + * We detect whether a subsequent task uses the FPU via setting + * CR0::TS to 1, which causes any FPU use to raise a #NM fault. + * + * During this window, if the task gets scheduled again, we + * might be able to skip having to do a restore from this + * memory buffer to the hardware registers - at the cost of + * incurring the overhead of #NM fault traps. + * + * Note that on modern CPUs that support the XSAVEOPT (or other + * optimized XSAVE instructions), we don't use #NM traps anymore, + * as the hardware can track whether FPU registers need saving + * or not. On such CPUs we activate the non-lazy ('eagerfpu') + * logic, which unconditionally saves/restores all FPU state + * across context switches. (if FPU state exists.) + */ + union fpregs_state state; + /* + * WARNING: 'state' is dynamically-sized. Do not put + * anything after it here. + */ +}; + +#endif /* _ASM_X86_FPU_H */ diff --git a/kernel/arch/x86/include/asm/fpu/xstate.h b/kernel/arch/x86/include/asm/fpu/xstate.h new file mode 100644 index 000000000..3a6c89b70 --- /dev/null +++ b/kernel/arch/x86/include/asm/fpu/xstate.h @@ -0,0 +1,51 @@ +#ifndef __ASM_X86_XSAVE_H +#define __ASM_X86_XSAVE_H + +#include +#include +#include + +/* Bit 63 of XCR0 is reserved for future expansion */ +#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) + +#define XSTATE_CPUID 0x0000000d + +#define FXSAVE_SIZE 512 + +#define XSAVE_HDR_SIZE 64 +#define XSAVE_HDR_OFFSET FXSAVE_SIZE + +#define XSAVE_YMM_SIZE 256 +#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) + +/* Supported features which support lazy state saving */ +#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM) + +/* Supported features which require eager state saving */ +#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR) + +/* All currently supported features */ +#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +extern unsigned int xstate_size; +extern u64 xfeatures_mask; +extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); + +void fpu__xstate_clear_all_cpu_caps(void); +void *get_xsave_addr(struct xregs_state *xsave, int xstate); +const void *get_xsave_field_ptr(int xstate_field); + +#endif diff --git a/kernel/arch/x86/include/asm/frame.h b/kernel/arch/x86/include/asm/frame.h index 3b629f47e..793179cf8 100644 --- a/kernel/arch/x86/include/asm/frame.h +++ b/kernel/arch/x86/include/asm/frame.h @@ -1,20 +1,17 @@ #ifdef __ASSEMBLY__ #include -#include /* The annotation hides the frame from the unwinder and makes it look like a ordinary ebp save/restore. 
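/*
 * Illustrative sketch (not part of this patch): in the standard
 * (non-compacted) XSAVE layout, the offsets defined in fpu/xstate.h above
 * compose as a 512-byte legacy area followed by a 64-byte header, so
 * XSAVE_YMM_OFFSET evaluates to 576. Raw pointer arithmetic like this is
 * only valid for the standard format; get_xsave_addr(), declared above, is
 * the proper interface and also handles components that are not present.
 */
static void *ymm_region_sketch(struct xregs_state *xsave)
{
        /* YMM upper halves (state component 2) in a standard-format buffer: */
        return (void *)xsave + XSAVE_YMM_OFFSET;
}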
This avoids some special cases for frame pointer later */ #ifdef CONFIG_FRAME_POINTER .macro FRAME - __ASM_SIZE(push,_cfi) %__ASM_REG(bp) - CFI_REL_OFFSET __ASM_REG(bp), 0 + __ASM_SIZE(push,) %__ASM_REG(bp) __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) .endm .macro ENDFRAME - __ASM_SIZE(pop,_cfi) %__ASM_REG(bp) - CFI_RESTORE __ASM_REG(bp) + __ASM_SIZE(pop,) %__ASM_REG(bp) .endm #else .macro FRAME diff --git a/kernel/arch/x86/include/asm/ftrace.h b/kernel/arch/x86/include/asm/ftrace.h index f45acad3c..24938852d 100644 --- a/kernel/arch/x86/include/asm/ftrace.h +++ b/kernel/arch/x86/include/asm/ftrace.h @@ -3,9 +3,9 @@ #ifdef CONFIG_FUNCTION_TRACER #ifdef CC_USING_FENTRY -# define MCOUNT_ADDR ((long)(__fentry__)) +# define MCOUNT_ADDR ((unsigned long)(__fentry__)) #else -# define MCOUNT_ADDR ((long)(mcount)) +# define MCOUNT_ADDR ((unsigned long)(mcount)) #endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ diff --git a/kernel/arch/x86/include/asm/hardirq.h b/kernel/arch/x86/include/asm/hardirq.h index 0f5fb6b65..7178043b0 100644 --- a/kernel/arch/x86/include/asm/hardirq.h +++ b/kernel/arch/x86/include/asm/hardirq.h @@ -14,6 +14,7 @@ typedef struct { #endif #ifdef CONFIG_HAVE_KVM unsigned int kvm_posted_intr_ipis; + unsigned int kvm_posted_intr_wakeup_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; @@ -33,6 +34,9 @@ typedef struct { #ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; #endif +#ifdef CONFIG_X86_MCE_AMD + unsigned int irq_deferred_error_count; +#endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) unsigned int irq_hv_callback_count; #endif diff --git a/kernel/arch/x86/include/asm/highmem.h b/kernel/arch/x86/include/asm/highmem.h index 04e9d0231..1c0b43724 100644 --- a/kernel/arch/x86/include/asm/highmem.h +++ b/kernel/arch/x86/include/asm/highmem.h @@ -68,7 +68,6 @@ void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); -struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) diff --git a/kernel/arch/x86/include/asm/hpet.h b/kernel/arch/x86/include/asm/hpet.h index 36f712594..cc285ec4b 100644 --- a/kernel/arch/x86/include/asm/hpet.h +++ b/kernel/arch/x86/include/asm/hpet.h @@ -63,10 +63,10 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; -extern int boot_hpet_disable; +extern bool boot_hpet_disable; extern u8 hpet_blockid; -extern int hpet_force_user; -extern u8 hpet_msi_disable; +extern bool hpet_force_user; +extern bool hpet_msi_disable; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); @@ -74,20 +74,16 @@ extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); struct irq_data; +struct hpet_dev; +struct irq_domain; + extern void hpet_msi_unmask(struct irq_data *data); extern void hpet_msi_mask(struct irq_data *data); -struct hpet_dev; extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg); extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg); - -#ifdef CONFIG_PCI_MSI -extern int default_setup_hpet_msi(unsigned int irq, unsigned int id); -#else -static inline int default_setup_hpet_msi(unsigned int irq, unsigned int id) -{ - return -EINVAL; -} -#endif +extern struct irq_domain *hpet_create_irq_domain(int hpet_id); +extern int hpet_assign_irq(struct irq_domain *domain, + struct 
hpet_dev *dev, int dev_num); #ifdef CONFIG_HPET_EMULATE_RTC diff --git a/kernel/arch/x86/include/asm/hugetlb.h b/kernel/arch/x86/include/asm/hugetlb.h index 68c05398b..f8a29d2c9 100644 --- a/kernel/arch/x86/include/asm/hugetlb.h +++ b/kernel/arch/x86/include/asm/hugetlb.h @@ -26,9 +26,6 @@ static inline int prepare_hugepage_range(struct file *file, return 0; } -static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) { -} - static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, @@ -83,15 +80,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return *ptep; } -static inline int arch_prepare_hugepage(struct page *page) -{ - return 0; -} - -static inline void arch_release_hugepage(struct page *page) -{ -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/kernel/arch/x86/include/asm/hw_irq.h b/kernel/arch/x86/include/asm/hw_irq.h index e9571ddab..1e3408e88 100644 --- a/kernel/arch/x86/include/asm/hw_irq.h +++ b/kernel/arch/x86/include/asm/hw_irq.h @@ -29,6 +29,7 @@ extern asmlinkage void apic_timer_interrupt(void); extern asmlinkage void x86_platform_ipi(void); extern asmlinkage void kvm_posted_intr_ipi(void); +extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); @@ -36,43 +37,10 @@ extern asmlinkage void spurious_interrupt(void); extern asmlinkage void thermal_interrupt(void); extern asmlinkage void reschedule_interrupt(void); -extern asmlinkage void invalidate_interrupt(void); -extern asmlinkage void invalidate_interrupt0(void); -extern asmlinkage void invalidate_interrupt1(void); -extern asmlinkage void invalidate_interrupt2(void); -extern asmlinkage void invalidate_interrupt3(void); -extern asmlinkage void invalidate_interrupt4(void); -extern asmlinkage void invalidate_interrupt5(void); -extern asmlinkage void invalidate_interrupt6(void); -extern asmlinkage void invalidate_interrupt7(void); -extern asmlinkage void invalidate_interrupt8(void); -extern asmlinkage void invalidate_interrupt9(void); -extern asmlinkage void invalidate_interrupt10(void); -extern asmlinkage void invalidate_interrupt11(void); -extern asmlinkage void invalidate_interrupt12(void); -extern asmlinkage void invalidate_interrupt13(void); -extern asmlinkage void invalidate_interrupt14(void); -extern asmlinkage void invalidate_interrupt15(void); -extern asmlinkage void invalidate_interrupt16(void); -extern asmlinkage void invalidate_interrupt17(void); -extern asmlinkage void invalidate_interrupt18(void); -extern asmlinkage void invalidate_interrupt19(void); -extern asmlinkage void invalidate_interrupt20(void); -extern asmlinkage void invalidate_interrupt21(void); -extern asmlinkage void invalidate_interrupt22(void); -extern asmlinkage void invalidate_interrupt23(void); -extern asmlinkage void invalidate_interrupt24(void); -extern asmlinkage void invalidate_interrupt25(void); -extern asmlinkage void invalidate_interrupt26(void); -extern asmlinkage void invalidate_interrupt27(void); -extern asmlinkage void invalidate_interrupt28(void); -extern asmlinkage void invalidate_interrupt29(void); -extern asmlinkage void invalidate_interrupt30(void); -extern asmlinkage void invalidate_interrupt31(void); - extern asmlinkage void irq_move_cleanup_interrupt(void); extern asmlinkage void reboot_interrupt(void); extern asmlinkage void threshold_interrupt(void); +extern asmlinkage void deferred_error_interrupt(void); extern 
asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); @@ -87,60 +55,93 @@ extern void trace_spurious_interrupt(void); extern void trace_thermal_interrupt(void); extern void trace_reschedule_interrupt(void); extern void trace_threshold_interrupt(void); +extern void trace_deferred_error_interrupt(void); extern void trace_call_function_interrupt(void); extern void trace_call_function_single_interrupt(void); #define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt #define trace_reboot_interrupt reboot_interrupt #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi +#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi #endif /* CONFIG_TRACING */ -#ifdef CONFIG_IRQ_REMAP -/* Intel specific interrupt remapping information */ -struct irq_2_iommu { - struct intel_iommu *iommu; - u16 irte_index; - u16 sub_handle; - u8 irte_mask; -}; - -/* AMD specific interrupt remapping information */ -struct irq_2_irte { - u16 devid; /* Device ID for IRTE table */ - u16 index; /* Index into IRTE table*/ -}; -#endif /* CONFIG_IRQ_REMAP */ - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; +struct pci_dev; +struct msi_desc; + +enum irq_alloc_type { + X86_IRQ_ALLOC_TYPE_IOAPIC = 1, + X86_IRQ_ALLOC_TYPE_HPET, + X86_IRQ_ALLOC_TYPE_MSI, + X86_IRQ_ALLOC_TYPE_MSIX, + X86_IRQ_ALLOC_TYPE_DMAR, + X86_IRQ_ALLOC_TYPE_UV, +}; -struct irq_cfg { - cpumask_var_t domain; - cpumask_var_t old_domain; - u8 vector; - u8 move_in_progress : 1; -#ifdef CONFIG_IRQ_REMAP - u8 remapped : 1; +struct irq_alloc_info { + enum irq_alloc_type type; + u32 flags; + const struct cpumask *mask; /* CPU mask for vector allocation */ union { - struct irq_2_iommu irq_2_iommu; - struct irq_2_irte irq_2_irte; - }; + int unused; +#ifdef CONFIG_HPET_TIMER + struct { + int hpet_id; + int hpet_index; + void *hpet_data; + }; #endif - union { -#ifdef CONFIG_X86_IO_APIC +#ifdef CONFIG_PCI_MSI struct { - struct list_head irq_2_pin; + struct pci_dev *msi_dev; + irq_hw_number_t msi_hwirq; + }; +#endif +#ifdef CONFIG_X86_IO_APIC + struct { + int ioapic_id; + int ioapic_pin; + int ioapic_node; + u32 ioapic_trigger : 1; + u32 ioapic_polarity : 1; + u32 ioapic_valid : 1; + struct IO_APIC_route_entry *ioapic_entry; + }; +#endif +#ifdef CONFIG_DMAR_TABLE + struct { + int dmar_id; + void *dmar_data; + }; +#endif +#ifdef CONFIG_HT_IRQ + struct { + int ht_pos; + int ht_idx; + struct pci_dev *ht_dev; + void *ht_update; + }; +#endif +#ifdef CONFIG_X86_UV + struct { + int uv_limit; + int uv_blade; + unsigned long uv_offset; + char *uv_name; }; #endif }; }; +struct irq_cfg { + unsigned int dest_apicid; + u8 vector; +}; + extern struct irq_cfg *irq_cfg(unsigned int irq); extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); -extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); -extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); -extern void clear_irq_vector(int irq, struct irq_cfg *cfg); extern void setup_vector_irq(int cpu); #ifdef CONFIG_SMP extern void send_cleanup_vector(struct irq_cfg *); @@ -150,10 +151,7 @@ static inline void send_cleanup_vector(struct irq_cfg *c) { } static inline void irq_complete_move(struct irq_cfg *c) { } #endif -extern int apic_retrigger_irq(struct irq_data *data); extern void apic_ack_edge(struct irq_data *data); -extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id); #else /* CONFIG_X86_LOCAL_APIC */ 
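/*
 * Illustrative sketch (not part of this patch): filling the irq_alloc_info
 * structure introduced above for one IO-APIC pin and handing it to the
 * hierarchical irqdomain core. The domain pointer and the edge/active-high
 * settings are hypothetical example values; the real callers live in the
 * io_apic and irqdomain code.
 */
static int ioapic_alloc_pin_sketch(struct irq_domain *domain,
                                   int ioapic_id, int pin, int node)
{
        struct irq_alloc_info info;

        memset(&info, 0, sizeof(info));
        info.type = X86_IRQ_ALLOC_TYPE_IOAPIC;
        info.ioapic_id = ioapic_id;
        info.ioapic_pin = pin;
        info.ioapic_node = node;
        info.ioapic_valid = 1;
        info.ioapic_trigger = 0;        /* edge triggered */
        info.ioapic_polarity = 0;       /* active high */

        /* Allocate one interrupt; the parent vector domain picks the vector. */
        return irq_domain_alloc_irqs(domain, 1, node, &info);
}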
static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} @@ -163,8 +161,7 @@ static inline void unlock_vector_lock(void) {} extern atomic_t irq_err_count; extern atomic_t irq_mis_count; -/* EISA */ -extern void eisa_set_level_irq(unsigned int irq); +extern void elcr_set_level_irq(unsigned int irq); /* SMP */ extern __visible void smp_apic_timer_interrupt(struct pt_regs *); @@ -178,7 +175,6 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void); extern __visible void smp_reschedule_interrupt(struct pt_regs *); extern __visible void smp_call_function_interrupt(struct pt_regs *); extern __visible void smp_call_function_single_interrupt(struct pt_regs *); -extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif extern char irq_entries_start[]; @@ -186,10 +182,10 @@ extern char irq_entries_start[]; #define trace_irq_entries_start irq_entries_start #endif -#define VECTOR_UNDEFINED (-1) -#define VECTOR_RETRIGGERED (-2) +#define VECTOR_UNUSED NULL +#define VECTOR_RETRIGGERED ((void *)~0UL) -typedef int vector_irq_t[NR_VECTORS]; +typedef struct irq_desc* vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); #endif /* !ASSEMBLY_ */ diff --git a/kernel/arch/x86/include/asm/i387.h b/kernel/arch/x86/include/asm/i387.h deleted file mode 100644 index 6eb6fcb83..000000000 --- a/kernel/arch/x86/include/asm/i387.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes , May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _ASM_X86_I387_H -#define _ASM_X86_I387_H - -#ifndef __ASSEMBLY__ - -#include -#include - -struct pt_regs; -struct user_i387_struct; - -extern int init_fpu(struct task_struct *child); -extern void fpu_finit(struct fpu *fpu); -extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); -extern void math_state_restore(void); - -extern bool irq_fpu_usable(void); - -/* - * Careful: __kernel_fpu_begin/end() must be called with preempt disabled - * and they don't touch the preempt state on their own. - * If you enable preemption after __kernel_fpu_begin(), preempt notifier - * should call the __kernel_fpu_end() to prevent the kernel/user FPU - * state from getting corrupted. KVM for example uses this model. - * - * All other cases use kernel_fpu_begin/end() which disable preemption - * during kernel FPU usage. - */ -extern void __kernel_fpu_begin(void); -extern void __kernel_fpu_end(void); - -static inline void kernel_fpu_begin(void) -{ - preempt_disable(); - WARN_ON_ONCE(!irq_fpu_usable()); - __kernel_fpu_begin(); -} - -static inline void kernel_fpu_end(void) -{ - __kernel_fpu_end(); - preempt_enable(); -} - -/* Must be called with preempt disabled */ -extern void kernel_fpu_disable(void); -extern void kernel_fpu_enable(void); - -/* - * Some instructions like VIA's padlock instructions generate a spurious - * DNA fault but don't modify SSE registers. And these instructions - * get used from interrupt context as well. To prevent these kernel instructions - * in interrupt context interacting wrongly with other user/kernel fpu usage, we - * should use them only in the context of irq_ts_save/restore() - */ -static inline int irq_ts_save(void) -{ - /* - * If in process context and not atomic, we can take a spurious DNA fault. 
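/*
 * Illustrative sketch (not part of this patch): the intended pairing of
 * irq_ts_save()/irq_ts_restore() around an instruction that can raise a
 * spurious DNA fault (e.g. VIA padlock), as described in the comment above
 * and kept in the new fpu/api.h. padlock_encrypt_block() is a hypothetical
 * placeholder for such an instruction sequence.
 */
static void padlock_usage_sketch(void *out, const void *in, void *key)
{
        int ts_state;

        ts_state = irq_ts_save();       /* only touches CR0.TS in atomic context */
        padlock_encrypt_block(out, in, key);
        irq_ts_restore(ts_state);       /* sets CR0.TS again if we cleared it */
}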
- * Otherwise, doing clts() in process context requires disabling preemption - * or some heavy lifting like kernel_fpu_begin() - */ - if (!in_atomic()) - return 0; - - if (read_cr0() & X86_CR0_TS) { - clts(); - return 1; - } - - return 0; -} - -static inline void irq_ts_restore(int TS_state) -{ - if (TS_state) - stts(); -} - -/* - * The question "does this thread have fpu access?" - * is slightly racy, since preemption could come in - * and revoke it immediately after the test. - * - * However, even in that very unlikely scenario, - * we can just assume we have FPU access - typically - * to save the FP state - we'll just take a #NM - * fault and get the FPU access back. - */ -static inline int user_has_fpu(void) -{ - return current->thread.fpu.has_fpu; -} - -extern void unlazy_fpu(struct task_struct *tsk); - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_I387_H */ diff --git a/kernel/arch/x86/include/asm/i8259.h b/kernel/arch/x86/include/asm/i8259.h index ccffa5375..39bcefc20 100644 --- a/kernel/arch/x86/include/asm/i8259.h +++ b/kernel/arch/x86/include/asm/i8259.h @@ -60,6 +60,7 @@ struct legacy_pic { void (*mask_all)(void); void (*restore_mask)(void); void (*init)(int auto_eoi); + int (*probe)(void); int (*irq_pending)(unsigned int irq); void (*make_irq)(unsigned int irq); }; diff --git a/kernel/arch/x86/include/asm/ia32.h b/kernel/arch/x86/include/asm/ia32.h index d0e8e0141..a9bdf5569 100644 --- a/kernel/arch/x86/include/asm/ia32.h +++ b/kernel/arch/x86/include/asm/ia32.h @@ -10,7 +10,7 @@ * 32 bit structures for IA32 support. */ -#include +#include /* signal.h */ @@ -18,16 +18,7 @@ struct ucontext_ia32 { unsigned int uc_flags; unsigned int uc_link; compat_stack_t uc_stack; - struct sigcontext_ia32 uc_mcontext; - compat_sigset_t uc_sigmask; /* mask last for extensibility */ -}; - -struct ucontext_x32 { - unsigned int uc_flags; - unsigned int uc_link; - compat_stack_t uc_stack; - unsigned int uc__pad0; /* needed for alignment */ - struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ + struct sigcontext_32 uc_mcontext; compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; diff --git a/kernel/arch/x86/include/asm/intel_pmc_ipc.h b/kernel/arch/x86/include/asm/intel_pmc_ipc.h new file mode 100644 index 000000000..cd0310e18 --- /dev/null +++ b/kernel/arch/x86/include/asm/intel_pmc_ipc.h @@ -0,0 +1,55 @@ +#ifndef _ASM_X86_INTEL_PMC_IPC_H_ +#define _ASM_X86_INTEL_PMC_IPC_H_ + +/* Commands */ +#define PMC_IPC_PMIC_ACCESS 0xFF +#define PMC_IPC_PMIC_ACCESS_READ 0x0 +#define PMC_IPC_PMIC_ACCESS_WRITE 0x1 +#define PMC_IPC_USB_PWR_CTRL 0xF0 +#define PMC_IPC_PMIC_BLACKLIST_SEL 0xEF +#define PMC_IPC_PHY_CONFIG 0xEE +#define PMC_IPC_NORTHPEAK_CTRL 0xED +#define PMC_IPC_PM_DEBUG 0xEC +#define PMC_IPC_PMC_TELEMTRY 0xEB +#define PMC_IPC_PMC_FW_MSG_CTRL 0xEA + +/* IPC return code */ +#define IPC_ERR_NONE 0 +#define IPC_ERR_CMD_NOT_SUPPORTED 1 +#define IPC_ERR_CMD_NOT_SERVICED 2 +#define IPC_ERR_UNABLE_TO_SERVICE 3 +#define IPC_ERR_CMD_INVALID 4 +#define IPC_ERR_CMD_FAILED 5 +#define IPC_ERR_EMSECURITY 6 +#define IPC_ERR_UNSIGNEDKERNEL 7 + +#if IS_ENABLED(CONFIG_INTEL_PMC_IPC) + +int intel_pmc_ipc_simple_command(int cmd, int sub); +int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen, + u32 *out, u32 outlen, u32 dptr, u32 sptr); +int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, + u32 *out, u32 outlen); + +#else + +static inline int intel_pmc_ipc_simple_command(int cmd, int sub) +{ + return -EINVAL; +} + +static inline int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, 
u8 *in, u32 inlen, + u32 *out, u32 outlen, u32 dptr, u32 sptr) +{ + return -EINVAL; +} + +static inline int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, + u32 *out, u32 outlen) +{ + return -EINVAL; +} + +#endif /*CONFIG_INTEL_PMC_IPC*/ + +#endif diff --git a/kernel/arch/x86/include/asm/io.h b/kernel/arch/x86/include/asm/io.h index 34a5b9370..de25aad07 100644 --- a/kernel/arch/x86/include/asm/io.h +++ b/kernel/arch/x86/include/asm/io.h @@ -35,11 +35,13 @@ */ #define ARCH_HAS_IOREMAP_WC +#define ARCH_HAS_IOREMAP_WT #include #include #include #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -177,6 +179,9 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * look at pci_iomap(). */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); +#define ioremap_uc ioremap_uc + extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); @@ -197,8 +202,6 @@ extern void set_iounmap_nonlazy(void); #include -#include - /* * Convert a virtual cached pointer to an uncached pointer */ @@ -320,6 +323,7 @@ extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); extern bool is_early_ioremap_ptep(pte_t *ptep); @@ -338,6 +342,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff #ifdef CONFIG_MTRR +extern int __must_check arch_phys_wc_index(int handle); +#define arch_phys_wc_index arch_phys_wc_index + extern int __must_check arch_phys_wc_add(unsigned long base, unsigned long size); extern void arch_phys_wc_del(int handle); diff --git a/kernel/arch/x86/include/asm/io_apic.h b/kernel/arch/x86/include/asm/io_apic.h index 2f91685fe..6cbf2cfb3 100644 --- a/kernel/arch/x86/include/asm/io_apic.h +++ b/kernel/arch/x86/include/asm/io_apic.h @@ -95,9 +95,22 @@ struct IR_IO_APIC_route_entry { index : 15; } __attribute__ ((packed)); -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 +struct irq_alloc_info; +struct ioapic_domain_cfg; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#define IOAPIC_MASKED 1 +#define IOAPIC_UNMASKED 0 + +#define IOAPIC_POL_HIGH 0 +#define IOAPIC_POL_LOW 1 + +#define IOAPIC_DEST_MODE_PHYSICAL 0 +#define IOAPIC_DEST_MODE_LOGICAL 1 + #define IOAPIC_MAP_ALLOC 0x1 #define IOAPIC_MAP_CHECK 0x2 @@ -110,9 +123,6 @@ extern int nr_ioapics; extern int mpc_ioapic_id(int ioapic); extern unsigned int mpc_ioapic_addr(int ioapic); -extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic); - -#define MP_MAX_IOAPIC_PIN 127 /* # of MP IRQ source entries */ extern int mp_irq_entries; @@ -120,9 +130,6 @@ extern int mp_irq_entries; /* MP IRQ source entries */ extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; -/* Older SiS APIC requires we rewrite the index register */ -extern int sis_apic_bug; - /* 1 if "noapic" boot option passed */ extern int skip_ioapic_setup; @@ -132,6 +139,8 @@ extern int noioapicquirk; /* -1 if "noapic" boot option passed */ extern int noioapicreroute; +extern u32 gsi_top; + extern unsigned long 
io_apic_irqs; #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs)) @@ -147,13 +156,6 @@ struct irq_cfg; extern void ioapic_insert_resources(void); extern int arch_early_ioapic_init(void); -extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, - unsigned int, int, - struct io_apic_irq_attr *); -extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); - -extern void native_eoi_ioapic_pin(int apic, int pin, int vector); - extern int save_ioapic_entries(void); extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); @@ -161,82 +163,32 @@ extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); -struct io_apic_irq_attr { - int ioapic; - int ioapic_pin; - int trigger; - int polarity; -}; - -enum ioapic_domain_type { - IOAPIC_DOMAIN_INVALID, - IOAPIC_DOMAIN_LEGACY, - IOAPIC_DOMAIN_STRICT, - IOAPIC_DOMAIN_DYNAMIC, -}; - -struct device_node; -struct irq_domain; -struct irq_domain_ops; - -struct ioapic_domain_cfg { - enum ioapic_domain_type type; - const struct irq_domain_ops *ops; - struct device_node *dev; -}; - -struct mp_ioapic_gsi{ - u32 gsi_base; - u32 gsi_end; -}; -extern u32 gsi_top; - extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); -extern u32 mp_pin_to_gsi(int ioapic, int pin); -extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags); +extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info); extern void mp_unmap_irq(int irq); extern int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg); extern int mp_unregister_ioapic(u32 gsi_base); extern int mp_ioapic_registered(u32 gsi_base); -extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq); -extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); -extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node); -extern void __init pre_init_apic_IRQ0(void); + +extern void ioapic_set_alloc_attr(struct irq_alloc_info *info, + int node, int trigger, int polarity); extern void mp_save_irq(struct mpc_intsrc *m); extern void disable_ioapic_support(void); -extern void __init native_io_apic_init_mappings(void); +extern void __init io_apic_init_mappings(void); extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); -extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); -extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); extern void native_disable_io_apic(void); -extern void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); -extern void intel_ir_io_apic_print_entries(unsigned int apic, unsigned int nr_entries); -extern int native_ioapic_set_affinity(struct irq_data *, - const struct cpumask *, - bool); static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { return x86_io_apic_ops.read(apic, reg); } -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - x86_io_apic_ops.write(apic, reg, value); -} -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - x86_io_apic_ops.modify(apic, reg, value); -} - -extern void io_apic_eoi(unsigned int apic, unsigned int vector); - extern void setup_IO_APIC(void); extern void enable_IO_APIC(void); extern void disable_IO_APIC(void); @@ -253,8 +205,12 @@ static inline 
int arch_early_ioapic_init(void) { return 0; } static inline void print_IO_APICs(void) {} #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } -static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; } -static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; } +static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, + struct irq_alloc_info *info) +{ + return gsi; +} + static inline void mp_unmap_irq(int irq) { } static inline int save_ioapic_entries(void) @@ -268,17 +224,11 @@ static inline int restore_ioapic_entries(void) return -ENOMEM; } -static inline void mp_save_irq(struct mpc_intsrc *m) { }; +static inline void mp_save_irq(struct mpc_intsrc *m) { } static inline void disable_ioapic_support(void) { } -#define native_io_apic_init_mappings NULL +static inline void io_apic_init_mappings(void) { } #define native_io_apic_read NULL -#define native_io_apic_write NULL -#define native_io_apic_modify NULL #define native_disable_io_apic NULL -#define native_io_apic_print_entries NULL -#define native_ioapic_set_affinity NULL -#define native_setup_ioapic_entry NULL -#define native_eoi_ioapic_pin NULL static inline void setup_IO_APIC(void) { } static inline void enable_IO_APIC(void) { } diff --git a/kernel/arch/x86/include/asm/iosf_mbi.h b/kernel/arch/x86/include/asm/iosf_mbi.h index 57995f059..b72ad0faa 100644 --- a/kernel/arch/x86/include/asm/iosf_mbi.h +++ b/kernel/arch/x86/include/asm/iosf_mbi.h @@ -52,20 +52,20 @@ /* Quark available units */ #define QRK_MBI_UNIT_HBA 0x00 -#define QRK_MBI_UNIT_HB 0x03 +#define QRK_MBI_UNIT_HB 0x03 #define QRK_MBI_UNIT_RMU 0x04 -#define QRK_MBI_UNIT_MM 0x05 +#define QRK_MBI_UNIT_MM 0x05 #define QRK_MBI_UNIT_MMESRAM 0x05 #define QRK_MBI_UNIT_SOC 0x31 /* Quark read/write opcodes */ #define QRK_MBI_HBA_READ 0x10 #define QRK_MBI_HBA_WRITE 0x11 -#define QRK_MBI_HB_READ 0x10 +#define QRK_MBI_HB_READ 0x10 #define QRK_MBI_HB_WRITE 0x11 #define QRK_MBI_RMU_READ 0x10 #define QRK_MBI_RMU_WRITE 0x11 -#define QRK_MBI_MM_READ 0x10 +#define QRK_MBI_MM_READ 0x10 #define QRK_MBI_MM_WRITE 0x11 #define QRK_MBI_MMESRAM_READ 0x12 #define QRK_MBI_MMESRAM_WRITE 0x13 diff --git a/kernel/arch/x86/include/asm/irq.h b/kernel/arch/x86/include/asm/irq.h index a80cbb88e..e7de5c9a4 100644 --- a/kernel/arch/x86/include/asm/irq.h +++ b/kernel/arch/x86/include/asm/irq.h @@ -23,16 +23,23 @@ extern void irq_ctx_init(int cpu); #define __ARCH_HAS_DO_SOFTIRQ +struct irq_desc; + #ifdef CONFIG_HOTPLUG_CPU #include extern int check_irq_vectors_for_cpu_disable(void); extern void fixup_irqs(void); -extern void irq_force_complete_move(int); +extern void irq_force_complete_move(struct irq_desc *desc); +#endif + +#ifdef CONFIG_HAVE_KVM +extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); #endif extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); -extern bool handle_irq(unsigned irq, struct pt_regs *regs); + +extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); diff --git a/kernel/arch/x86/include/asm/irq_remapping.h b/kernel/arch/x86/include/asm/irq_remapping.h index 6224d316c..a210eba27 100644 --- a/kernel/arch/x86/include/asm/irq_remapping.h +++ b/kernel/arch/x86/include/asm/irq_remapping.h @@ -22,84 +22,72 @@ #ifndef __X86_IRQ_REMAPPING_H #define __X86_IRQ_REMAPPING_H +#include +#include #include -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_chip; struct msi_msg; -struct 
pci_dev; -struct irq_cfg; +struct irq_alloc_info; + +enum irq_remap_cap { + IRQ_POSTING_CAP = 0, +}; + +struct vcpu_data { + u64 pi_desc_addr; /* Physical address of PI Descriptor */ + u32 vector; /* Guest vector of the interrupt */ +}; #ifdef CONFIG_IRQ_REMAP +extern bool irq_remapping_cap(enum irq_remap_cap cap); extern void set_irq_remapping_broken(void); extern int irq_remapping_prepare(void); extern int irq_remapping_enable(void); extern void irq_remapping_disable(void); extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); -extern int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr); -extern void free_remapped_irq(int irq); -extern void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id); -extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); extern void panic_if_irq_remap(const char *msg); -extern bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip); -void irq_remap_modify_chip_defaults(struct irq_chip *chip); +extern struct irq_domain * +irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info); +extern struct irq_domain * +irq_remapping_get_irq_domain(struct irq_alloc_info *info); + +/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ +extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent); + +/* Get parent irqdomain for interrupt remapping irqdomain */ +static inline struct irq_domain *arch_get_ir_parent_domain(void) +{ + return x86_vector_domain; +} #else /* CONFIG_IRQ_REMAP */ +static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } static inline void set_irq_remapping_broken(void) { } static inline int irq_remapping_prepare(void) { return -ENODEV; } static inline int irq_remapping_enable(void) { return -ENODEV; } static inline void irq_remapping_disable(void) { } static inline int irq_remapping_reenable(int eim) { return -ENODEV; } static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } -static inline int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr) -{ - return -ENODEV; -} -static inline void free_remapped_irq(int irq) { } -static inline void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ -} -static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) -{ - return -ENODEV; -} static inline void panic_if_irq_remap(const char *msg) { } -static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) +static inline struct irq_domain * +irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info) { + return NULL; } -static inline bool setup_remapped_irq(int irq, - struct irq_cfg *cfg, - struct irq_chip *chip) +static inline struct irq_domain * +irq_remapping_get_irq_domain(struct irq_alloc_info *info) { - return false; + return NULL; } -#endif /* CONFIG_IRQ_REMAP */ - -#define dmar_alloc_hwirq() irq_alloc_hwirq(-1) -#define dmar_free_hwirq irq_free_hwirq +#endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/kernel/arch/x86/include/asm/irq_vectors.h b/kernel/arch/x86/include/asm/irq_vectors.h index 666c89ec4..6ca9fd623 100644 --- a/kernel/arch/x86/include/asm/irq_vectors.h +++ b/kernel/arch/x86/include/asm/irq_vectors.h @@ 
-47,31 +47,12 @@ #define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR #define IA32_SYSCALL_VECTOR 0x80 -#ifdef CONFIG_X86_32 -# define SYSCALL_VECTOR 0x80 -#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. * round up to the next 16-vector boundary */ -#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15) - -#define IRQ1_VECTOR (IRQ0_VECTOR + 1) -#define IRQ2_VECTOR (IRQ0_VECTOR + 2) -#define IRQ3_VECTOR (IRQ0_VECTOR + 3) -#define IRQ4_VECTOR (IRQ0_VECTOR + 4) -#define IRQ5_VECTOR (IRQ0_VECTOR + 5) -#define IRQ6_VECTOR (IRQ0_VECTOR + 6) -#define IRQ7_VECTOR (IRQ0_VECTOR + 7) -#define IRQ8_VECTOR (IRQ0_VECTOR + 8) -#define IRQ9_VECTOR (IRQ0_VECTOR + 9) -#define IRQ10_VECTOR (IRQ0_VECTOR + 10) -#define IRQ11_VECTOR (IRQ0_VECTOR + 11) -#define IRQ12_VECTOR (IRQ0_VECTOR + 12) -#define IRQ13_VECTOR (IRQ0_VECTOR + 13) -#define IRQ14_VECTOR (IRQ0_VECTOR + 14) -#define IRQ15_VECTOR (IRQ0_VECTOR + 15) +#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq) /* * Special IRQ vectors used by the SMP architecture, 0xf0-0xff @@ -102,21 +83,23 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 -/* Vector for KVM to deliver posted interrupt IPI */ -#ifdef CONFIG_HAVE_KVM -#define POSTED_INTR_VECTOR 0xf2 -#endif - +#define POSTED_INTR_WAKEUP_VECTOR 0xf1 /* * IRQ work vector: */ #define IRQ_WORK_VECTOR 0xf6 #define UV_BAU_MESSAGE 0xf5 +#define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 +/* Vector for KVM to deliver posted interrupt IPI */ +#ifdef CONFIG_HAVE_KVM +#define POSTED_INTR_VECTOR 0xf2 +#endif + /* * Local APIC timer IRQ vector is on a different priority level, * to work around the 'lost local interrupt if more than 2 IRQ @@ -134,16 +117,6 @@ #define FPU_IRQ 13 -#define FIRST_VM86_IRQ 3 -#define LAST_VM86_IRQ 15 - -#ifndef __ASSEMBLY__ -static inline int invalid_vm86_irq(int irq) -{ - return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; -} -#endif - /* * Size the maximum number of interrupts. * @@ -155,18 +128,22 @@ static inline int invalid_vm86_irq(int irq) * static arrays. */ -#define NR_IRQS_LEGACY 16 +#define NR_IRQS_LEGACY 16 -#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) +#define CPU_VECTOR_LIMIT (64 * NR_CPUS) +#define IO_APIC_VECTOR_LIMIT (32 * MAX_IO_APICS) -#ifdef CONFIG_X86_IO_APIC -# define CPU_VECTOR_LIMIT (64 * NR_CPUS) -# define NR_IRQS \ +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI) +#define NR_IRQS \ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? 
\ (NR_VECTORS + CPU_VECTOR_LIMIT) : \ (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) -#else /* !CONFIG_X86_IO_APIC: */ -# define NR_IRQS NR_IRQS_LEGACY +#elif defined(CONFIG_X86_IO_APIC) +#define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) +#elif defined(CONFIG_PCI_MSI) +#define NR_IRQS (NR_VECTORS + CPU_VECTOR_LIMIT) +#else +#define NR_IRQS NR_IRQS_LEGACY #endif #endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/kernel/arch/x86/include/asm/irqdomain.h b/kernel/arch/x86/include/asm/irqdomain.h new file mode 100644 index 000000000..d26075b52 --- /dev/null +++ b/kernel/arch/x86/include/asm/irqdomain.h @@ -0,0 +1,63 @@ +#ifndef _ASM_IRQDOMAIN_H +#define _ASM_IRQDOMAIN_H + +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +enum { + /* Allocate contiguous CPU vectors */ + X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, +}; + +extern struct irq_domain *x86_vector_domain; + +extern void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask); +extern void copy_irq_alloc_info(struct irq_alloc_info *dst, + struct irq_alloc_info *src); +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +struct device_node; +struct irq_data; + +enum ioapic_domain_type { + IOAPIC_DOMAIN_INVALID, + IOAPIC_DOMAIN_LEGACY, + IOAPIC_DOMAIN_STRICT, + IOAPIC_DOMAIN_DYNAMIC, +}; + +struct ioapic_domain_cfg { + enum ioapic_domain_type type; + const struct irq_domain_ops *ops; + struct device_node *dev; +}; + +extern const struct irq_domain_ops mp_ioapic_irqdomain_ops; + +extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg); +extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs); +extern void mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data); +extern void mp_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data); +extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); +#endif /* CONFIG_X86_IO_APIC */ + +#ifdef CONFIG_PCI_MSI +extern void arch_init_msi_domain(struct irq_domain *domain); +#else +static inline void arch_init_msi_domain(struct irq_domain *domain) { } +#endif + +#ifdef CONFIG_HT_IRQ +extern void arch_init_htirq_domain(struct irq_domain *domain); +#else +static inline void arch_init_htirq_domain(struct irq_domain *domain) { } +#endif + +#endif diff --git a/kernel/arch/x86/include/asm/jump_label.h b/kernel/arch/x86/include/asm/jump_label.h index a4c1cf7e9..5daeca3d0 100644 --- a/kernel/arch/x86/include/asm/jump_label.h +++ b/kernel/arch/x86/include/asm/jump_label.h @@ -16,15 +16,32 @@ # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC #endif -static __always_inline bool arch_static_branch(struct static_key *key) +static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:" ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" - _ASM_PTR "1b, %l[l_yes], %c0 \n\t" + _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" ".popsection \n\t" - : : "i" (key) : : l_yes); + : : "i" (key), "i" (branch) : : l_yes); + + return false; +l_yes: + return true; +} + +static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) +{ + asm_volatile_goto("1:" + ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" + "2:\n\t" + ".pushsection __jump_table, \"aw\" \n\t" + _ASM_ALIGN "\n\t" + _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" + ".popsection \n\t" + : : "i" (key), "i" (branch) : : l_yes); + return false; l_yes: return true; diff --git 
a/kernel/arch/x86/include/asm/kasan.h b/kernel/arch/x86/include/asm/kasan.h index 74a2a8dc9..1410b567e 100644 --- a/kernel/arch/x86/include/asm/kasan.h +++ b/kernel/arch/x86/include/asm/kasan.h @@ -1,6 +1,9 @@ #ifndef _ASM_X86_KASAN_H #define _ASM_X86_KASAN_H +#include +#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) + /* * Compiler uses shadow offset assuming that addresses start * from 0. Kernel addresses don't start from 0, so shadow diff --git a/kernel/arch/x86/include/asm/kdebug.h b/kernel/arch/x86/include/asm/kdebug.h index 32ce71375..e5f5dc978 100644 --- a/kernel/arch/x86/include/asm/kdebug.h +++ b/kernel/arch/x86/include/asm/kdebug.h @@ -29,11 +29,5 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs, extern void __show_regs(struct pt_regs *regs, int all); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -#ifdef CONFIG_KEXEC -extern int in_crash_kexec; -#else -/* no crash dump is ever in progress if no crash kernel can be kexec'd */ -#define in_crash_kexec 0 -#endif #endif /* _ASM_X86_KDEBUG_H */ diff --git a/kernel/arch/x86/include/asm/kvm_emulate.h b/kernel/arch/x86/include/asm/kvm_emulate.h index 57a9d94fe..e9cd7befc 100644 --- a/kernel/arch/x86/include/asm/kvm_emulate.h +++ b/kernel/arch/x86/include/asm/kvm_emulate.h @@ -111,6 +111,16 @@ struct x86_emulate_ops { unsigned int bytes, struct x86_exception *fault); + /* + * read_phys: Read bytes of standard (non-emulated/special) memory. + * Used for descriptor reading. + * @addr: [IN ] Physical address from which to read. + * @val: [OUT] Value read from memory. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes); + /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. @@ -193,6 +203,8 @@ struct x86_emulate_ops { int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); + u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt); + void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); @@ -262,6 +274,11 @@ enum x86emul_mode { X86EMUL_MODE_PROT64, /* 64-bit (long) mode. 
*/ }; +/* These match some of the HF_* flags defined in kvm_host.h */ +#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ +#define X86EMUL_SMM_MASK (1 << 6) +#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7) + struct x86_emulate_ctxt { const struct x86_emulate_ops *ops; @@ -273,8 +290,8 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; + int emul_flags; - bool guest_mode; /* guest running a nested guest */ bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ diff --git a/kernel/arch/x86/include/asm/kvm_host.h b/kernel/arch/x86/include/asm/kvm_host.h index 41b06fca3..30cfd6429 100644 --- a/kernel/arch/x86/include/asm/kvm_host.h +++ b/kernel/arch/x86/include/asm/kvm_host.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,7 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define KVM_HALT_POLL_NS_DEFAULT 500000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS @@ -175,6 +177,8 @@ enum { */ #define KVM_APIC_PV_EOI_PENDING 1 +struct kvm_kernel_irq_routing_entry; + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -184,23 +188,12 @@ struct kvm_mmu_memory_cache { void *objects[KVM_NR_MEM_OBJS]; }; -/* - * kvm_mmu_page_role, below, is defined as: - * - * bits 0:3 - total guest paging levels (2-4, or zero for real mode) - * bits 4:7 - page table level for this shadow (1-4) - * bits 8:9 - page table quadrant for 2-level guests - * bit 16 - direct mapping of virtual to physical mapping at gfn - * used for real mode and two-dimensional paging - * bits 17:19 - common access permissions for all ptes in this shadow page - */ union kvm_mmu_page_role { unsigned word; struct { unsigned level:4; unsigned cr4_pae:1; unsigned quadrant:2; - unsigned pad_for_nice_hex_output:6; unsigned direct:1; unsigned access:3; unsigned invalid:1; @@ -208,6 +201,15 @@ union kvm_mmu_page_role { unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; + unsigned :8; + + /* + * This is left at the top of the word so that + * kvm_memslots_for_spte_role can extract it with a + * simple shift. While there is room, give it a whole + * byte so it is also faster to load it from memory. + */ + unsigned smm:8; }; }; @@ -254,6 +256,11 @@ struct kvm_pio_request { int size; }; +struct rsvd_bits_validate { + u64 rsvd_bits_mask[2][4]; + u64 bad_mt_xwr; +}; + /* * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level * 32-bit). The kvm_mmu structure abstracts the details of the current mmu @@ -291,8 +298,15 @@ struct kvm_mmu { u64 *pae_root; u64 *lm_root; - u64 rsvd_bits_mask[2][4]; - u64 bad_mt_xwr; + + /* + * check zero bits on shadow page table entries, these + * bits include not only hardware reserved bits but also + * the bits spte never used. 
+ */ + struct rsvd_bits_validate shadow_zero_check; + + struct rsvd_bits_validate guest_rsvd_check; /* * Bitmap: bit set = last pte in walk @@ -338,12 +352,34 @@ struct kvm_pmu { u64 reprogram_pmi; }; +struct kvm_pmu_ops; + enum { KVM_DEBUGREG_BP_ENABLED = 1, KVM_DEBUGREG_WONT_EXIT = 2, KVM_DEBUGREG_RELOAD = 4, }; +struct kvm_mtrr_range { + u64 base; + u64 mask; + struct list_head node; +}; + +struct kvm_mtrr { + struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR]; + mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; + u64 deftype; + + struct list_head head; +}; + +/* Hyper-V per vcpu emulation context */ +struct kvm_vcpu_hv { + u64 hv_vapic; + s64 runtime_offset; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -364,10 +400,12 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + u64 eoi_exit_bitmap[4]; unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; u64 ia32_misc_enable_msr; + u64 smbase; bool tpr_access_reporting; u64 ia32_xss; @@ -467,12 +505,14 @@ struct kvm_vcpu_arch { u32 virtual_tsc_mult; u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; + u64 tsc_scaling_ratio; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ bool nmi_injected; /* Trying to inject an NMI this entry */ + bool smi_pending; /* SMI queued after currently running handler */ - struct mtrr_state_type mtrr_state; + struct kvm_mtrr mtrr_state; u64 pat; unsigned switch_db_regs; @@ -498,8 +538,7 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; - /* fields used by HYPER-V emulation */ - u64 hv_vapic; + struct kvm_vcpu_hv hyperv; cpumask_var_t wbinvd_dirty_mask; @@ -540,6 +579,9 @@ struct kvm_vcpu_arch { struct { bool pv_unhalted; } pv; + + int pending_ioapic_eoi; + int pending_external_vector; }; struct kvm_lpage_info { @@ -570,6 +612,17 @@ struct kvm_apic_map { struct kvm_lapic *logical_map[16][16]; }; +/* Hyper-V emulation context */ +struct kvm_hv { + u64 hv_guest_os_id; + u64 hv_hypercall; + u64 hv_tsc_page; + + /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ + u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; + u64 hv_crash_ctl; +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -588,6 +641,8 @@ struct kvm_arch { bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA atomic_t noncoherent_dma_count; +#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE + atomic_t assigned_device_count; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; @@ -627,16 +682,19 @@ struct kvm_arch { /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; - /* fields used by HYPER-V emulation */ - u64 hv_guest_os_id; - u64 hv_hypercall; - u64 hv_tsc_page; + struct kvm_hv hyperv; #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif bool boot_vcpu_runs_old_kvmclock; + u32 bsp_vcpu_id; + + u64 disabled_quirks; + + bool irqchip_split; + u8 nr_reserved_ioapic_pins; }; struct kvm_vm_stat { @@ -666,6 +724,7 @@ struct kvm_vcpu_stat { u32 nmi_window_exits; u32 halt_exits; u32 halt_successful_poll; + u32 halt_attempted_poll; u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; @@ -689,12 +748,13 @@ struct msr_data { struct kvm_lapic_irq { u32 vector; - u32 delivery_mode; - u32 dest_mode; - u32 level; - u32 trig_mode; + u16 delivery_mode; + u16 dest_mode; + bool level; + u16 trig_mode; u32 shorthand; u32 dest_id; + 
bool msi_redir_hint; }; struct kvm_x86_ops { @@ -706,19 +766,20 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); + bool (*cpu_has_high_real_mode_segbase)(void); void (*cpuid_update)(struct kvm_vcpu *vcpu); /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); - void (*vcpu_reset)(struct kvm_vcpu *vcpu); + void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + void (*update_bp_intercept)(struct kvm_vcpu *vcpu); + int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); void (*get_segment)(struct kvm_vcpu *vcpu, @@ -770,10 +831,10 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*vm_has_apicv)(struct kvm *kvm); + int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); - void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); @@ -784,7 +845,7 @@ struct kvm_x86_ops { int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); bool (*invpcid_supported)(void); - void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); + void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -792,11 +853,9 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); @@ -836,6 +895,22 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); + /* pmu operations of sub-arch */ + const struct kvm_pmu_ops *pmu_ops; + + /* + * Architecture specific hooks for vCPU blocking due to + * HLT instruction. + * Returns for .pre_block(): + * - 0 means continue to block the vCPU. + * - 1 means we cannot block the vCPU since some event + * happens during this period, such as, 'ON' bit in + * posted-interrupts descriptor is set. 
+ */ + int (*pre_block)(struct kvm_vcpu *vcpu); + void (*post_block)(struct kvm_vcpu *vcpu); + int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); }; struct kvm_arch_async_pf { @@ -847,17 +922,6 @@ struct kvm_arch_async_pf { extern struct kvm_x86_ops *kvm_x86_ops; -static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, - s64 adjustment) -{ - kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false); -} - -static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) -{ - kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true); -} - int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -871,7 +935,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *memslot); + const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, @@ -882,7 +946,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); @@ -890,7 +954,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); -u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); struct kvm_irq_mask_notifier { void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); @@ -911,10 +974,12 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); /* control of guest tsc rate supported? 
*/ extern bool kvm_has_tsc_control; -/* minimum supported tsc_khz for guests */ -extern u32 kvm_min_guest_tsc_khz; /* maximum supported tsc_khz for guests */ extern u32 kvm_max_guest_tsc_khz; +/* number of bits of the fractional part of the TSC scaling ratio */ +extern u8 kvm_tsc_scaling_ratio_frac_bits; +/* maximum allowed value of TSC scaling ratio */ +extern u64 kvm_max_tsc_scaling_ratio; enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -938,7 +1003,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); +int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; @@ -967,7 +1032,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); @@ -1002,8 +1067,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -int fx_init(struct kvm_vcpu *vcpu); - void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); @@ -1112,6 +1175,14 @@ enum { #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ +#define HF_SMM_MASK (1 << 6) +#define HF_SMM_INSIDE_NMI_MASK (1 << 7) + +#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +#define KVM_ADDRESS_SPACE_NUM 2 + +#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 
1 : 0) +#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) /* * Hardware virtualization extension instructions may fault if a @@ -1146,7 +1217,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); -void kvm_vcpu_reset(struct kvm_vcpu *vcpu); +void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, unsigned long address); @@ -1154,6 +1225,9 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); +u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); + unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); @@ -1170,16 +1244,18 @@ void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); -void kvm_pmu_init(struct kvm_vcpu *vcpu); -void kvm_pmu_destroy(struct kvm_vcpu *vcpu); -void kvm_pmu_reset(struct kvm_vcpu *vcpu); -void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); -bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); -int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); -int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc); -int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); -void kvm_deliver_pmi(struct kvm_vcpu *vcpu); +int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); +int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); +bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); +bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); + +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); + +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq); + +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/kernel/arch/x86/include/asm/livepatch.h b/kernel/arch/x86/include/asm/livepatch.h index 2d29197bd..19c099afa 100644 --- a/kernel/arch/x86/include/asm/livepatch.h +++ b/kernel/arch/x86/include/asm/livepatch.h @@ -21,6 +21,7 @@ #ifndef _ASM_X86_LIVEPATCH_H #define _ASM_X86_LIVEPATCH_H +#include #include #include diff --git a/kernel/arch/x86/include/asm/math_emu.h b/kernel/arch/x86/include/asm/math_emu.h index 031f6266f..0d9b14f60 100644 --- a/kernel/arch/x86/include/asm/math_emu.h +++ b/kernel/arch/x86/include/asm/math_emu.h @@ -2,7 +2,6 @@ #define _ASM_X86_MATH_EMU_H #include -#include /* This structure matches the layout of the data saved to the stack following a device-not-present interrupt, part of it saved @@ -10,9 +9,6 @@ */ struct math_emu_info { long ___orig_eip; - union { - struct pt_regs *regs; - struct kernel_vm86_regs *vm86; - }; + struct pt_regs *regs; }; #endif /* _ASM_X86_MATH_EMU_H */ diff --git a/kernel/arch/x86/include/asm/mce.h b/kernel/arch/x86/include/asm/mce.h index 1f5a86d51..2ea4527e4 100644 --- a/kernel/arch/x86/include/asm/mce.h +++ b/kernel/arch/x86/include/asm/mce.h @@ -17,11 +17,16 @@ 
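
The smm byte that the kvm_mmu_page_role hunk earlier in this patch places at the top of the word is what kvm_memslots_for_spte_role just above keys on to choose between the two address spaces (KVM_ADDRESS_SPACE_NUM is 2). The sketch below models only that layout trick in user space: the other role fields are collapsed into a single 24-bit pad, memslots_id_for_role() is a hypothetical helper (the real macro takes the kvm pointer and calls __kvm_memslots()), and the shift check assumes GCC's usual little-endian x86 bitfield layout.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Abridged model of union kvm_mmu_page_role: smm occupies the highest
 * byte of the 32-bit word so it can be recovered with a plain shift. */
union page_role {
	uint32_t word;
	struct {
		uint32_t other : 24;	/* level, access, nxe, ... in the real type */
		uint32_t smm   : 8;	/* nonzero while the vCPU is in SMM */
	};
};

/* Hypothetical counterpart of kvm_memslots_for_spte_role(): the smm
 * byte selects memslot set 1, everything else uses set 0. */
static int memslots_id_for_role(union page_role role)
{
	return role.smm ? 1 : 0;
}

int main(void)
{
	union page_role role = { .word = 0 };

	role.smm = 1;
	/* The "simple shift" the kernel comment refers to (layout-dependent). */
	assert((role.word >> 24) == 1);
	printf("address space index: %d\n", memslots_id_for_role(role));
	return 0;
}
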
#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) #define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ #define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */ +#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */ /* MCG_STATUS register defines */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ +#define MCG_STATUS_LMCES (1ULL<<3) /* LMCE signaled */ + +/* MCG_EXT_CTL register defines */ +#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */ /* MCi_STATUS register defines */ #define MCI_STATUS_VAL (1ULL<<63) /* valid error */ @@ -104,6 +109,7 @@ struct mce_log { struct mca_config { bool dont_log_ce; bool cmci_disabled; + bool lmce_disabled; bool ignore_ce; bool disabled; bool ser; @@ -117,8 +123,27 @@ struct mca_config { }; struct mce_vendor_flags { - __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ - __reserved_0 : 63; + /* + * Indicates that overflow conditions are not fatal, when set. + */ + __u64 overflow_recov : 1, + + /* + * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and + * Recovery. It indicates support for data poisoning in HW and deferred + * error interrupts. + */ + succor : 1, + + /* + * (AMD) SMCA: This bit indicates support for Scalable MCA which expands + * the register space for each MCA bank and also increases number of + * banks. Also, to accommodate the new banks and registers, the MCA + * register space is moved to a new MSR range. + */ + smca : 1, + + __reserved_0 : 61; }; extern struct mce_vendor_flags mce_flags; @@ -134,10 +159,12 @@ extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); +void mcheck_cpu_clear(struct cpuinfo_x86 *c); void mcheck_vendor_init_severity(void); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} +static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} static inline void mcheck_vendor_init_severity(void) {} #endif @@ -164,12 +191,14 @@ DECLARE_PER_CPU(struct device *, mce_device); #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); +void mce_intel_feature_clear(struct cpuinfo_x86 *c); void cmci_clear(void); void cmci_reenable(void); void cmci_rediscover(void); void cmci_recheck(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } +static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { } static inline void cmci_clear(void) {} static inline void cmci_reenable(void) {} static inline void cmci_rediscover(void) {} @@ -223,6 +252,9 @@ void do_machine_check(struct pt_regs *, long); extern void (*mce_threshold_vector)(void); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +/* Deferred error interrupt handler */ +extern void (*deferred_error_int_vector)(void); + /* * Thermal handler */ diff --git a/kernel/arch/x86/include/asm/microcode.h b/kernel/arch/x86/include/asm/microcode.h index 2fb20d6f7..34e62b1dc 100644 --- a/kernel/arch/x86/include/asm/microcode.h +++ b/kernel/arch/x86/include/asm/microcode.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H +#include + #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = native_read_msr((msr)); \ @@ -25,7 +27,6 @@ struct cpu_signature { struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; -extern bool 
dis_ucode_ldr; struct microcode_ops { enum ucode_state (*request_microcode_user) (int cpu, @@ -53,6 +54,12 @@ struct ucode_cpu_info { }; extern struct ucode_cpu_info ucode_cpu_info[]; +#ifdef CONFIG_MICROCODE +int __init microcode_init(void); +#else +static inline int __init microcode_init(void) { return 0; }; +#endif + #ifdef CONFIG_MICROCODE_INTEL extern struct microcode_ops * __init init_intel_microcode(void); #else @@ -73,7 +80,6 @@ static inline struct microcode_ops * __init init_amd_microcode(void) static inline void __exit exit_amd_microcode(void) {} #endif -#ifdef CONFIG_MICROCODE_EARLY #define MAX_UCODE_COUNT 128 #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) @@ -148,18 +154,18 @@ static inline unsigned int x86_model(unsigned int sig) return model; } +#ifdef CONFIG_MICROCODE extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); extern int __init save_microcode_in_initrd(void); void reload_early_microcode(void); +extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); #else -static inline void __init load_ucode_bsp(void) {} -static inline void load_ucode_ap(void) {} -static inline int __init save_microcode_in_initrd(void) -{ - return 0; -} -static inline void reload_early_microcode(void) {} +static inline void __init load_ucode_bsp(void) { } +static inline void load_ucode_ap(void) { } +static inline int __init save_microcode_in_initrd(void) { return 0; } +static inline void reload_early_microcode(void) { } +static inline bool +get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif - #endif /* _ASM_X86_MICROCODE_H */ diff --git a/kernel/arch/x86/include/asm/microcode_amd.h b/kernel/arch/x86/include/asm/microcode_amd.h index af935397e..adfc847a3 100644 --- a/kernel/arch/x86/include/asm/microcode_amd.h +++ b/kernel/arch/x86/include/asm/microcode_amd.h @@ -64,16 +64,17 @@ extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, s #define PATCH_MAX_SIZE PAGE_SIZE extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; -#ifdef CONFIG_MICROCODE_AMD_EARLY -extern void __init load_ucode_amd_bsp(void); +#ifdef CONFIG_MICROCODE_AMD +extern void __init load_ucode_amd_bsp(unsigned int family); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); void reload_ucode_amd(void); #else -static inline void __init load_ucode_amd_bsp(void) {} +static inline void __init load_ucode_amd_bsp(unsigned int family) {} static inline void load_ucode_amd_ap(void) {} static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } void reload_ucode_amd(void) {} #endif +extern bool check_current_patch_level(u32 *rev, bool early); #endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/kernel/arch/x86/include/asm/microcode_intel.h b/kernel/arch/x86/include/asm/microcode_intel.h index 2b9209c46..8559b0102 100644 --- a/kernel/arch/x86/include/asm/microcode_intel.h +++ b/kernel/arch/x86/include/asm/microcode_intel.h @@ -51,22 +51,13 @@ struct extended_sigtable { (((struct microcode_intel *)mc)->hdr.datasize ? 
\ ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) -#define sigmatch(s1, s2, p1, p2) \ - (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) - #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc); +extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); extern int microcode_sanity_check(void *mc, int print_err); -extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc); - -static inline int -revision_is_newer(struct microcode_header_intel *mc_header, int rev) -{ - return (mc_header->rev <= rev) ? 0 : 1; -} +extern int find_matching_signature(void *mc, unsigned int csig, int cpf); -#ifdef CONFIG_MICROCODE_INTEL_EARLY +#ifdef CONFIG_MICROCODE_INTEL extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); extern void show_ucode_info_early(void); @@ -80,13 +71,9 @@ static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; static inline void reload_ucode_intel(void) {} #endif -#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) +#ifdef CONFIG_HOTPLUG_CPU extern int save_mc_for_early(u8 *mc); #else -static inline int save_mc_for_early(u8 *mc) -{ - return 0; -} +static inline int save_mc_for_early(u8 *mc) { return 0; } #endif - #endif /* _ASM_X86_MICROCODE_INTEL_H */ diff --git a/kernel/arch/x86/include/asm/mmu.h b/kernel/arch/x86/include/asm/mmu.h index 364d27481..55234d5e7 100644 --- a/kernel/arch/x86/include/asm/mmu.h +++ b/kernel/arch/x86/include/asm/mmu.h @@ -9,7 +9,9 @@ * we put the segment information here. */ typedef struct { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; +#endif #ifdef CONFIG_X86_64 /* True if mm supports a task running in 32 bit compatibility mode. */ diff --git a/kernel/arch/x86/include/asm/mmu_context.h b/kernel/arch/x86/include/asm/mmu_context.h index 80d67dd80..bfd9b2a35 100644 --- a/kernel/arch/x86/include/asm/mmu_context.h +++ b/kernel/arch/x86/include/asm/mmu_context.h @@ -33,6 +33,7 @@ static inline void load_mm_cr4(struct mm_struct *mm) static inline void load_mm_cr4(struct mm_struct *mm) {} #endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL /* * ldt_structs can be allocated, used, and freed, but they are never * modified while live. @@ -48,8 +49,23 @@ struct ldt_struct { int size; }; +/* + * Used for LDT copy/destruction. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); +#else /* CONFIG_MODIFY_LDT_SYSCALL */ +static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) +{ + return 0; +} +static inline void destroy_context(struct mm_struct *mm) {} +#endif + static inline void load_mm_ldt(struct mm_struct *mm) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* lockless_dereference synchronizes with smp_store_release */ @@ -73,17 +89,13 @@ static inline void load_mm_ldt(struct mm_struct *mm) set_ldt(ldt->entries, ldt->size); else clear_LDT(); +#else + clear_LDT(); +#endif DEBUG_LOCKS_WARN_ON(preemptible()); } -/* - * Used for LDT copy/destruction. 
- */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void destroy_context(struct mm_struct *mm); - - static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP @@ -104,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, #endif cpumask_set_cpu(cpu, mm_cpumask(next)); - /* Re-load page tables */ + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); /* Stop flush ipis for the previous mm */ @@ -114,6 +154,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Load per-mm CR4 state */ load_mm_cr4(next); +#ifdef CONFIG_MODIFY_LDT_SYSCALL /* * Load the LDT, if the LDT is different. * @@ -128,6 +169,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, */ if (unlikely(prev->context.ldt != next->context.ldt)) load_mm_ldt(next); +#endif } #ifdef CONFIG_SMP else { @@ -142,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, * schedule, protecting us from simultaneous changes. */ cpumask_set_cpu(cpu, mm_cpumask(next)); + /* * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. */ load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); @@ -186,6 +232,19 @@ static inline void arch_exit_mmap(struct mm_struct *mm) paravirt_arch_exit_mmap(mm); } +#ifdef CONFIG_X86_64 +static inline bool is_64bit_mm(struct mm_struct *mm) +{ + return !config_enabled(CONFIG_IA32_EMULATION) || + !(mm->context.ia32_compat == TIF_IA32); +} +#else +static inline bool is_64bit_mm(struct mm_struct *mm) +{ + return false; +} +#endif + static inline void arch_bprm_mm_init(struct mm_struct *mm, struct vm_area_struct *vma) { diff --git a/kernel/arch/x86/include/asm/mpx.h b/kernel/arch/x86/include/asm/mpx.h index a952a13d5..7a3549527 100644 --- a/kernel/arch/x86/include/asm/mpx.h +++ b/kernel/arch/x86/include/asm/mpx.h @@ -13,55 +13,50 @@ #define MPX_BNDCFG_ENABLE_FLAG 0x1 #define MPX_BD_ENTRY_VALID_FLAG 0x1 -#ifdef CONFIG_X86_64 - -/* upper 28 bits [47:20] of the virtual address in 64-bit used to - * index into bounds directory (BD). 
- */ -#define MPX_BD_ENTRY_OFFSET 28 -#define MPX_BD_ENTRY_SHIFT 3 -/* bits [19:3] of the virtual address in 64-bit used to index into - * bounds table (BT). +/* + * The upper 28 bits [47:20] of the virtual address in 64-bit + * are used to index into bounds directory (BD). + * + * The directory is 2G (2^31) in size, and with 8-byte entries + * it has 2^28 entries. */ -#define MPX_BT_ENTRY_OFFSET 17 -#define MPX_BT_ENTRY_SHIFT 5 -#define MPX_IGN_BITS 3 -#define MPX_BD_ENTRY_TAIL 3 +#define MPX_BD_SIZE_BYTES_64 (1UL<<31) +#define MPX_BD_ENTRY_BYTES_64 8 +#define MPX_BD_NR_ENTRIES_64 (MPX_BD_SIZE_BYTES_64/MPX_BD_ENTRY_BYTES_64) -#else - -#define MPX_BD_ENTRY_OFFSET 20 -#define MPX_BD_ENTRY_SHIFT 2 -#define MPX_BT_ENTRY_OFFSET 10 -#define MPX_BT_ENTRY_SHIFT 4 -#define MPX_IGN_BITS 2 -#define MPX_BD_ENTRY_TAIL 2 +/* + * The 32-bit directory is 4MB (2^22) in size, and with 4-byte + * entries it has 2^20 entries. + */ +#define MPX_BD_SIZE_BYTES_32 (1UL<<22) +#define MPX_BD_ENTRY_BYTES_32 4 +#define MPX_BD_NR_ENTRIES_32 (MPX_BD_SIZE_BYTES_32/MPX_BD_ENTRY_BYTES_32) -#endif +/* + * A 64-bit table is 4MB total in size, and an entry is + * 4 64-bit pointers in size. + */ +#define MPX_BT_SIZE_BYTES_64 (1UL<<22) +#define MPX_BT_ENTRY_BYTES_64 32 +#define MPX_BT_NR_ENTRIES_64 (MPX_BT_SIZE_BYTES_64/MPX_BT_ENTRY_BYTES_64) -#define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT)) -#define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT)) +/* + * A 32-bit table is 16kB total in size, and an entry is + * 4 32-bit pointers in size. + */ +#define MPX_BT_SIZE_BYTES_32 (1UL<<14) +#define MPX_BT_ENTRY_BYTES_32 16 +#define MPX_BT_NR_ENTRIES_32 (MPX_BT_SIZE_BYTES_32/MPX_BT_ENTRY_BYTES_32) #define MPX_BNDSTA_TAIL 2 #define MPX_BNDCFG_TAIL 12 #define MPX_BNDSTA_ADDR_MASK (~((1UL<>(MPX_BT_ENTRY_OFFSET+ \ - MPX_IGN_BITS)) & MPX_BD_ENTRY_MASK) << MPX_BD_ENTRY_SHIFT) -#define MPX_GET_BT_ENTRY_OFFSET(addr) ((((addr)>>MPX_IGN_BITS) & \ - MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT) - #ifdef CONFIG_X86_INTEL_MPX -siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xsave_struct *xsave_buf); -int mpx_handle_bd_fault(struct xsave_struct *xsave_buf); +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs); +int mpx_handle_bd_fault(void); static inline int kernel_managing_mpx_tables(struct mm_struct *mm) { return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); @@ -77,12 +72,11 @@ static inline void mpx_mm_init(struct mm_struct *mm) void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); #else -static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, - struct xsave_struct *xsave_buf) +static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { return NULL; } -static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +static inline int mpx_handle_bd_fault(void) { return -EINVAL; } diff --git a/kernel/arch/x86/include/asm/mshyperv.h b/kernel/arch/x86/include/asm/mshyperv.h index c163215ab..aaf59b7da 100644 --- a/kernel/arch/x86/include/asm/mshyperv.h +++ b/kernel/arch/x86/include/asm/mshyperv.h @@ -7,6 +7,7 @@ struct ms_hyperv_info { u32 features; + u32 misc_features; u32 hints; }; @@ -20,4 +21,8 @@ void hyperv_vector_handler(struct pt_regs *regs); void hv_setup_vmbus_irq(void (*handler)(void)); void hv_remove_vmbus_irq(void); +void hv_setup_kexec_handler(void (*handler)(void)); +void hv_remove_kexec_handler(void); +void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); +void 
hv_remove_crash_handler(void); #endif diff --git a/kernel/arch/x86/include/asm/msi.h b/kernel/arch/x86/include/asm/msi.h new file mode 100644 index 000000000..93724cc62 --- /dev/null +++ b/kernel/arch/x86/include/asm/msi.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_MSI_H +#define _ASM_X86_MSI_H +#include + +typedef struct irq_alloc_info msi_alloc_info_t; + +#endif /* _ASM_X86_MSI_H */ diff --git a/kernel/arch/x86/include/asm/msr-index.h b/kernel/arch/x86/include/asm/msr-index.h new file mode 100644 index 000000000..690b4027e --- /dev/null +++ b/kernel/arch/x86/include/asm/msr-index.h @@ -0,0 +1,694 @@ +#ifndef _ASM_X86_MSR_INDEX_H +#define _ASM_X86_MSR_INDEX_H + +/* CPU model specific register (MSR) numbers */ + +/* x86-64 specific MSRs */ +#define MSR_EFER 0xc0000080 /* extended feature register */ +#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */ +#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */ +#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */ +#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */ +#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ +#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ +#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ +#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */ + +/* EFER bits: */ +#define _EFER_SCE 0 /* SYSCALL/SYSRET */ +#define _EFER_LME 8 /* Long mode enable */ +#define _EFER_LMA 10 /* Long mode active (read-only) */ +#define _EFER_NX 11 /* No execute enable */ +#define _EFER_SVME 12 /* Enable virtualization */ +#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ +#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ + +#define EFER_SCE (1<<_EFER_SCE) +#define EFER_LME (1<<_EFER_LME) +#define EFER_LMA (1<<_EFER_LMA) +#define EFER_NX (1<<_EFER_NX) +#define EFER_SVME (1<<_EFER_SVME) +#define EFER_LMSLE (1<<_EFER_LMSLE) +#define EFER_FFXSR (1<<_EFER_FFXSR) + +/* Intel MSRs. Some also available on other CPUs */ +#define MSR_IA32_PERFCTR0 0x000000c1 +#define MSR_IA32_PERFCTR1 0x000000c2 +#define MSR_FSB_FREQ 0x000000cd +#define MSR_PLATFORM_INFO 0x000000ce + +#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 +#define NHM_C3_AUTO_DEMOTE (1UL << 25) +#define NHM_C1_AUTO_DEMOTE (1UL << 26) +#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25) +#define SNB_C1_AUTO_UNDEMOTE (1UL << 27) +#define SNB_C3_AUTO_UNDEMOTE (1UL << 28) + +#define MSR_MTRRcap 0x000000fe +#define MSR_IA32_BBL_CR_CTL 0x00000119 +#define MSR_IA32_BBL_CR_CTL3 0x0000011e + +#define MSR_IA32_SYSENTER_CS 0x00000174 +#define MSR_IA32_SYSENTER_ESP 0x00000175 +#define MSR_IA32_SYSENTER_EIP 0x00000176 + +#define MSR_IA32_MCG_CAP 0x00000179 +#define MSR_IA32_MCG_STATUS 0x0000017a +#define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_IA32_MCG_EXT_CTL 0x000004d0 + +#define MSR_OFFCORE_RSP_0 0x000001a6 +#define MSR_OFFCORE_RSP_1 0x000001a7 +#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad +#define MSR_IVT_TURBO_RATIO_LIMIT 0x000001ae +#define MSR_TURBO_RATIO_LIMIT 0x000001ad +#define MSR_TURBO_RATIO_LIMIT1 0x000001ae +#define MSR_TURBO_RATIO_LIMIT2 0x000001af + +#define MSR_LBR_SELECT 0x000001c8 +#define MSR_LBR_TOS 0x000001c9 +#define MSR_LBR_NHM_FROM 0x00000680 +#define MSR_LBR_NHM_TO 0x000006c0 +#define MSR_LBR_CORE_FROM 0x00000040 +#define MSR_LBR_CORE_TO 0x00000060 + +#define MSR_LBR_INFO_0 0x00000dc0 /* ... 
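As an aside, a minimal sketch of how the EFER constants introduced here are typically consumed (hypothetical helper; rdmsrl() is the 64-bit read wrapper from msr.h further down in this patch):

static inline int nx_is_enabled_sketch(void)
{
	u64 efer;

	rdmsrl(MSR_EFER, efer);		/* read IA32_EFER */
	return !!(efer & EFER_NX);	/* No-Execute paging enabled? */
}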
0xddf for _31 */ +#define LBR_INFO_MISPRED BIT_ULL(63) +#define LBR_INFO_IN_TX BIT_ULL(62) +#define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYCLES 0xffff + +#define MSR_IA32_PEBS_ENABLE 0x000003f1 +#define MSR_IA32_DS_AREA 0x00000600 +#define MSR_IA32_PERF_CAPABILITIES 0x00000345 +#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 + +#define MSR_IA32_RTIT_CTL 0x00000570 +#define RTIT_CTL_TRACEEN BIT(0) +#define RTIT_CTL_CYCLEACC BIT(1) +#define RTIT_CTL_OS BIT(2) +#define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_CR3EN BIT(7) +#define RTIT_CTL_TOPA BIT(8) +#define RTIT_CTL_MTC_EN BIT(9) +#define RTIT_CTL_TSC_EN BIT(10) +#define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_BRANCH_EN BIT(13) +#define RTIT_CTL_MTC_RANGE_OFFSET 14 +#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) +#define RTIT_CTL_CYC_THRESH_OFFSET 19 +#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET) +#define RTIT_CTL_PSB_FREQ_OFFSET 24 +#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET) +#define MSR_IA32_RTIT_STATUS 0x00000571 +#define RTIT_STATUS_CONTEXTEN BIT(1) +#define RTIT_STATUS_TRIGGEREN BIT(2) +#define RTIT_STATUS_ERROR BIT(4) +#define RTIT_STATUS_STOPPED BIT(5) +#define MSR_IA32_RTIT_CR3_MATCH 0x00000572 +#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560 +#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561 + +#define MSR_MTRRfix64K_00000 0x00000250 +#define MSR_MTRRfix16K_80000 0x00000258 +#define MSR_MTRRfix16K_A0000 0x00000259 +#define MSR_MTRRfix4K_C0000 0x00000268 +#define MSR_MTRRfix4K_C8000 0x00000269 +#define MSR_MTRRfix4K_D0000 0x0000026a +#define MSR_MTRRfix4K_D8000 0x0000026b +#define MSR_MTRRfix4K_E0000 0x0000026c +#define MSR_MTRRfix4K_E8000 0x0000026d +#define MSR_MTRRfix4K_F0000 0x0000026e +#define MSR_MTRRfix4K_F8000 0x0000026f +#define MSR_MTRRdefType 0x000002ff + +#define MSR_IA32_CR_PAT 0x00000277 + +#define MSR_IA32_DEBUGCTLMSR 0x000001d9 +#define MSR_IA32_LASTBRANCHFROMIP 0x000001db +#define MSR_IA32_LASTBRANCHTOIP 0x000001dc +#define MSR_IA32_LASTINTFROMIP 0x000001dd +#define MSR_IA32_LASTINTTOIP 0x000001de + +/* DEBUGCTLMSR bits (others vary by model): */ +#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ +#define DEBUGCTLMSR_TR (1UL << 6) +#define DEBUGCTLMSR_BTS (1UL << 7) +#define DEBUGCTLMSR_BTINT (1UL << 8) +#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9) +#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) +#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) + +#define MSR_PEBS_FRONTEND 0x000003f7 + +#define MSR_IA32_POWER_CTL 0x000001fc + +#define MSR_IA32_MC0_CTL 0x00000400 +#define MSR_IA32_MC0_STATUS 0x00000401 +#define MSR_IA32_MC0_ADDR 0x00000402 +#define MSR_IA32_MC0_MISC 0x00000403 + +/* C-state Residency Counters */ +#define MSR_PKG_C3_RESIDENCY 0x000003f8 +#define MSR_PKG_C6_RESIDENCY 0x000003f9 +#define MSR_PKG_C7_RESIDENCY 0x000003fa +#define MSR_CORE_C3_RESIDENCY 0x000003fc +#define MSR_CORE_C6_RESIDENCY 0x000003fd +#define MSR_CORE_C7_RESIDENCY 0x000003fe +#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff +#define MSR_PKG_C2_RESIDENCY 0x0000060d +#define MSR_PKG_C8_RESIDENCY 0x00000630 +#define MSR_PKG_C9_RESIDENCY 0x00000631 +#define MSR_PKG_C10_RESIDENCY 0x00000632 + +/* Run Time Average Power Limiting (RAPL) Interface */ + +#define MSR_RAPL_POWER_UNIT 0x00000606 + +#define MSR_PKG_POWER_LIMIT 0x00000610 +#define MSR_PKG_ENERGY_STATUS 0x00000611 +#define MSR_PKG_PERF_STATUS 0x00000613 +#define MSR_PKG_POWER_INFO 0x00000614 + +#define MSR_DRAM_POWER_LIMIT 0x00000618 +#define 
MSR_DRAM_ENERGY_STATUS 0x00000619 +#define MSR_DRAM_PERF_STATUS 0x0000061b +#define MSR_DRAM_POWER_INFO 0x0000061c + +#define MSR_PP0_POWER_LIMIT 0x00000638 +#define MSR_PP0_ENERGY_STATUS 0x00000639 +#define MSR_PP0_POLICY 0x0000063a +#define MSR_PP0_PERF_STATUS 0x0000063b + +#define MSR_PP1_POWER_LIMIT 0x00000640 +#define MSR_PP1_ENERGY_STATUS 0x00000641 +#define MSR_PP1_POLICY 0x00000642 + +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL_1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + +#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 +#define MSR_PKG_ANY_CORE_C0_RES 0x00000659 +#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A +#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B + +#define MSR_CORE_C1_RES 0x00000660 + +#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 +#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669 + +#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690 +#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 +#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 + +/* Config TDP MSRs */ +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + +/* Hardware P state interface */ +#define MSR_PPERF 0x0000064e +#define MSR_PERF_LIMIT_REASONS 0x0000064f +#define MSR_PM_ENABLE 0x00000770 +#define MSR_HWP_CAPABILITIES 0x00000771 +#define MSR_HWP_REQUEST_PKG 0x00000772 +#define MSR_HWP_INTERRUPT 0x00000773 +#define MSR_HWP_REQUEST 0x00000774 +#define MSR_HWP_STATUS 0x00000777 + +/* CPUID.6.EAX */ +#define HWP_BASE_BIT (1<<7) +#define HWP_NOTIFICATIONS_BIT (1<<8) +#define HWP_ACTIVITY_WINDOW_BIT (1<<9) +#define HWP_ENERGY_PERF_PREFERENCE_BIT (1<<10) +#define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11) + +/* IA32_HWP_CAPABILITIES */ +#define HWP_HIGHEST_PERF(x) (x & 0xff) +#define HWP_GUARANTEED_PERF(x) ((x & (0xff << 8)) >>8) +#define HWP_MOSTEFFICIENT_PERF(x) ((x & (0xff << 16)) >>16) +#define HWP_LOWEST_PERF(x) ((x & (0xff << 24)) >>24) + +/* IA32_HWP_REQUEST */ +#define HWP_MIN_PERF(x) (x & 0xff) +#define HWP_MAX_PERF(x) ((x & 0xff) << 8) +#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) +#define HWP_ENERGY_PERF_PREFERENCE(x) ((x & 0xff) << 24) +#define HWP_ACTIVITY_WINDOW(x) ((x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((x & 0x1) << 42) + +/* IA32_HWP_STATUS */ +#define HWP_GUARANTEED_CHANGE(x) (x & 0x1) +#define HWP_EXCURSION_TO_MINIMUM(x) (x & 0x4) + +/* IA32_HWP_INTERRUPT */ +#define HWP_CHANGE_TO_GUARANTEED_INT(x) (x & 0x1) +#define HWP_EXCURSION_TO_MINIMUM_INT(x) (x & 0x2) + +#define MSR_AMD64_MC0_MASK 0xc0010044 + +#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) +#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) +#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) +#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) + +#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) + +/* These are consecutive and not in the normal 4er MCE bank block */ +#define MSR_IA32_MC0_CTL2 0x00000280 +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) + +#define MSR_P6_PERFCTR0 0x000000c1 +#define MSR_P6_PERFCTR1 0x000000c2 +#define MSR_P6_EVNTSEL0 0x00000186 +#define MSR_P6_EVNTSEL1 0x00000187 + +#define MSR_KNC_PERFCTR0 0x00000020 +#define MSR_KNC_PERFCTR1 0x00000021 +#define MSR_KNC_EVNTSEL0 0x00000028 +#define MSR_KNC_EVNTSEL1 0x00000029 + +/* Alternative perfctr range with full access. 
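A short sketch, not part of the patch, of how the IA32_HWP_REQUEST field macros above compose into a single 64-bit MSR value; the performance levels chosen here are made up:

static inline u64 hwp_request_value_sketch(void)
{
	return HWP_MIN_PERF(8)      |	/* lowest acceptable perf level  */
	       HWP_MAX_PERF(40)     |	/* highest allowed perf level    */
	       HWP_DESIRED_PERF(0)  |	/* 0 = let the hardware choose   */
	       HWP_ENERGY_PERF_PREFERENCE(0x80);  /* balanced power/perf */
}

The resulting value would then be written with a 64-bit MSR write to MSR_HWP_REQUEST.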
*/ +#define MSR_IA32_PMC0 0x000004c1 + +/* AMD64 MSRs. Not complete. See the architecture manual for a more + complete list. */ + +#define MSR_AMD64_PATCH_LEVEL 0x0000008b +#define MSR_AMD64_TSC_RATIO 0xc0000104 +#define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 +#define MSR_AMD64_OSVW_STATUS 0xc0010141 +#define MSR_AMD64_LS_CFG 0xc0011020 +#define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_BU_CFG2 0xc001102a +#define MSR_AMD64_IBSFETCHCTL 0xc0011030 +#define MSR_AMD64_IBSFETCHLINAD 0xc0011031 +#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 +#define MSR_AMD64_IBSFETCH_REG_COUNT 3 +#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL< +#include "msr-index.h" #ifndef __ASSEMBLY__ #include #include #include +#include struct msr { union { @@ -46,14 +47,13 @@ static inline unsigned long long native_read_tscp(unsigned int *aux) * it means rax *or* rdx. */ #ifdef CONFIG_X86_64 -#define DECLARE_ARGS(val, low, high) unsigned low, high -#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32)) -#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high) +/* Using 64-bit values saves one instruction clearing the high half of low */ +#define DECLARE_ARGS(val, low, high) unsigned long low, high +#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) #define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) #else #define DECLARE_ARGS(val, low, high) unsigned long long val #define EAX_EDX_VAL(val, low, high) (val) -#define EAX_EDX_ARGS(val, low, high) "A" (val) #define EAX_EDX_RET(val, low, high) "=A" (val) #endif @@ -105,12 +105,19 @@ notrace static inline int native_write_msr_safe(unsigned int msr, return err; } -extern unsigned long long native_read_tsc(void); - extern int rdmsr_safe_regs(u32 regs[8]); extern int wrmsr_safe_regs(u32 regs[8]); -static __always_inline unsigned long long __native_read_tsc(void) +/** + * rdtsc() - returns the current TSC without ordering constraints + * + * rdtsc() returns the result of RDTSC as a 64-bit integer. The + * only ordering constraint it supplies is the ordering implied by + * "asm volatile": it will put the RDTSC in the place you expect. The + * CPU can and will speculatively execute that RDTSC, though, so the + * results can be non-monotonic if compared on different CPUs. + */ +static __always_inline unsigned long long rdtsc(void) { DECLARE_ARGS(val, low, high); @@ -119,6 +126,35 @@ static __always_inline unsigned long long __native_read_tsc(void) return EAX_EDX_VAL(val, low, high); } +/** + * rdtsc_ordered() - read the current TSC in program order + * + * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. + * It is ordered like a load to a global in-memory counter. It should + * be impossible to observe non-monotonic rdtsc_unordered() behavior + * across multiple CPUs as long as the TSC is synced. + */ +static __always_inline unsigned long long rdtsc_ordered(void) +{ + /* + * The RDTSC instruction is not ordered relative to memory + * access. The Intel SDM and the AMD APM are both vague on this + * point, but empirically an RDTSC instruction can be + * speculatively executed before prior loads. An RDTSC + * immediately after an appropriate barrier appears to be + * ordered as a normal load, that is, it provides the same + * ordering guarantees as reading from a global memory location + * that some other imaginary CPU is updating continuously with a + * time stamp. 
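For comparison, a user-space analogue of the ordered TSC read described above, assuming GCC/Clang x86 intrinsics and SSE2 (illustrative sketch only):

#include <x86intrin.h>		/* __rdtsc(), _mm_lfence() */

static unsigned long long rdtsc_ordered_user(void)
{
	/* LFENCE keeps RDTSC from being speculated ahead of earlier
	 * loads, giving it the load-like ordering discussed above. */
	_mm_lfence();
	return __rdtsc();
}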
+ */ + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); + return rdtsc(); +} + +/* Deprecated, keep it for a cycle for easier merging: */ +#define rdtscll(now) do { (now) = rdtsc_ordered(); } while (0) + static inline unsigned long long native_read_pmc(int counter) { DECLARE_ARGS(val, low, high); @@ -152,8 +188,10 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high) #define rdmsrl(msr, val) \ ((val) = native_read_msr((msr))) -#define wrmsrl(msr, val) \ - native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32)) +static inline void wrmsrl(unsigned msr, u64 val) +{ + native_write_msr(msr, (u32)val, (u32)(val >> 32)); +} /* wrmsr with exception handling */ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) @@ -179,12 +217,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -#define rdtscl(low) \ - ((low) = (u32)__native_read_tsc()) - -#define rdtscll(val) \ - ((val) = __native_read_tsc()) - #define rdpmc(counter, low, high) \ do { \ u64 _l = native_read_pmc((counter)); \ @@ -194,19 +226,15 @@ do { \ #define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) -#define rdtscp(low, high, aux) \ -do { \ - unsigned long long _val = native_read_tscp(&(aux)); \ - (low) = (u32)_val; \ - (high) = (u32)(_val >> 32); \ -} while (0) - -#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux)) - #endif /* !CONFIG_PARAVIRT */ -#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \ - (u32)((val) >> 32)) +/* + * 64-bit version of wrmsr_safe(): + */ +static inline int wrmsrl_safe(u32 msr, u64 val) +{ + return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); +} #define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high)) diff --git a/kernel/arch/x86/include/asm/mtrr.h b/kernel/arch/x86/include/asm/mtrr.h index f768f6298..b94f6f64e 100644 --- a/kernel/arch/x86/include/asm/mtrr.h +++ b/kernel/arch/x86/include/asm/mtrr.h @@ -31,7 +31,7 @@ * arch_phys_wc_add and arch_phys_wc_del. 
*/ # ifdef CONFIG_MTRR -extern u8 mtrr_type_lookup(u64 addr, u64 end); +extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); extern int mtrr_add(unsigned long base, unsigned long size, @@ -48,14 +48,13 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); -extern int phys_wc_to_mtrr_index(int handle); # else -static inline u8 mtrr_type_lookup(u64 addr, u64 end) +static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { /* * Return no-MTRRs: */ - return 0xff; + return MTRR_TYPE_INVALID; } #define mtrr_save_fixed_ranges(arg) do {} while (0) #define mtrr_save_state() do {} while (0) @@ -84,10 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } -static inline int phys_wc_to_mtrr_index(int handle) -{ - return -1; -} #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) @@ -127,4 +122,8 @@ struct mtrr_gentry32 { _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32) #endif /* CONFIG_COMPAT */ +/* Bit fields for enabled in struct mtrr_state_type */ +#define MTRR_STATE_MTRR_FIXED_ENABLED 0x01 +#define MTRR_STATE_MTRR_ENABLED 0x02 + #endif /* _ASM_X86_MTRR_H */ diff --git a/kernel/arch/x86/include/asm/mwait.h b/kernel/arch/x86/include/asm/mwait.h index 653dfa766..c70689b5e 100644 --- a/kernel/arch/x86/include/asm/mwait.h +++ b/kernel/arch/x86/include/asm/mwait.h @@ -14,6 +14,9 @@ #define CPUID5_ECX_INTERRUPT_BREAK 0x2 #define MWAIT_ECX_INTERRUPT_BREAK 0x1 +#define MWAITX_ECX_TIMER_ENABLE BIT(1) +#define MWAITX_MAX_LOOPS ((u32)-1) +#define MWAITX_DISABLE_CSTATES 0xf static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) @@ -23,6 +26,14 @@ static inline void __monitor(const void *eax, unsigned long ecx, :: "a" (eax), "c" (ecx), "d"(edx)); } +static inline void __monitorx(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitorx %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xfa;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + static inline void __mwait(unsigned long eax, unsigned long ecx) { /* "mwait %eax, %ecx;" */ @@ -30,6 +41,40 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) :: "a" (eax), "c" (ecx)); } +/* + * MWAITX allows for a timer expiration to get the core out a wait state in + * addition to the default MWAIT exit condition of a store appearing at a + * monitored virtual address. + * + * Registers: + * + * MWAITX ECX[1]: enable timer if set + * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0 + * frequency is the same as the TSC frequency. 
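A sketch of how the MWAITX timer form is meant to be used with the __mwaitx() helper defined just below (illustrative only; 'cycles' is a hypothetical caller-supplied bound in P0/TSC clocks):

static inline void mwaitx_timed_wait_sketch(const void *monitor_addr, u32 cycles)
{
	__monitorx(monitor_addr, 0, 0);		/* arm the monitor */
	/* EBX = max wait in P0 clocks, ECX[1] = enable the timer */
	__mwaitx(MWAITX_DISABLE_CSTATES, cycles, MWAITX_ECX_TIMER_ENABLE);
}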
+ * + * Below is a comparison between MWAIT and MWAITX on AMD processors: + * + * MWAIT MWAITX + * opcode 0f 01 c9 | 0f 01 fb + * ECX[0] value of RFLAGS.IF seen by instruction + * ECX[1] unused/#GP if set | enable timer if set + * ECX[31:2] unused/#GP if set + * EAX unused (reserve for hint) + * EBX[31:0] unused | max wait time (P0 clocks) + * + * MONITOR MONITORX + * opcode 0f 01 c8 | 0f 01 fa + * EAX (logical) address to monitor + * ECX #GP if not zero + */ +static inline void __mwaitx(unsigned long eax, unsigned long ebx, + unsigned long ecx) +{ + /* "mwaitx %eax, %ebx, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xfb;" + :: "a" (eax), "b" (ebx), "c" (ecx)); +} + static inline void __sti_mwait(unsigned long eax, unsigned long ecx) { trace_hardirqs_on(); diff --git a/kernel/arch/x86/include/asm/numachip/numachip.h b/kernel/arch/x86/include/asm/numachip/numachip.h index 1c6f7f621..c64373a2d 100644 --- a/kernel/arch/x86/include/asm/numachip/numachip.h +++ b/kernel/arch/x86/include/asm/numachip/numachip.h @@ -14,6 +14,7 @@ #ifndef _ASM_X86_NUMACHIP_NUMACHIP_H #define _ASM_X86_NUMACHIP_NUMACHIP_H +extern u8 numachip_system; extern int __init pci_numachip_init(void); #endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */ diff --git a/kernel/arch/x86/include/asm/numachip/numachip_csr.h b/kernel/arch/x86/include/asm/numachip/numachip_csr.h index 660f843df..29719eecd 100644 --- a/kernel/arch/x86/include/asm/numachip/numachip_csr.h +++ b/kernel/arch/x86/include/asm/numachip/numachip_csr.h @@ -14,12 +14,8 @@ #ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H #define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H -#include -#include +#include #include -#include -#include -#include #define CSR_NODE_SHIFT 16 #define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT) @@ -27,11 +23,8 @@ /* 32K CSR space, b15 indicates geo/non-geo */ #define CSR_OFFSET_MASK 0x7fffUL - -/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */ -#define NUMACHIP_GCSR_BASE 0x3fff00000000ULL -#define NUMACHIP_GCSR_LIM 0x3fff0fffffffULL -#define NUMACHIP_GCSR_SIZE (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1) +#define CSR_G0_NODE_IDS (0x008 + (0 << 12)) +#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12)) /* * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however @@ -41,12 +34,7 @@ #define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL #define NUMACHIP_LCSR_LIM 0x3fffffffffffULL #define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1) - -static inline void *gcsr_address(int node, unsigned long offset) -{ - return __va(NUMACHIP_GCSR_BASE | (1UL << 15) | - CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK)); -} +#define NUMACHIP_LAPIC_BITS 8 static inline void *lcsr_address(unsigned long offset) { @@ -54,114 +42,57 @@ static inline void *lcsr_address(unsigned long offset) CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK)); } -static inline unsigned int read_gcsr(int node, unsigned long offset) +static inline unsigned int read_lcsr(unsigned long offset) { - return swab32(readl(gcsr_address(node, offset))); + return swab32(readl(lcsr_address(offset))); } -static inline void write_gcsr(int node, unsigned long offset, unsigned int val) +static inline void write_lcsr(unsigned long offset, unsigned int val) { - writel(swab32(val), gcsr_address(node, offset)); + writel(swab32(val), lcsr_address(offset)); } -static inline unsigned int read_lcsr(unsigned long offset) +/* + * On NumaChip2, local CSR space is 16MB and starts at fixed offset below 4G + */ + +#define NUMACHIP2_LCSR_BASE 0xf0000000UL +#define 
NUMACHIP2_LCSR_SIZE 0x1000000UL +#define NUMACHIP2_APIC_ICR 0x100000 +#define NUMACHIP2_TIMER_DEADLINE 0x200000 +#define NUMACHIP2_TIMER_INT 0x200008 +#define NUMACHIP2_TIMER_NOW 0x200018 +#define NUMACHIP2_TIMER_RESET 0x200020 + +static inline void __iomem *numachip2_lcsr_address(unsigned long offset) { - return swab32(readl(lcsr_address(offset))); + return (void __iomem *)__va(NUMACHIP2_LCSR_BASE | + (offset & (NUMACHIP2_LCSR_SIZE - 1))); } -static inline void write_lcsr(unsigned long offset, unsigned int val) +static inline u32 numachip2_read32_lcsr(unsigned long offset) { - writel(swab32(val), lcsr_address(offset)); + return readl(numachip2_lcsr_address(offset)); } -/* ========================================================================= */ -/* CSR_G0_STATE_CLEAR */ -/* ========================================================================= */ - -#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12)) -union numachip_csr_g0_state_clear { - unsigned int v; - struct numachip_csr_g0_state_clear_s { - unsigned int _state:2; - unsigned int _rsvd_2_6:5; - unsigned int _lost:1; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G0_NODE_IDS */ -/* ========================================================================= */ +static inline u64 numachip2_read64_lcsr(unsigned long offset) +{ + return readq(numachip2_lcsr_address(offset)); +} -#define CSR_G0_NODE_IDS (0x008 + (0 << 12)) -union numachip_csr_g0_node_ids { - unsigned int v; - struct numachip_csr_g0_node_ids_s { - unsigned int _initialid:16; - unsigned int _nodeid:12; - unsigned int _rsvd_28_31:4; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_GEN */ -/* ========================================================================= */ +static inline void numachip2_write32_lcsr(unsigned long offset, u32 val) +{ + writel(val, numachip2_lcsr_address(offset)); +} -#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12)) -union numachip_csr_g3_ext_irq_gen { - unsigned int v; - struct numachip_csr_g3_ext_irq_gen_s { - unsigned int _vector:8; - unsigned int _msgtype:3; - unsigned int _index:5; - unsigned int _destination_apic_id:16; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_STATUS */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12)) -union numachip_csr_g3_ext_irq_status { - unsigned int v; - struct numachip_csr_g3_ext_irq_status_s { - unsigned int _result:32; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_EXT_IRQ_DEST */ -/* ========================================================================= */ - -#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12)) -union numachip_csr_g3_ext_irq_dest { - unsigned int v; - struct numachip_csr_g3_ext_irq_dest_s { - unsigned int _irq:8; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* ========================================================================= */ -/* CSR_G3_NC_ATT_MAP_SELECT */ -/* ========================================================================= */ - -#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12)) -union numachip_csr_g3_nc_att_map_select { - unsigned int v; - struct numachip_csr_g3_nc_att_map_select_s { - unsigned int _upper_address_bits:4; - unsigned int _select_ram:4; - unsigned int _rsvd_8_31:24; - } s; -}; - -/* 
========================================================================= */ -/* CSR_G3_NC_ATT_MAP_SELECT_0-255 */ -/* ========================================================================= */ - -#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12)) +static inline void numachip2_write64_lcsr(unsigned long offset, u64 val) +{ + writeq(val, numachip2_lcsr_address(offset)); +} -#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ +static inline unsigned int numachip2_timer(void) +{ + return (smp_processor_id() % 48) << 6; +} +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ diff --git a/kernel/arch/x86/include/asm/page_64_types.h b/kernel/arch/x86/include/asm/page_64_types.h index 4edd53b79..4928cf0d5 100644 --- a/kernel/arch/x86/include/asm/page_64_types.h +++ b/kernel/arch/x86/include/asm/page_64_types.h @@ -26,9 +26,6 @@ #define MCE_STACK 4 #define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ -#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) -#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) - /* * Set __PAGE_OFFSET to the most negative possible address + * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a diff --git a/kernel/arch/x86/include/asm/page_types.h b/kernel/arch/x86/include/asm/page_types.h index c7c712f26..cc071c6f7 100644 --- a/kernel/arch/x86/include/asm/page_types.h +++ b/kernel/arch/x86/include/asm/page_types.h @@ -9,16 +9,21 @@ #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) + +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) + #define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) -/* Cast PAGE_MASK to a signed type so that it is sign-extended if +/* Cast *PAGE_MASK to a signed type so that it is sign-extended if virtual addresses are 32-bits but physical addresses are larger (ie, 32-bit PAE). 
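The sign-extension point made in the comment above is easy to see in a standalone user-space illustration (not kernel code; the constants are hard-coded for a 4kB page):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t phys = 0x123456789000ULL;	/* physical address above 4GB   */
	uint32_t page_mask = ~0xfffU;		/* 32-bit PAGE_MASK, 0xfffff000 */

	/* zero-extended mask: the high physical address bits are lost */
	uint64_t wrong = phys & page_mask;
	/* sign-extended mask (what PHYSICAL_PAGE_MASK does): bits kept */
	uint64_t right = phys & (uint64_t)(int64_t)(int32_t)page_mask;

	printf("wrong=%#llx right=%#llx\n",
	       (unsigned long long)wrong, (unsigned long long)right);
	return 0;
}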
*/ #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) - -#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) -#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) +#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK) #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) diff --git a/kernel/arch/x86/include/asm/paravirt.h b/kernel/arch/x86/include/asm/paravirt.h index 8957810ad..c759b3cca 100644 --- a/kernel/arch/x86/include/asm/paravirt.h +++ b/kernel/arch/x86/include/asm/paravirt.h @@ -19,6 +19,12 @@ static inline int paravirt_enabled(void) return pv_info.paravirt_enabled; } +static inline int paravirt_has_feature(unsigned int feature) +{ + WARN_ON_ONCE(!pv_info.paravirt_enabled); + return (pv_info.features & feature); +} + static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -153,7 +159,11 @@ do { \ val = paravirt_read_msr(msr, &_err); \ } while (0) -#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32) +static inline void wrmsrl(unsigned msr, u64 val) +{ + wrmsr(msr, (u32)val, (u32)(val>>32)); +} + #define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b) /* rdmsr with exception handling */ @@ -174,19 +184,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -static inline u64 paravirt_read_tsc(void) -{ - return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); -} - -#define rdtscl(low) \ -do { \ - u64 _l = paravirt_read_tsc(); \ - low = (int)_l; \ -} while (0) - -#define rdtscll(val) (val = paravirt_read_tsc()) - static inline unsigned long long paravirt_sched_clock(void) { return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); @@ -215,27 +212,6 @@ do { \ #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) -static inline unsigned long long paravirt_rdtscp(unsigned int *aux) -{ - return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); -} - -#define rdtscp(low, high, aux) \ -do { \ - int __aux; \ - unsigned long __val = paravirt_rdtscp(&__aux); \ - (low) = (u32)__val; \ - (high) = (u32)(__val >> 32); \ - (aux) = __aux; \ -} while (0) - -#define rdtscpll(val, aux) \ -do { \ - unsigned long __aux; \ - val = paravirt_rdtscp(&__aux); \ - (aux) = __aux; \ -} while (0) - static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries); @@ -712,6 +688,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#ifdef CONFIG_QUEUED_SPINLOCKS + +static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, + u32 val) +{ + PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val); +} + +static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) +{ + PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock); +} + +static __always_inline void pv_wait(u8 *ptr, u8 val) +{ + PVOP_VCALL2(pv_lock_ops.wait, ptr, val); +} + +static __always_inline void pv_kick(int cpu) +{ + PVOP_VCALL1(pv_lock_ops.kick, cpu); +} + +#else /* !CONFIG_QUEUED_SPINLOCKS */ + static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) { @@ -724,7 +725,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } -#endif +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +#endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef 
CONFIG_X86_32 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" diff --git a/kernel/arch/x86/include/asm/paravirt_types.h b/kernel/arch/x86/include/asm/paravirt_types.h index f7b0b5c11..3d4419118 100644 --- a/kernel/arch/x86/include/asm/paravirt_types.h +++ b/kernel/arch/x86/include/asm/paravirt_types.h @@ -70,9 +70,14 @@ struct pv_info { #endif int paravirt_enabled; + unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; +#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) +/* Supported features */ +#define PV_SUPPORTED_RTC (1<<0) + struct pv_init_ops { /* * Patch may replace one of the defined code sequences with @@ -97,7 +102,6 @@ struct pv_lazy_ops { struct pv_time_ops { unsigned long long (*sched_clock)(void); unsigned long long (*steal_clock)(int cpu); - unsigned long (*get_tsc_khz)(void); }; struct pv_cpu_ops { @@ -156,17 +160,16 @@ struct pv_cpu_ops { u64 (*read_msr)(unsigned int msr, int *err); int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - u64 (*read_tsc)(void); u64 (*read_pmc)(int counter); - unsigned long long (*read_tscp)(unsigned int *aux); +#ifdef CONFIG_X86_32 /* * Atomically enable interrupts and return to userspace. This - * is only ever used to return to 32-bit processes; in a - * 64-bit kernel, it's used for 32-on-64 compat processes, but - * never native 64-bit processes. (Jump, not call.) + * is only used in 32-bit kernels. 64-bit kernels use + * usergs_sysret32 instead. */ void (*irq_enable_sysexit)(void); +#endif /* * Switch to usermode gs and return to 64-bit usermode using @@ -333,9 +336,19 @@ struct arch_spinlock; typedef u16 __ticket_t; #endif +struct qspinlock; + struct pv_lock_ops { +#ifdef CONFIG_QUEUED_SPINLOCKS + void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); + struct paravirt_callee_save queued_spin_unlock; + + void (*wait)(u8 *ptr, u8 val); + void (*kick)(int cpu); +#else /* !CONFIG_QUEUED_SPINLOCKS */ struct paravirt_callee_save lock_spinning; void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); +#endif /* !CONFIG_QUEUED_SPINLOCKS */ }; /* This contains all the paravirt structures: we get a convenient diff --git a/kernel/arch/x86/include/asm/pat.h b/kernel/arch/x86/include/asm/pat.h index 91bc4ba95..ca6c228d5 100644 --- a/kernel/arch/x86/include/asm/pat.h +++ b/kernel/arch/x86/include/asm/pat.h @@ -4,14 +4,9 @@ #include #include -#ifdef CONFIG_X86_PAT -extern int pat_enabled; -#else -static const int pat_enabled; -#endif - +bool pat_enabled(void); extern void pat_init(void); -void pat_init_cache_modes(void); +void pat_init_cache_modes(u64); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/kernel/arch/x86/include/asm/pci.h b/kernel/arch/x86/include/asm/pci.h index 4e370a5d8..462594320 100644 --- a/kernel/arch/x86/include/asm/pci.h +++ b/kernel/arch/x86/include/asm/pci.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include @@ -80,13 +80,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, #ifdef CONFIG_PCI extern void early_quirks(void); -static inline void pci_dma_burst_advice(struct pci_dev *pdev, - enum pci_dma_burst_strategy *strat, - unsigned long *strategy_parameter) -{ - *strat = PCI_DMA_BURST_INFINITY; - *strategy_parameter = ~0UL; -} #else static inline void early_quirks(void) { } #endif @@ -96,15 +89,10 @@ extern void pci_iommu_alloc(void); #ifdef CONFIG_PCI_MSI /* implemented in arch/x86/kernel/apic/io_apic. 
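A sketch of the intended use of the new paravirt feature test (hypothetical caller; this mirrors how the PV_SUPPORTED_RTC flag is meant to be consumed when deciding whether to register a CMOS RTC device):

static bool platform_has_rtc_sketch(void)
{
	/* On a paravirt guest, trust the hypervisor's feature list;
	 * otherwise assume a CMOS RTC is present. */
	if (paravirt_enabled() && !paravirt_has(RTC))
		return false;
	return true;
}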
*/ struct msi_desc; -void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, u8 hpet_id); int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void native_teardown_msi_irq(unsigned int irq); void native_restore_msi_irqs(struct pci_dev *dev); -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset); #else -#define native_compose_msi_msg NULL #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #endif diff --git a/kernel/arch/x86/include/asm/pci_x86.h b/kernel/arch/x86/include/asm/pci_x86.h index 164e3f8d3..fa1195dae 100644 --- a/kernel/arch/x86/include/asm/pci_x86.h +++ b/kernel/arch/x86/include/asm/pci_x86.h @@ -93,8 +93,6 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); -extern bool mp_should_keep_irq(struct device *dev); - struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/kernel/arch/x86/include/asm/perf_event.h b/kernel/arch/x86/include/asm/perf_event.h index dc0f6ed35..7bcb861a0 100644 --- a/kernel/arch/x86/include/asm/perf_event.h +++ b/kernel/arch/x86/include/asm/perf_event.h @@ -159,6 +159,13 @@ struct x86_pmu_capability { */ #define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16) +#define GLOBAL_STATUS_COND_CHG BIT_ULL(63) +#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62) +#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) +#define GLOBAL_STATUS_ASIF BIT_ULL(60) +#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) +#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) + /* * IBS cpuid feature detection */ diff --git a/kernel/arch/x86/include/asm/pgtable.h b/kernel/arch/x86/include/asm/pgtable.h index fe57e7a98..6ec0c8b2e 100644 --- a/kernel/arch/x86/include/asm/pgtable.h +++ b/kernel/arch/x86/include/asm/pgtable.h @@ -19,6 +19,13 @@ #include void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_checkwx(void); + +#ifdef CONFIG_DEBUG_WX +#define debug_checkwx() ptdump_walk_pgd_level_checkwx() +#else +#define debug_checkwx() do { } while (0) +#endif /* * ZERO_PAGE is a global shared page that is always zero: used @@ -142,12 +149,12 @@ static inline unsigned long pte_pfn(pte_t pte) static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { - return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT; + return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) @@ -318,6 +325,16 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pte_t pte_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); +} + #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* @@ -379,7 +396,9 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) return __pgprot(preservebits | addbits); } -#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) +#define pte_pgprot(x) __pgprot(pte_flags(x)) +#define pmd_pgprot(x) __pgprot(pmd_flags(x)) +#define pud_pgprot(x) __pgprot(pud_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) @@ -398,11 +417,17 @@ static inline 
int is_new_memtype_allowed(u64 paddr, unsigned long size, * requested memtype: * - request is uncached, return cannot be write-back * - request is write-combine, return cannot be write-back + * - request is write-through, return cannot be write-back + * - request is write-through, return cannot be write-combine */ if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && new_pcm == _PAGE_CACHE_MODE_WB) || (pcm == _PAGE_CACHE_MODE_WC && - new_pcm == _PAGE_CACHE_MODE_WB)) { + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WT && + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WT && + new_pcm == _PAGE_CACHE_MODE_WC)) { return 0; } @@ -496,14 +521,15 @@ static inline int pmd_none(pmd_t pmd) static inline unsigned long pmd_page_vaddr(pmd_t pmd) { - return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); + return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) +#define pmd_page(pmd) \ + pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -564,14 +590,15 @@ static inline int pud_present(pud_t pud) static inline unsigned long pud_page_vaddr(pud_t pud) { - return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); + return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); } /* * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) +#define pud_page(pud) \ + pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) /* Find an entry in the second-level page table.. 
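The reason pmd_pfn()/pud_pfn() can no longer use PTE_PFN_MASK is that for a large mapping (_PAGE_PSE set) the low bits of the frame field are not address bits; bit 12, for instance, is the large-page PAT bit. A standalone sketch with hard-coded x86-64 masks, purely illustrative:

#include <stdint.h>

#define PSE_FLAG	(1ULL << 7)		/* _PAGE_PSE */
#define MASK_4K		(((1ULL << 52) - 1) & ~((1ULL << 12) - 1)) /* bits 51:12 */
#define MASK_2M		(((1ULL << 52) - 1) & ~((1ULL << 21) - 1)) /* bits 51:21 */

static uint64_t pmd_phys_sketch(uint64_t pmd)
{
	/* 2MB mappings must be masked at 2MB granularity so that the
	 * PAT bit and other low flag bits are not mistaken for
	 * physical address bits. */
	return pmd & ((pmd & PSE_FLAG) ? MASK_2M : MASK_4K);
}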
*/ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) @@ -799,8 +826,8 @@ static inline int pmd_write(pmd_t pmd) return pmd_flags(pmd) & _PAGE_RW; } -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, +#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = native_pmdp_get_and_clear(pmdp); diff --git a/kernel/arch/x86/include/asm/pgtable_types.h b/kernel/arch/x86/include/asm/pgtable_types.h index 78f0c8cbe..79c91853e 100644 --- a/kernel/arch/x86/include/asm/pgtable_types.h +++ b/kernel/arch/x86/include/asm/pgtable_types.h @@ -209,10 +209,10 @@ enum page_cache_mode { #include -/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) -/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_FLAGS_MASK (~PTE_PFN_MASK) typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; @@ -276,14 +276,40 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) } #endif +static inline pudval_t pud_pfn_mask(pud_t pud) +{ + if (native_pud_val(pud) & _PAGE_PSE) + return PHYSICAL_PUD_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pudval_t pud_flags_mask(pud_t pud) +{ + return ~pud_pfn_mask(pud); +} + static inline pudval_t pud_flags(pud_t pud) { - return native_pud_val(pud) & PTE_FLAGS_MASK; + return native_pud_val(pud) & pud_flags_mask(pud); +} + +static inline pmdval_t pmd_pfn_mask(pmd_t pmd) +{ + if (native_pmd_val(pmd) & _PAGE_PSE) + return PHYSICAL_PMD_PAGE_MASK; + else + return PTE_PFN_MASK; +} + +static inline pmdval_t pmd_flags_mask(pmd_t pmd) +{ + return ~pmd_pfn_mask(pmd); } static inline pmdval_t pmd_flags(pmd_t pmd) { - return native_pmd_val(pmd) & PTE_FLAGS_MASK; + return native_pmd_val(pmd) & pmd_flags_mask(pmd); } static inline pte_t native_make_pte(pteval_t val) @@ -337,20 +363,18 @@ static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) } static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) { + pgprotval_t val = pgprot_val(pgprot); pgprot_t new; - unsigned long val; - val = pgprot_val(pgprot); pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); return new; } static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) { + pgprotval_t val = pgprot_val(pgprot); pgprot_t new; - unsigned long val; - val = pgprot_val(pgprot); pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT_LARGE) >> (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); @@ -367,6 +391,9 @@ extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); +#define pgprot_writethrough pgprot_writethrough +extern pgprot_t pgprot_writethrough(pgprot_t prot); + /* Indicate that x86 has its own track and untrack pfn vma functions */ #define __HAVE_PFNMAP_TRACKING diff --git a/kernel/arch/x86/include/asm/pmc_atom.h b/kernel/arch/x86/include/asm/pmc_atom.h index bc0fc0866..aa8744c77 100644 --- a/kernel/arch/x86/include/asm/pmc_atom.h +++ b/kernel/arch/x86/include/asm/pmc_atom.h @@ -18,6 +18,8 @@ /* ValleyView Power Control Unit PCI Device ID */ #define PCI_DEVICE_ID_VLV_PMC 0x0F1C +/* CherryTrail Power Control Unit PCI Device ID */ +#define PCI_DEVICE_ID_CHT_PMC 0x229C /* 
PMC Memory mapped IO registers */ #define PMC_BASE_ADDR_OFFSET 0x44 @@ -29,6 +31,10 @@ #define PMC_FUNC_DIS 0x34 #define PMC_FUNC_DIS_2 0x38 +/* CHT specific bits in FUNC_DIS2 register */ +#define BIT_FD_GMM BIT(3) +#define BIT_FD_ISH BIT(4) + /* S0ix wake event control */ #define PMC_S0IX_WAKE_EN 0x3C @@ -75,6 +81,21 @@ #define PMC_PSS_BIT_USB BIT(16) #define PMC_PSS_BIT_USB_SUS BIT(17) +/* CHT specific bits in PSS register */ +#define PMC_PSS_BIT_CHT_UFS BIT(7) +#define PMC_PSS_BIT_CHT_UXD BIT(11) +#define PMC_PSS_BIT_CHT_UXD_FD BIT(12) +#define PMC_PSS_BIT_CHT_UX_ENG BIT(15) +#define PMC_PSS_BIT_CHT_USB_SUS BIT(16) +#define PMC_PSS_BIT_CHT_GMM BIT(17) +#define PMC_PSS_BIT_CHT_ISH BIT(18) +#define PMC_PSS_BIT_CHT_DFX_MASTER BIT(26) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER1 BIT(27) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER2 BIT(28) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER3 BIT(29) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER4 BIT(30) +#define PMC_PSS_BIT_CHT_DFX_CLUSTER5 BIT(31) + /* These registers reflect D3 status of functions */ #define PMC_D3_STS_0 0xA0 @@ -117,6 +138,10 @@ #define BIT_USH_SS_PHY BIT(2) #define BIT_DFX BIT(3) +/* CHT specific bits in PMC_D3_STS_1 register */ +#define BIT_STS_GMM BIT(1) +#define BIT_STS_ISH BIT(2) + /* PMC I/O Registers */ #define ACPI_BASE_ADDR_OFFSET 0x40 #define ACPI_BASE_ADDR_MASK 0xFFFFFE00 @@ -126,4 +151,8 @@ #define SLEEP_TYPE_MASK 0xFFFFECFF #define SLEEP_TYPE_S5 0x1C00 #define SLEEP_ENABLE 0x2000 + +extern int pmc_atom_read(int offset, u32 *value); +extern int pmc_atom_write(int offset, u32 value); + #endif /* PMC_ATOM_H */ diff --git a/kernel/arch/x86/include/asm/pmem.h b/kernel/arch/x86/include/asm/pmem.h new file mode 100644 index 000000000..d8ce3ec81 --- /dev/null +++ b/kernel/arch/x86/include/asm/pmem.h @@ -0,0 +1,153 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __ASM_X86_PMEM_H__ +#define __ASM_X86_PMEM_H__ + +#include +#include +#include +#include + +#ifdef CONFIG_ARCH_HAS_PMEM_API +/** + * arch_memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Copy data to persistent memory media via non-temporal stores so that + * a subsequent arch_wmb_pmem() can flush cpu and memory controller + * write buffers to guarantee durability. + */ +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + int unwritten; + + /* + * We are copying between two kernel buffers, if + * __copy_from_user_inatomic_nocache() returns an error (page + * fault) we would have already reported a general protection fault + * before the WARN+BUG. 
+ */ + unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, + (void __user *) src, n); + if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", + __func__, dst, src, unwritten)) + BUG(); +} + +/** + * arch_wmb_pmem - synchronize writes to persistent memory + * + * After a series of arch_memcpy_to_pmem() operations this drains data + * from cpu write buffers and any platform (memory controller) buffers + * to ensure that written data is durable on persistent memory media. + */ +static inline void arch_wmb_pmem(void) +{ + /* + * wmb() to 'sfence' all previous writes such that they are + * architecturally visible to 'pcommit'. Note, that we've + * already arranged for pmem writes to avoid the cache via + * arch_memcpy_to_pmem(). + */ + wmb(); + pcommit_sfence(); +} + +/** + * __arch_wb_cache_pmem - write back a cache range with CLWB + * @vaddr: virtual start address + * @size: number of bytes to write back + * + * Write back a cache range using the CLWB (cache line write back) + * instruction. This function requires explicit ordering with an + * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + */ +static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +{ + u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; + unsigned long clflush_mask = x86_clflush_size - 1; + void *vend = vaddr + size; + void *p; + + for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + p < vend; p += x86_clflush_size) + clwb(p); +} + +/* + * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec + * iterators, so for other types (bvec & kvec) we must do a cache write-back. + */ +static inline bool __iter_needs_pmem_wb(struct iov_iter *i) +{ + return iter_is_iovec(i) == false; +} + +/** + * arch_copy_from_iter_pmem - copy data from an iterator to PMEM + * @addr: PMEM destination address + * @bytes: number of bytes to copy + * @i: iterator with source data + * + * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. + * This function requires explicit ordering with an arch_wmb_pmem() call. + */ +static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, + struct iov_iter *i) +{ + void *vaddr = (void __force *)addr; + size_t len; + + /* TODO: skip the write-back by always using non-temporal stores */ + len = copy_from_iter_nocache(vaddr, bytes, i); + + if (__iter_needs_pmem_wb(i)) + __arch_wb_cache_pmem(vaddr, bytes); + + return len; +} + +/** + * arch_clear_pmem - zero a PMEM memory range + * @addr: virtual start address + * @size: number of bytes to zero + * + * Write zeros into the memory range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with an arch_wmb_pmem() call. 
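Taken together, the intended calling sequence for a durable write is a non-temporal copy followed by the write-buffer drain; a minimal usage sketch (not part of the patch, and 'dst' is assumed to point into an existing persistent-memory mapping):

static void pmem_write_durable_sketch(void __pmem *dst, const void *buf,
				      size_t len)
{
	arch_memcpy_to_pmem(dst, buf, len);	/* non-temporal stores */
	arch_wmb_pmem();			/* sfence + pcommit    */
}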
+ */ +static inline void arch_clear_pmem(void __pmem *addr, size_t size) +{ + void *vaddr = (void __force *)addr; + + /* TODO: implement the zeroing via non-temporal writes */ + if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) + clear_page(vaddr); + else + memset(vaddr, 0, size); + + __arch_wb_cache_pmem(vaddr, size); +} + +static inline bool __arch_has_wmb_pmem(void) +{ + /* + * We require that wmb() be an 'sfence', that is only guaranteed on + * 64-bit builds + */ + return static_cpu_has(X86_FEATURE_PCOMMIT); +} +#endif /* CONFIG_ARCH_HAS_PMEM_API */ +#endif /* __ASM_X86_PMEM_H__ */ diff --git a/kernel/arch/x86/include/asm/preempt.h b/kernel/arch/x86/include/asm/preempt.h index 6ef9a161d..5dbd2d0f9 100644 --- a/kernel/arch/x86/include/asm/preempt.h +++ b/kernel/arch/x86/include/asm/preempt.h @@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define init_task_preempt_count(p) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ -} while (0) +#define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \ per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ } while (0) @@ -101,13 +98,13 @@ static __always_inline bool __preempt_count_dec_and_test(void) /* * Returns true when we need to resched and can (barring IRQ state). */ -static __always_inline bool should_resched(void) +static __always_inline bool should_resched(int preempt_offset) { #ifdef CONFIG_PREEMPT_LAZY - return unlikely(!raw_cpu_read_4(__preempt_count) || \ + return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset || test_thread_flag(TIF_NEED_RESCHED_LAZY)); #else - return unlikely(!raw_cpu_read_4(__preempt_count)); + return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); #endif } @@ -115,11 +112,9 @@ static __always_inline bool should_resched(void) extern asmlinkage void ___preempt_schedule(void); # define __preempt_schedule() asm ("call ___preempt_schedule") extern asmlinkage void preempt_schedule(void); -# ifdef CONFIG_CONTEXT_TRACKING - extern asmlinkage void ___preempt_schedule_context(void); -# define __preempt_schedule_context() asm ("call ___preempt_schedule_context") - extern asmlinkage void preempt_schedule_context(void); -# endif + extern asmlinkage void ___preempt_schedule_notrace(void); +# define __preempt_schedule_notrace() asm ("call ___preempt_schedule_notrace") + extern asmlinkage void preempt_schedule_notrace(void); #endif #endif /* __ASM_PREEMPT_H */ diff --git a/kernel/arch/x86/include/asm/processor.h b/kernel/arch/x86/include/asm/processor.h index 23ba6765b..2d5a50cb6 100644 --- a/kernel/arch/x86/include/asm/processor.h +++ b/kernel/arch/x86/include/asm/processor.h @@ -6,12 +6,12 @@ /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; +struct vm86; -#include #include #include #include -#include +#include #include #include #include @@ -21,6 +21,7 @@ struct mm_struct; #include #include #include +#include #include #include @@ -52,11 +53,16 @@ static inline void *current_text_addr(void) return pc; } +/* + * These alignment constraints are for performance in the vSMP case, + * but in the task_struct case we must also meet hardware imposed + * alignment requirements of the FPU state: + */ #ifdef CONFIG_X86_VSMP # define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) # define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) #else -# define 
ARCH_MIN_TASKALIGN 16 +# define ARCH_MIN_TASKALIGN __alignof__(union fpregs_state) # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif @@ -166,7 +172,6 @@ extern const struct seq_operations cpuinfo_op; #define cache_line_size() (boot_cpu_data.x86_cache_alignment) extern void cpu_detect(struct cpuinfo_x86 *c); -extern void fpu_detect(struct cpuinfo_x86 *c); extern void early_cpu_init(void); extern void identify_boot_cpu(void); @@ -313,128 +318,6 @@ struct orig_ist { unsigned long ist[7]; }; -#define MXCSR_DEFAULT 0x1f80 - -struct i387_fsave_struct { - u32 cwd; /* FPU Control Word */ - u32 swd; /* FPU Status Word */ - u32 twd; /* FPU Tag Word */ - u32 fip; /* FPU IP Offset */ - u32 fcs; /* FPU IP Selector */ - u32 foo; /* FPU Operand Pointer Offset */ - u32 fos; /* FPU Operand Pointer Selector */ - - /* 8*10 bytes for each FP-reg = 80 bytes: */ - u32 st_space[20]; - - /* Software status information [not touched by FSAVE ]: */ - u32 status; -}; - -struct i387_fxsave_struct { - u16 cwd; /* Control Word */ - u16 swd; /* Status Word */ - u16 twd; /* Tag Word */ - u16 fop; /* Last Instruction Opcode */ - union { - struct { - u64 rip; /* Instruction Pointer */ - u64 rdp; /* Data Pointer */ - }; - struct { - u32 fip; /* FPU IP Offset */ - u32 fcs; /* FPU IP Selector */ - u32 foo; /* FPU Operand Offset */ - u32 fos; /* FPU Operand Selector */ - }; - }; - u32 mxcsr; /* MXCSR Register State */ - u32 mxcsr_mask; /* MXCSR Mask */ - - /* 8*16 bytes for each FP-reg = 128 bytes: */ - u32 st_space[32]; - - /* 16*16 bytes for each XMM-reg = 256 bytes: */ - u32 xmm_space[64]; - - u32 padding[12]; - - union { - u32 padding1[12]; - u32 sw_reserved[12]; - }; - -} __attribute__((aligned(16))); - -struct i387_soft_struct { - u32 cwd; - u32 swd; - u32 twd; - u32 fip; - u32 fcs; - u32 foo; - u32 fos; - /* 8*10 bytes for each FP-reg = 80 bytes: */ - u32 st_space[20]; - u8 ftop; - u8 changed; - u8 lookahead; - u8 no_update; - u8 rm; - u8 alimit; - struct math_emu_info *info; - u32 entry_eip; -}; - -struct ymmh_struct { - /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ - u32 ymmh_space[64]; -}; - -/* We don't support LWP yet: */ -struct lwp_struct { - u8 reserved[128]; -}; - -struct bndreg { - u64 lower_bound; - u64 upper_bound; -} __packed; - -struct bndcsr { - u64 bndcfgu; - u64 bndstatus; -} __packed; - -struct xsave_hdr_struct { - u64 xstate_bv; - u64 xcomp_bv; - u64 reserved[6]; -} __attribute__((packed)); - -struct xsave_struct { - struct i387_fxsave_struct i387; - struct xsave_hdr_struct xsave_hdr; - struct ymmh_struct ymmh; - struct lwp_struct lwp; - struct bndreg bndreg[4]; - struct bndcsr bndcsr; - /* new processor state extensions will go here */ -} __attribute__ ((packed, aligned (64))); - -union thread_xstate { - struct i387_fsave_struct fsave; - struct i387_fxsave_struct fxsave; - struct i387_soft_struct soft; - struct xsave_struct xsave; -}; - -struct fpu { - unsigned int last_cpu; - unsigned int has_fpu; - union thread_xstate *state; -}; - #ifdef CONFIG_X86_64 DECLARE_PER_CPU(struct orig_ist, orig_ist); @@ -483,8 +366,6 @@ DECLARE_PER_CPU(struct irq_stack *, softirq_stack); #endif /* X86_64 */ extern unsigned int xstate_size; -extern void free_thread_xstate(struct task_struct *); -extern struct kmem_cache *task_xstate_cachep; struct perf_event; @@ -508,6 +389,7 @@ struct thread_struct { unsigned long fs; #endif unsigned long gs; + /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... 
*/ @@ -518,32 +400,22 @@ struct thread_struct { unsigned long cr2; unsigned long trap_nr; unsigned long error_code; - /* floating point and extended processor state */ - struct fpu fpu; -#ifdef CONFIG_X86_32 +#ifdef CONFIG_VM86 /* Virtual 86 mode info */ - struct vm86_struct __user *vm86_info; - unsigned long screen_bitmap; - unsigned long v86flags; - unsigned long v86mask; - unsigned long saved_sp0; - unsigned int saved_fs; - unsigned int saved_gs; + struct vm86 *vm86; #endif /* IO permissions: */ unsigned long *io_bitmap_ptr; unsigned long iopl; /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; + + /* Floating point and extended processor state */ + struct fpu fpu; /* - * fpu_counter contains the number of consecutive context switches - * that the FPU is used. If this is over a threshold, the lazy fpu - * saving becomes unlazy to save the trap. This is an unsigned char - * so that after 256 times the counter wraps and the behavior turns - * lazy again; this to deal with bursty apps that only use FPU for - * a short time + * WARNING: 'fpu' is dynamically-sized. It *MUST* be at + * the end. */ - unsigned char fpu_counter; }; /* @@ -600,6 +472,7 @@ static inline unsigned long current_top_of_stack(void) #else #define __cpuid native_cpuid #define paravirt_enabled() 0 +#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) @@ -684,12 +557,12 @@ static inline unsigned int cpuid_edx(unsigned int op) } /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) +static __always_inline void rep_nop(void) { asm volatile("rep; nop" ::: "memory"); } -static inline void cpu_relax(void) +static __always_inline void cpu_relax(void) { rep_nop(); } @@ -773,14 +646,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr) extern void set_task_blockstep(struct task_struct *task, bool on); -/* - * from system description table in BIOS. 
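The "fpu must be last" rule exists because the task allocation reserves extra bytes past the nominal end of the structure for the CPU-dependent xstate image; a user-space analogue of that layout (hypothetical, illustrative only):

#include <stdlib.h>

struct thread_like {
	unsigned long sp0;
	/* ... other fixed-size members ... */
	struct {
		unsigned int last_cpu;
		unsigned char state[1];	/* really xstate_size bytes long */
	} fpu;				/* nothing may follow this member */
};

static struct thread_like *alloc_thread_like(size_t xstate_size)
{
	/* reserve the full, CPU-dependent xstate image past state[0] */
	return calloc(1, sizeof(struct thread_like) - 1 + xstate_size);
}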
Mostly for MCA use, but - * others may find it useful: - */ -extern unsigned int machine_id; -extern unsigned int machine_submodel_id; -extern unsigned int BIOS_revision; - /* Boot loader type from the setup header: */ extern int bootloader_type; extern int bootloader_version; @@ -842,7 +707,6 @@ static inline void spin_lock_prefetch(const void *x) #define INIT_THREAD { \ .sp0 = TOP_OF_INIT_STACK, \ - .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ } @@ -928,24 +792,25 @@ extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); /* Register/unregister a process' MPX related resource */ -#define MPX_ENABLE_MANAGEMENT(tsk) mpx_enable_management((tsk)) -#define MPX_DISABLE_MANAGEMENT(tsk) mpx_disable_management((tsk)) +#define MPX_ENABLE_MANAGEMENT() mpx_enable_management() +#define MPX_DISABLE_MANAGEMENT() mpx_disable_management() #ifdef CONFIG_X86_INTEL_MPX -extern int mpx_enable_management(struct task_struct *tsk); -extern int mpx_disable_management(struct task_struct *tsk); +extern int mpx_enable_management(void); +extern int mpx_disable_management(void); #else -static inline int mpx_enable_management(struct task_struct *tsk) +static inline int mpx_enable_management(void) { return -EINVAL; } -static inline int mpx_disable_management(struct task_struct *tsk) +static inline int mpx_disable_management(void) { return -EINVAL; } #endif /* CONFIG_X86_INTEL_MPX */ extern u16 amd_get_nb_id(int cpu); +extern u32 amd_get_nodes_per_socket(void); static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { diff --git a/kernel/arch/x86/include/asm/proto.h b/kernel/arch/x86/include/asm/proto.h index a90f8972d..a4a77286c 100644 --- a/kernel/arch/x86/include/asm/proto.h +++ b/kernel/arch/x86/include/asm/proto.h @@ -5,12 +5,14 @@ /* misc architecture specific prototypes */ -void system_call(void); void syscall_init(void); -void ia32_syscall(void); -void ia32_cstar_target(void); -void ia32_sysenter_target(void); +void entry_SYSCALL_64(void); +void entry_SYSCALL_compat(void); +void entry_INT80_32(void); +void entry_INT80_compat(void); +void entry_SYSENTER_32(void); +void entry_SYSENTER_compat(void); void x86_configure_nx(void); void x86_report_nx(void); diff --git a/kernel/arch/x86/include/asm/ptrace.h b/kernel/arch/x86/include/asm/ptrace.h index 5fabf1362..6271281f9 100644 --- a/kernel/arch/x86/include/asm/ptrace.h +++ b/kernel/arch/x86/include/asm/ptrace.h @@ -88,7 +88,6 @@ extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, unsigned long phase1_result); extern long syscall_trace_enter(struct pt_regs *); -extern void syscall_trace_leave(struct pt_regs *); static inline unsigned long regs_return_value(struct pt_regs *regs) { diff --git a/kernel/arch/x86/include/asm/pvclock-abi.h b/kernel/arch/x86/include/asm/pvclock-abi.h index 6167fd798..67f082301 100644 --- a/kernel/arch/x86/include/asm/pvclock-abi.h +++ b/kernel/arch/x86/include/asm/pvclock-abi.h @@ -41,5 +41,7 @@ struct pvclock_wall_clock { #define PVCLOCK_TSC_STABLE_BIT (1 << 0) #define PVCLOCK_GUEST_STOPPED (1 << 1) +/* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. 
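[Illustrative aside, not part of the patch: the pvclock flag bits touched here and the pvclock.h read path changed in the next hunk are consumed under a version-based retry protocol; the host makes 'version' odd while it rewrites the per-vCPU record, so the guest rereads until it sees the same even value twice. A sketch of that read, assuming the pvclock_vcpu_time_info layout from pvclock-abi.h; read_tsc() stands in for rdtsc_ordered() and this is not the kernel's actual function.]

/* Sketch only: version-retry read of a pvclock record. */
static u64 example_pvclock_read(const volatile struct pvclock_vcpu_time_info *pvti,
				u64 (*read_tsc)(void))
{
	u32 version;
	u64 ns;

	do {
		version = pvti->version;
		rmb();		/* order the version read against the fields below */
		ns = pvti->system_time +
		     pvclock_scale_delta(read_tsc() - pvti->tsc_timestamp,
					 pvti->tsc_to_system_mul,
					 pvti->tsc_shift);
		rmb();
	} while ((version & 1) || version != pvti->version);

	return ns;
}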
*/ +#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/kernel/arch/x86/include/asm/pvclock.h b/kernel/arch/x86/include/asm/pvclock.h index d6b078e9f..7a6bed5c0 100644 --- a/kernel/arch/x86/include/asm/pvclock.h +++ b/kernel/arch/x86/include/asm/pvclock.h @@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) static __always_inline u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) { - u64 delta = __native_read_tsc() - src->tsc_timestamp; + u64 delta = rdtsc_ordered() - src->tsc_timestamp; return pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); } @@ -76,17 +76,10 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u8 ret_flags; version = src->version; - /* Note: emulated platforms which do not advertise SSE2 support - * result in kvmclock not using the necessary RDTSC barriers. - * Without barriers, it is possible that RDTSC instruction reads from - * the time stamp counter outside rdtsc_barrier protected section - * below, resulting in violation of monotonicity. - */ - rdtsc_barrier(); + offset = pvclock_get_nsec_offset(src); ret = src->system_time + offset; ret_flags = src->flags; - rdtsc_barrier(); *cycles = ret; *flags = ret_flags; diff --git a/kernel/arch/x86/include/asm/qrwlock.h b/kernel/arch/x86/include/asm/qrwlock.h index ae0e241e2..c537cbb03 100644 --- a/kernel/arch/x86/include/asm/qrwlock.h +++ b/kernel/arch/x86/include/asm/qrwlock.h @@ -2,16 +2,6 @@ #define _ASM_X86_QRWLOCK_H #include - -#ifndef CONFIG_X86_PPRO_FENCE -#define queue_write_unlock queue_write_unlock -static inline void queue_write_unlock(struct qrwlock *lock) -{ - barrier(); - ACCESS_ONCE(*(u8 *)&lock->cnts) = 0; -} -#endif - #include #endif /* _ASM_X86_QRWLOCK_H */ diff --git a/kernel/arch/x86/include/asm/qspinlock.h b/kernel/arch/x86/include/asm/qspinlock.h new file mode 100644 index 000000000..eaba08076 --- /dev/null +++ b/kernel/arch/x86/include/asm/qspinlock.h @@ -0,0 +1,66 @@ +#ifndef _ASM_X86_QSPINLOCK_H +#define _ASM_X86_QSPINLOCK_H + +#include +#include +#include + +#define queued_spin_unlock queued_spin_unlock +/** + * queued_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + * + * A smp_store_release() on the least-significant byte. + */ +static inline void native_queued_spin_unlock(struct qspinlock *lock) +{ + smp_store_release((u8 *)lock, 0); +} + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_init_lock_hash(void); +extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); + +static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + pv_queued_spin_lock_slowpath(lock, val); +} + +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + pv_queued_spin_unlock(lock); +} +#else +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} +#endif + +#ifdef CONFIG_PARAVIRT +#define virt_spin_lock virt_spin_lock +static inline bool virt_spin_lock(struct qspinlock *lock) +{ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + /* + * On hypervisors without PARAVIRT_SPINLOCKS support we fall + * back to a Test-and-Set spinlock, because fair locks have + * horrible lock 'holder' preemption issues. 
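[Illustrative aside, not part of the patch: the fallback described in the comment above is implemented by the cmpxchg loop the hunk continues with below. As a standalone sketch of the same test-and-set pattern in plain C11 atomics — the type and helper names are invented for the example.]

#include <stdatomic.h>

/* Sketch: a test-and-set spinlock of the kind virt_spin_lock() falls back to. */
struct tas_lock { atomic_uint val; };

static void tas_lock_acquire(struct tas_lock *lock)
{
	unsigned int expected;

	do {
		/* Spin read-only until the lock looks free, then try to grab it. */
		while (atomic_load_explicit(&lock->val, memory_order_relaxed) != 0)
			;
		expected = 0;
	} while (!atomic_compare_exchange_strong_explicit(&lock->val, &expected, 1,
							  memory_order_acquire,
							  memory_order_relaxed));
}

static void tas_lock_release(struct tas_lock *lock)
{
	/* Like queued_spin_unlock() above: a single store-release is enough. */
	atomic_store_explicit(&lock->val, 0, memory_order_release);
}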
+ */ + + do { + while (atomic_read(&lock->val) != 0) + cpu_relax(); + } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0); + + return true; +} +#endif /* CONFIG_PARAVIRT */ + +#include + +#endif /* _ASM_X86_QSPINLOCK_H */ diff --git a/kernel/arch/x86/include/asm/qspinlock_paravirt.h b/kernel/arch/x86/include/asm/qspinlock_paravirt.h new file mode 100644 index 000000000..b002e711b --- /dev/null +++ b/kernel/arch/x86/include/asm/qspinlock_paravirt.h @@ -0,0 +1,6 @@ +#ifndef __ASM_QSPINLOCK_PARAVIRT_H +#define __ASM_QSPINLOCK_PARAVIRT_H + +PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); + +#endif diff --git a/kernel/arch/x86/include/asm/serial.h b/kernel/arch/x86/include/asm/serial.h index 8378b8c91..bb658211e 100644 --- a/kernel/arch/x86/include/asm/serial.h +++ b/kernel/arch/x86/include/asm/serial.h @@ -11,7 +11,7 @@ #define BASE_BAUD (1843200/16) /* Standard COM flags (except for COM4, because of the 8514 problem) */ -#ifdef CONFIG_SERIAL_DETECT_IRQ +#ifdef CONFIG_SERIAL_8250_DETECT_IRQ # define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ) # define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | UPF_AUTO_IRQ) #else diff --git a/kernel/arch/x86/include/asm/setup.h b/kernel/arch/x86/include/asm/setup.h index f69e06b28..11af24e09 100644 --- a/kernel/arch/x86/include/asm/setup.h +++ b/kernel/arch/x86/include/asm/setup.h @@ -60,17 +60,24 @@ static inline void x86_ce4100_early_setup(void) { } #ifndef _SETUP #include +#include /* * This is set up by the setup-routine at boot-time */ extern struct boot_params boot_params; +extern char _text[]; static inline bool kaslr_enabled(void) { return !!(boot_params.hdr.loadflags & KASLR_FLAG); } +static inline unsigned long kaslr_offset(void) +{ + return (unsigned long)&_text - __START_KERNEL; +} + /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. diff --git a/kernel/arch/x86/include/asm/sigcontext.h b/kernel/arch/x86/include/asm/sigcontext.h index 9dfce4e04..e6cd2c489 100644 --- a/kernel/arch/x86/include/asm/sigcontext.h +++ b/kernel/arch/x86/include/asm/sigcontext.h @@ -1,79 +1,8 @@ #ifndef _ASM_X86_SIGCONTEXT_H #define _ASM_X86_SIGCONTEXT_H -#include - -#ifdef __i386__ -struct sigcontext { - unsigned short gs, __gsh; - unsigned short fs, __fsh; - unsigned short es, __esh; - unsigned short ds, __dsh; - unsigned long di; - unsigned long si; - unsigned long bp; - unsigned long sp; - unsigned long bx; - unsigned long dx; - unsigned long cx; - unsigned long ax; - unsigned long trapno; - unsigned long err; - unsigned long ip; - unsigned short cs, __csh; - unsigned long flags; - unsigned long sp_at_signal; - unsigned short ss, __ssh; +/* This is a legacy header - all kernel code includes directly. */ - /* - * fpstate is really (struct _fpstate *) or (struct _xstate *) - * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved - * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. 
See comments at the definition of - * (struct _fpx_sw_bytes) - */ - void __user *fpstate; /* zero when no FPU/extended context */ - unsigned long oldmask; - unsigned long cr2; -}; -#else /* __i386__ */ -struct sigcontext { - unsigned long r8; - unsigned long r9; - unsigned long r10; - unsigned long r11; - unsigned long r12; - unsigned long r13; - unsigned long r14; - unsigned long r15; - unsigned long di; - unsigned long si; - unsigned long bp; - unsigned long bx; - unsigned long dx; - unsigned long ax; - unsigned long cx; - unsigned long sp; - unsigned long ip; - unsigned long flags; - unsigned short cs; - unsigned short gs; - unsigned short fs; - unsigned short __pad0; - unsigned long err; - unsigned long trapno; - unsigned long oldmask; - unsigned long cr2; +#include - /* - * fpstate is really (struct _fpstate *) or (struct _xstate *) - * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved - * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end - * of extended memory layout. See comments at the definition of - * (struct _fpx_sw_bytes) - */ - void __user *fpstate; /* zero when no FPU/extended context */ - unsigned long reserved1[8]; -}; -#endif /* !__i386__ */ #endif /* _ASM_X86_SIGCONTEXT_H */ diff --git a/kernel/arch/x86/include/asm/sigframe.h b/kernel/arch/x86/include/asm/sigframe.h index 7c7c27c97..34edd1650 100644 --- a/kernel/arch/x86/include/asm/sigframe.h +++ b/kernel/arch/x86/include/asm/sigframe.h @@ -1,15 +1,14 @@ #ifndef _ASM_X86_SIGFRAME_H #define _ASM_X86_SIGFRAME_H -#include +#include #include #include +#include #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe #define rt_sigframe_ia32 rt_sigframe -#define sigcontext_ia32 sigcontext -#define _fpstate_ia32 _fpstate #define ucontext_ia32 ucontext #else /* !CONFIG_X86_32 */ @@ -23,7 +22,7 @@ struct sigframe_ia32 { u32 pretcode; int sig; - struct sigcontext_ia32 sc; + struct sigcontext_32 sc; /* * fpstate is unused. fpstate is moved/allocated after * retcode[] below. This movement allows to have the FP state and the @@ -32,7 +31,7 @@ struct sigframe_ia32 { * the offset of extramask[] in the sigframe and thus prevent any * legacy application accessing/modifying it. */ - struct _fpstate_ia32 fpstate_unused; + struct _fpstate_32 fpstate_unused; #ifdef CONFIG_IA32_EMULATION unsigned int extramask[_COMPAT_NSIG_WORDS-1]; #else /* !CONFIG_IA32_EMULATION */ @@ -69,6 +68,15 @@ struct rt_sigframe { #ifdef CONFIG_X86_X32_ABI +struct ucontext_x32 { + unsigned int uc_flags; + unsigned int uc_link; + compat_stack_t uc_stack; + unsigned int uc__pad0; /* needed for alignment */ + struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ + compat_sigset_t uc_sigmask; /* mask last for extensibility */ +}; + struct rt_sigframe_x32 { u64 pretcode; struct ucontext_x32 uc; diff --git a/kernel/arch/x86/include/asm/signal.h b/kernel/arch/x86/include/asm/signal.h index b1b08a28c..3f5b4ee2e 100644 --- a/kernel/arch/x86/include/asm/signal.h +++ b/kernel/arch/x86/include/asm/signal.h @@ -32,7 +32,7 @@ typedef struct { * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the * trap. 
*/ -#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64) +#if defined(CONFIG_PREEMPT_RT_FULL) #define ARCH_RT_DELAYS_SIGNAL_SEND #endif @@ -43,11 +43,11 @@ typedef sigset_t compat_sigset_t; #endif /* __ASSEMBLY__ */ #include #ifndef __ASSEMBLY__ -extern void do_notify_resume(struct pt_regs *, void *, __u32); +extern void do_signal(struct pt_regs *regs); #define __ARCH_HAS_SA_RESTORER -#include +#include #ifdef __i386__ diff --git a/kernel/arch/x86/include/asm/simd.h b/kernel/arch/x86/include/asm/simd.h index ee80b92f0..6c8a7ed13 100644 --- a/kernel/arch/x86/include/asm/simd.h +++ b/kernel/arch/x86/include/asm/simd.h @@ -1,5 +1,5 @@ -#include +#include /* * may_use_simd - whether it is allowable at this time to issue SIMD diff --git a/kernel/arch/x86/include/asm/smp.h b/kernel/arch/x86/include/asm/smp.h index 17a8dced1..222a6a3ca 100644 --- a/kernel/arch/x86/include/asm/smp.h +++ b/kernel/arch/x86/include/asm/smp.h @@ -37,16 +37,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); -static inline struct cpumask *cpu_sibling_mask(int cpu) -{ - return per_cpu(cpu_sibling_map, cpu); -} - -static inline struct cpumask *cpu_core_mask(int cpu) -{ - return per_cpu(cpu_core_map, cpu); -} - static inline struct cpumask *cpu_llc_shared_mask(int cpu) { return per_cpu(cpu_llc_shared_map, cpu); diff --git a/kernel/arch/x86/include/asm/special_insns.h b/kernel/arch/x86/include/asm/special_insns.h index aeb4666e0..2270e41b3 100644 --- a/kernel/arch/x86/include/asm/special_insns.h +++ b/kernel/arch/x86/include/asm/special_insns.h @@ -215,6 +215,44 @@ static inline void clwb(volatile void *__p) : [pax] "a" (p)); } +/** + * pcommit_sfence() - persistent commit and fence + * + * The PCOMMIT instruction ensures that data that has been flushed from the + * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to + * memory and is durable on the DIMM. The primary use case for this is + * persistent memory. + * + * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT + * with appropriate fencing. + * + * Example: + * void flush_and_commit_buffer(void *vaddr, unsigned int size) + * { + * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + * void *vend = vaddr + size; + * void *p; + * + * for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + * p < vend; p += boot_cpu_data.x86_clflush_size) + * clwb(p); + * + * // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes + * // MFENCE via mb() also works + * wmb(); + * + * // PCOMMIT and the required SFENCE for ordering + * pcommit_sfence(); + * } + * + * After this function completes the data pointed to by 'vaddr' has been + * accepted to memory and will be durable if the 'vaddr' points to persistent + * memory. + * + * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify + * things we include both the PCOMMIT and the required SFENCE in the + * alternatives generated by pcommit_sfence(). 
+ */ static inline void pcommit_sfence(void) { alternative(ASM_NOP7, diff --git a/kernel/arch/x86/include/asm/spinlock.h b/kernel/arch/x86/include/asm/spinlock.h index 64b611782..be0a05913 100644 --- a/kernel/arch/x86/include/asm/spinlock.h +++ b/kernel/arch/x86/include/asm/spinlock.h @@ -42,6 +42,10 @@ extern struct static_key paravirt_ticketlocks_enabled; static __always_inline bool static_key_false(struct static_key *key); +#ifdef CONFIG_QUEUED_SPINLOCKS +#include +#else + #ifdef CONFIG_PARAVIRT_SPINLOCKS static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) @@ -196,6 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) cpu_relax(); } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ /* * Read-write spinlocks, allowing multiple readers diff --git a/kernel/arch/x86/include/asm/spinlock_types.h b/kernel/arch/x86/include/asm/spinlock_types.h index 5f9d7572d..65c3e37f8 100644 --- a/kernel/arch/x86/include/asm/spinlock_types.h +++ b/kernel/arch/x86/include/asm/spinlock_types.h @@ -23,6 +23,9 @@ typedef u32 __ticketpair_t; #define TICKET_SHIFT (sizeof(__ticket_t) * 8) +#ifdef CONFIG_QUEUED_SPINLOCKS +#include +#else typedef struct arch_spinlock { union { __ticketpair_t head_tail; @@ -33,6 +36,7 @@ typedef struct arch_spinlock { } arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ #include diff --git a/kernel/arch/x86/include/asm/stackprotector.h b/kernel/arch/x86/include/asm/stackprotector.h index 64fb5cbe5..02fa39652 100644 --- a/kernel/arch/x86/include/asm/stackprotector.h +++ b/kernel/arch/x86/include/asm/stackprotector.h @@ -39,7 +39,9 @@ #include #include #include + #include +#include /* * 24 byte read-only segment initializer for stack canary. Linker @@ -72,13 +74,12 @@ static __always_inline void boot_init_stack_canary(void) * For preempt-rt we need to weaken the randomness a bit, as * we can't call into the random generator from atomic context * due to locking constraints. We just leave canary - * uninitialized and use the TSC based randomness on top of - * it. + * uninitialized and use the TSC based randomness on top of it. */ #ifndef CONFIG_PREEMPT_RT_FULL get_random_bytes(&canary, sizeof(canary)); #endif - tsc = __native_read_tsc(); + tsc = rdtsc(); canary += tsc + (tsc << 32UL); current->stack_canary = canary; diff --git a/kernel/arch/x86/include/asm/string_64.h b/kernel/arch/x86/include/asm/string_64.h index e46611969..ff8b9a17d 100644 --- a/kernel/arch/x86/include/asm/string_64.h +++ b/kernel/arch/x86/include/asm/string_64.h @@ -27,12 +27,11 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t function. 
*/ #define __HAVE_ARCH_MEMCPY 1 +extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #ifndef CONFIG_KMEMCHECK -#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 -extern void *memcpy(void *to, const void *from, size_t len); -#else +#if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ ({ \ size_t __len = (len); \ diff --git a/kernel/arch/x86/include/asm/suspend_32.h b/kernel/arch/x86/include/asm/suspend_32.h index 552d6c90a..d1793f068 100644 --- a/kernel/arch/x86/include/asm/suspend_32.h +++ b/kernel/arch/x86/include/asm/suspend_32.h @@ -7,7 +7,7 @@ #define _ASM_X86_SUSPEND_32_H #include -#include +#include /* image of the saved processor state */ struct saved_context { diff --git a/kernel/arch/x86/include/asm/suspend_64.h b/kernel/arch/x86/include/asm/suspend_64.h index bc6232834..7ebf0ebe4 100644 --- a/kernel/arch/x86/include/asm/suspend_64.h +++ b/kernel/arch/x86/include/asm/suspend_64.h @@ -7,7 +7,7 @@ #define _ASM_X86_SUSPEND_64_H #include -#include +#include /* * Image of the saved processor state, used by the low level ACPI suspend to diff --git a/kernel/arch/x86/include/asm/syscall.h b/kernel/arch/x86/include/asm/syscall.h index d6a756ae0..999b7cd2e 100644 --- a/kernel/arch/x86/include/asm/syscall.h +++ b/kernel/arch/x86/include/asm/syscall.h @@ -20,9 +20,21 @@ #include /* for TS_COMPAT */ #include -typedef void (*sys_call_ptr_t)(void); +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); extern const sys_call_ptr_t sys_call_table[]; +#if defined(CONFIG_X86_32) +#define ia32_sys_call_table sys_call_table +#define __NR_syscall_compat_max __NR_syscall_max +#define IA32_NR_syscalls NR_syscalls +#endif + +#if defined(CONFIG_IA32_EMULATION) +extern const sys_call_ptr_t ia32_sys_call_table[]; +#endif + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons diff --git a/kernel/arch/x86/include/asm/syscalls.h b/kernel/arch/x86/include/asm/syscalls.h index 592a6a672..91dfcafe2 100644 --- a/kernel/arch/x86/include/asm/syscalls.h +++ b/kernel/arch/x86/include/asm/syscalls.h @@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *); asmlinkage unsigned long sys_sigreturn(void); /* kernel/vm86_32.c */ +struct vm86_struct; asmlinkage long sys_vm86old(struct vm86_struct __user *); asmlinkage long sys_vm86(unsigned long, unsigned long); diff --git a/kernel/arch/x86/include/asm/thread_info.h b/kernel/arch/x86/include/asm/thread_info.h index 606144afb..ddb63bd90 100644 --- a/kernel/arch/x86/include/asm/thread_info.h +++ b/kernel/arch/x86/include/asm/thread_info.h @@ -27,14 +27,17 @@ * Without this offset, that can result in a page fault. (We are * careful that, in this case, the value we read doesn't matter.) * - * In vm86 mode, the hardware frame is much longer still, but we neither - * access the extra members from NMI context, nor do we write such a - * frame at sp0 at all. + * In vm86 mode, the hardware frame is much longer still, so add 16 + * bytes to make room for the real-mode segments. * * x86_64 has a fixed-length stack frame. 
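[Illustrative aside, not part of the patch: a rough sketch of how the padding described above enters a stack-top calculation. The function name is invented; the kernel expresses this via TOP_OF_INIT_STACK and the sp0 setup in processor.h.]

/* Sketch: the usable top of a kernel stack sits below the padding. */
static unsigned long example_top_of_stack(void *stack_page)
{
	return (unsigned long)stack_page + THREAD_SIZE
		- TOP_OF_KERNEL_STACK_PADDING;
}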
*/ #ifdef CONFIG_X86_32 -# define TOP_OF_KERNEL_STACK_PADDING 8 +# ifdef CONFIG_VM86 +# define TOP_OF_KERNEL_STACK_PADDING 16 +# else +# define TOP_OF_KERNEL_STACK_PADDING 8 +# endif #else # define TOP_OF_KERNEL_STACK_PADDING 0 #endif @@ -54,11 +57,9 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - int saved_preempt_count; - int preempt_lazy_count; /* 0 => lazy preemptable - <0 => BUG */ mm_segment_t addr_limit; - void __user *sysenter_return; + int preempt_lazy_count; /* 0 => lazy preemptable + <0 => BUG */ unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ }; @@ -68,7 +69,6 @@ struct thread_info { .task = &tsk, \ .flags = 0, \ .cpu = 0, \ - .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ } @@ -144,27 +144,11 @@ struct thread_info { _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) -/* work to do in syscall_trace_leave() */ -#define _TIF_WORK_SYSCALL_EXIT \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ) - -/* work to do on interrupt/exception return */ -#define _TIF_WORK_MASK \ - (0x0000FFFF & \ - ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \ - _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) - /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) -/* Only used for 64 bit */ -#define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ - _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) - /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) @@ -183,8 +167,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -DECLARE_PER_CPU(unsigned long, kernel_stack); - static inline struct thread_info *current_thread_info(void) { return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); @@ -203,9 +185,13 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ +#ifdef CONFIG_X86_64 +# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) +#endif + /* Load thread_info address into "reg" */ #define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ _ASM_SUB $(THREAD_SIZE),reg ; /* diff --git a/kernel/arch/x86/include/asm/tlbflush.h b/kernel/arch/x86/include/asm/tlbflush.h index cd791948b..6df202940 100644 --- a/kernel/arch/x86/include/asm/tlbflush.h +++ b/kernel/arch/x86/include/asm/tlbflush.h @@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void) #endif /* SMP */ +/* Not inlined due to inc_irq_stat not being defined yet */ +#define flush_tlb_local() { \ + inc_irq_stat(irq_tlb_count); \ + local_flush_tlb(); \ +} + #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, start, end) \ native_flush_tlb_others(mask, mm, start, end) diff --git a/kernel/arch/x86/include/asm/topology.h b/kernel/arch/x86/include/asm/topology.h index 0e8f04f2c..0fb46482d 100644 --- a/kernel/arch/x86/include/asm/topology.h +++ b/kernel/arch/x86/include/asm/topology.h @@ -26,7 +26,7 @@ #define _ASM_X86_TOPOLOGY_H #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT +# ifdef CONFIG_SMP # define ENABLE_TOPO_DEFINES # endif #else @@ -124,7 +124,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #ifdef ENABLE_TOPO_DEFINES #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) -#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) 
+#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/kernel/arch/x86/include/asm/trace/irq_vectors.h b/kernel/arch/x86/include/asm/trace/irq_vectors.h index 4cab89000..38a09a13a 100644 --- a/kernel/arch/x86/include/asm/trace/irq_vectors.h +++ b/kernel/arch/x86/include/asm/trace/irq_vectors.h @@ -100,6 +100,12 @@ DEFINE_IRQ_VECTOR_EVENT(call_function_single); */ DEFINE_IRQ_VECTOR_EVENT(threshold_apic); +/* + * deferred_error_apic - called when entering/exiting a deferred apic interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); + /* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler diff --git a/kernel/arch/x86/include/asm/trace/mpx.h b/kernel/arch/x86/include/asm/trace/mpx.h new file mode 100644 index 000000000..0f492fc50 --- /dev/null +++ b/kernel/arch/x86/include/asm/trace/mpx.h @@ -0,0 +1,133 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mpx + +#if !defined(_TRACE_MPX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MPX_H + +#include + +#ifdef CONFIG_X86_INTEL_MPX + +TRACE_EVENT(mpx_bounds_register_exception, + + TP_PROTO(void *addr_referenced, + const struct mpx_bndreg *bndreg), + TP_ARGS(addr_referenced, bndreg), + + TP_STRUCT__entry( + __field(void *, addr_referenced) + __field(u64, lower_bound) + __field(u64, upper_bound) + ), + + TP_fast_assign( + __entry->addr_referenced = addr_referenced; + __entry->lower_bound = bndreg->lower_bound; + __entry->upper_bound = bndreg->upper_bound; + ), + /* + * Note that we are printing out the '~' of the upper + * bounds register here. It is actually stored in its + * one's complement form so that its 'init' state + * corresponds to all 0's. But, that looks like + * gibberish when printed out, so print out the 1's + * complement instead of the actual value here. Note + * though that you still need to specify filters for the + * actual value, not the displayed one. 
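[Illustrative aside, not part of the patch: a tiny standalone program showing the one's-complement display the comment above describes. The struct merely mirrors the bndreg layout used by this tracepoint.]

#include <stdint.h>
#include <stdio.h>

/* Sketch: an 'init' MPX bounds register is stored as all zeroes, so the
 * architectural upper bound is the one's complement of the stored value. */
struct bndreg_example {
	uint64_t lower_bound;
	uint64_t upper_bound;	/* kept in one's-complement form */
};

int main(void)
{
	struct bndreg_example init = { 0, 0 };

	/* Matches the TP_printk() below: ~upper is what gets displayed. */
	printf("lower: 0x%llx ~upper: 0x%llx\n",
	       (unsigned long long)init.lower_bound,
	       (unsigned long long)~init.upper_bound);
	return 0;
}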
+ */ + TP_printk("address referenced: 0x%p bounds: lower: 0x%llx ~upper: 0x%llx", + __entry->addr_referenced, + __entry->lower_bound, + ~__entry->upper_bound + ) +); + +TRACE_EVENT(bounds_exception_mpx, + + TP_PROTO(const struct mpx_bndcsr *bndcsr), + TP_ARGS(bndcsr), + + TP_STRUCT__entry( + __field(u64, bndcfgu) + __field(u64, bndstatus) + ), + + TP_fast_assign( + /* need to get rid of the 'const' on bndcsr */ + __entry->bndcfgu = (u64)bndcsr->bndcfgu; + __entry->bndstatus = (u64)bndcsr->bndstatus; + ), + + TP_printk("bndcfgu:0x%llx bndstatus:0x%llx", + __entry->bndcfgu, + __entry->bndstatus) +); + +DECLARE_EVENT_CLASS(mpx_range_trace, + + TP_PROTO(unsigned long start, + unsigned long end), + TP_ARGS(start, end), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + ), + + TP_printk("[0x%p:0x%p]", + (void *)__entry->start, + (void *)__entry->end + ) +); + +DEFINE_EVENT(mpx_range_trace, mpx_unmap_zap, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end) +); + +DEFINE_EVENT(mpx_range_trace, mpx_unmap_search, + TP_PROTO(unsigned long start, unsigned long end), + TP_ARGS(start, end) +); + +TRACE_EVENT(mpx_new_bounds_table, + + TP_PROTO(unsigned long table_vaddr), + TP_ARGS(table_vaddr), + + TP_STRUCT__entry( + __field(unsigned long, table_vaddr) + ), + + TP_fast_assign( + __entry->table_vaddr = table_vaddr; + ), + + TP_printk("table vaddr:%p", (void *)__entry->table_vaddr) +); + +#else + +/* + * This gets used outside of MPX-specific code, so we need a stub. + */ +static inline +void trace_bounds_exception_mpx(const struct mpx_bndcsr *bndcsr) +{ +} + +#endif /* CONFIG_X86_INTEL_MPX */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/trace/ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE mpx +#endif /* _TRACE_MPX_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/arch/x86/include/asm/traps.h b/kernel/arch/x86/include/asm/traps.h index 4e49d7dff..c34966197 100644 --- a/kernel/arch/x86/include/asm/traps.h +++ b/kernel/arch/x86/include/asm/traps.h @@ -108,11 +108,12 @@ extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); -asmlinkage void mce_threshold_interrupt(void); +asmlinkage void smp_threshold_interrupt(void); +asmlinkage void smp_deferred_error_interrupt(void); #endif -extern enum ctx_state ist_enter(struct pt_regs *regs); -extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); +extern void ist_enter(struct pt_regs *regs); +extern void ist_exit(struct pt_regs *regs); extern void ist_begin_non_atomic(struct pt_regs *regs); extern void ist_end_non_atomic(void); diff --git a/kernel/arch/x86/include/asm/tsc.h b/kernel/arch/x86/include/asm/tsc.h index 94605c0e9..6d7c5479b 100644 --- a/kernel/arch/x86/include/asm/tsc.h +++ b/kernel/arch/x86/include/asm/tsc.h @@ -21,28 +21,12 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { - unsigned long long ret = 0; - #ifndef CONFIG_X86_TSC if (!cpu_has_tsc) return 0; #endif - rdtscll(ret); - - return ret; -} -static __always_inline cycles_t vget_cycles(void) -{ - /* - * We only do VDSOs on TSC capable CPUs, so this shouldn't - * access boot_cpu_data (which is not VDSO-safe): - */ -#ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) - return 0; -#endif - return (cycles_t)__native_read_tsc(); + return rdtsc(); } extern void tsc_init(void); @@ -51,6 +35,7 @@ extern 
int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern int check_tsc_disabled(void); extern unsigned long native_calibrate_tsc(void); +extern unsigned long long native_sched_clock_from_tsc(u64 tsc); extern int tsc_clocksource_reliable; diff --git a/kernel/arch/x86/include/asm/uaccess.h b/kernel/arch/x86/include/asm/uaccess.h index a8df874f3..09b1b0ab9 100644 --- a/kernel/arch/x86/include/asm/uaccess.h +++ b/kernel/arch/x86/include/asm/uaccess.h @@ -51,13 +51,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * limit, not add it to the address). */ if (__builtin_constant_p(size)) - return addr > limit - size; + return unlikely(addr > limit - size); /* Arbitrary sizes? Be careful about overflow */ addr += size; - if (addr < size) + if (unlikely(addr < size)) return true; - return addr > limit; + return unlikely(addr > limit); } #define __range_not_ok(addr, size, limit) \ @@ -182,7 +182,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "=a" (__ret_gu), "=r" (__val_gu) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ - __ret_gu; \ + __builtin_expect(__ret_gu, 0); \ }) #define __put_user_x(size, x, ptr, __ret_pu) \ @@ -278,7 +278,7 @@ extern void __put_user_8(void); __put_user_x(X, __pu_val, ptr, __ret_pu); \ break; \ } \ - __ret_pu; \ + __builtin_expect(__ret_pu, 0); \ }) #define __put_user_size(x, ptr, size, retval, errret) \ @@ -401,7 +401,7 @@ do { \ ({ \ int __pu_err; \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ - __pu_err; \ + __builtin_expect(__pu_err, 0); \ }) #define __get_user_nocheck(x, ptr, size) \ @@ -410,7 +410,7 @@ do { \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __gu_err; \ + __builtin_expect(__gu_err, 0); \ }) /* FIXME: this hack is definitely wrong -AK */ diff --git a/kernel/arch/x86/include/asm/uaccess_32.h b/kernel/arch/x86/include/asm/uaccess_32.h index 7c8ad3451..f5dcb5204 100644 --- a/kernel/arch/x86/include/asm/uaccess_32.h +++ b/kernel/arch/x86/include/asm/uaccess_32.h @@ -59,6 +59,10 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) __put_user_size(*(u32 *)from, (u32 __user *)to, 4, ret, 4); return ret; + case 8: + __put_user_size(*(u64 *)from, (u64 __user *)to, + 8, ret, 8); + return ret; } } return __copy_to_user_ll(to, from, n); diff --git a/kernel/arch/x86/include/asm/user.h b/kernel/arch/x86/include/asm/user.h index ccab4af16..59a54e869 100644 --- a/kernel/arch/x86/include/asm/user.h +++ b/kernel/arch/x86/include/asm/user.h @@ -14,8 +14,8 @@ struct user_ymmh_regs { __u32 ymmh_space[64]; }; -struct user_xsave_hdr { - __u64 xstate_bv; +struct user_xstate_header { + __u64 xfeatures; __u64 reserved1[2]; __u64 reserved2[5]; }; @@ -41,11 +41,11 @@ struct user_xsave_hdr { * particular process/thread. * * Also when the user modifies certain state FP/SSE/etc through the - * ptrace interface, they must ensure that the xsave_hdr.xstate_bv + * ptrace interface, they must ensure that the header.xfeatures * bytes[512..519] of the memory layout are updated correspondingly. * i.e., for example when FP state is modified to a non-init state, - * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to - * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc. 
+ * header.xfeatures's bit 0 must be set to '1', when SSE is modified to + * non-init state, header.xfeatures's bit 1 must to be set to '1', etc. */ #define USER_XSTATE_FX_SW_WORDS 6 #define USER_XSTATE_XCR0_WORD 0 @@ -55,7 +55,7 @@ struct user_xstateregs { __u64 fpx_space[58]; __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS]; } i387; - struct user_xsave_hdr xsave_hdr; + struct user_xstate_header header; struct user_ymmh_regs ymmh; /* further processor state extensions go here */ }; diff --git a/kernel/arch/x86/include/asm/uv/uv_hub.h b/kernel/arch/x86/include/asm/uv/uv_hub.h index c2729abe0..01ec643ce 100644 --- a/kernel/arch/x86/include/asm/uv/uv_hub.h +++ b/kernel/arch/x86/include/asm/uv/uv_hub.h @@ -609,7 +609,7 @@ struct uv_cpu_nmi_s { DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi); -#define uv_hub_nmi (uv_cpu_nmi.hub) +#define uv_hub_nmi this_cpu_read(uv_cpu_nmi.hub) #define uv_cpu_nmi_per(cpu) (per_cpu(uv_cpu_nmi, cpu)) #define uv_hub_nmi_per(cpu) (uv_cpu_nmi_per(cpu).hub) diff --git a/kernel/arch/x86/include/asm/vdso.h b/kernel/arch/x86/include/asm/vdso.h index 8021bd28c..756de9190 100644 --- a/kernel/arch/x86/include/asm/vdso.h +++ b/kernel/arch/x86/include/asm/vdso.h @@ -26,7 +26,7 @@ struct vdso_image { long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; - long sym_VDSO32_SYSENTER_RETURN; + long sym_int80_landing_pad; }; #ifdef CONFIG_X86_64 @@ -38,13 +38,7 @@ extern const struct vdso_image vdso_image_x32; #endif #if defined CONFIG_X86_32 || defined CONFIG_COMPAT -extern const struct vdso_image vdso_image_32_int80; -#ifdef CONFIG_COMPAT -extern const struct vdso_image vdso_image_32_syscall; -#endif -extern const struct vdso_image vdso_image_32_sysenter; - -extern const struct vdso_image *selected_vdso32; +extern const struct vdso_image vdso_image_32; #endif extern void __init init_vdso_image(const struct vdso_image *image); diff --git a/kernel/arch/x86/include/asm/vm86.h b/kernel/arch/x86/include/asm/vm86.h index 1d8de3f3f..1e491f3af 100644 --- a/kernel/arch/x86/include/asm/vm86.h +++ b/kernel/arch/x86/include/asm/vm86.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_VM86_H #define _ASM_X86_VM86_H - #include #include @@ -28,43 +27,49 @@ struct kernel_vm86_regs { unsigned short gs, __gsh; }; -struct kernel_vm86_struct { - struct kernel_vm86_regs regs; -/* - * the below part remains on the kernel stack while we are in VM86 mode. - * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we - * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above - * 'struct kernel_vm86_regs' with the then actual values. - * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct' - * in kernelspace, hence we need not reget the data from userspace. - */ -#define VM86_TSS_ESP0 flags +struct vm86 { + struct vm86plus_struct __user *user_vm86; + struct pt_regs regs32; + unsigned long veflags; + unsigned long veflags_mask; + unsigned long saved_sp0; + unsigned long flags; unsigned long screen_bitmap; unsigned long cpu_type; struct revectored_struct int_revectored; struct revectored_struct int21_revectored; struct vm86plus_info_struct vm86plus; - struct pt_regs *regs32; /* here we save the pointer to the old regs */ -/* - * The below is not part of the structure, but the stack layout continues - * this way. In front of 'return-eip' may be some data, depending on - * compilation, so we don't rely on this and save the pointer to 'oldregs' - * in 'regs32' above. 
- * However, with GCC-2.7.2 and the current CFLAGS you see exactly this: - - long return-eip; from call to vm86() - struct pt_regs oldregs; user space registers as saved by syscall - */ }; #ifdef CONFIG_VM86 void handle_vm86_fault(struct kernel_vm86_regs *, long); int handle_vm86_trap(struct kernel_vm86_regs *, long, int); -struct pt_regs *save_v86_state(struct kernel_vm86_regs *); +void save_v86_state(struct kernel_vm86_regs *, int); struct task_struct; + +#define free_vm86(t) do { \ + struct thread_struct *__t = (t); \ + if (__t->vm86 != NULL) { \ + kfree(__t->vm86); \ + __t->vm86 = NULL; \ + } \ +} while (0) + +/* + * Support for VM86 programs to request interrupts for + * real mode hardware drivers: + */ +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 + +static inline int invalid_vm86_irq(int irq) +{ + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; +} + void release_vm86_irqs(struct task_struct *); #else @@ -77,6 +82,10 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c) return 0; } +static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { } + +#define free_vm86(t) do { } while(0) + #endif /* CONFIG_VM86 */ #endif /* _ASM_X86_VM86_H */ diff --git a/kernel/arch/x86/include/asm/vmx.h b/kernel/arch/x86/include/asm/vmx.h index da772edd1..14c63c7e8 100644 --- a/kernel/arch/x86/include/asm/vmx.h +++ b/kernel/arch/x86/include/asm/vmx.h @@ -47,6 +47,7 @@ #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 +#define CPU_BASED_MONITOR_TRAP_FLAG 0x08000000 #define CPU_BASED_USE_MSR_BITMAPS 0x10000000 #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 @@ -71,7 +72,8 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 - +#define SECONDARY_EXEC_PCOMMIT 0x00200000 +#define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 @@ -166,6 +168,8 @@ enum vmcs_field { VMWRITE_BITMAP = 0x00002028, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, + TSC_MULTIPLIER = 0x00002032, + TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, @@ -367,29 +371,29 @@ enum vmcs_field { #define TYPE_PHYSICAL_APIC_EVENT (10 << 12) #define TYPE_PHYSICAL_APIC_INST (15 << 12) -/* segment AR */ -#define SEGMENT_AR_L_MASK (1 << 13) - -#define AR_TYPE_ACCESSES_MASK 1 -#define AR_TYPE_READABLE_MASK (1 << 1) -#define AR_TYPE_WRITEABLE_MASK (1 << 2) -#define AR_TYPE_CODE_MASK (1 << 3) -#define AR_TYPE_MASK 0x0f -#define AR_TYPE_BUSY_64_TSS 11 -#define AR_TYPE_BUSY_32_TSS 11 -#define AR_TYPE_BUSY_16_TSS 3 -#define AR_TYPE_LDT 2 - -#define AR_UNUSABLE_MASK (1 << 16) -#define AR_S_MASK (1 << 4) -#define AR_P_MASK (1 << 7) -#define AR_L_MASK (1 << 13) -#define AR_DB_MASK (1 << 14) -#define AR_G_MASK (1 << 15) -#define AR_DPL_SHIFT 5 -#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3) - -#define AR_RESERVD_MASK 0xfffe0f00 +/* segment AR in VMCS -- these are different from what LAR reports */ +#define VMX_SEGMENT_AR_L_MASK (1 << 13) + +#define VMX_AR_TYPE_ACCESSES_MASK 1 +#define VMX_AR_TYPE_READABLE_MASK (1 << 1) +#define VMX_AR_TYPE_WRITEABLE_MASK (1 << 2) +#define VMX_AR_TYPE_CODE_MASK (1 << 3) +#define VMX_AR_TYPE_MASK 0x0f +#define VMX_AR_TYPE_BUSY_64_TSS 11 +#define VMX_AR_TYPE_BUSY_32_TSS 11 +#define 
VMX_AR_TYPE_BUSY_16_TSS 3 +#define VMX_AR_TYPE_LDT 2 + +#define VMX_AR_UNUSABLE_MASK (1 << 16) +#define VMX_AR_S_MASK (1 << 4) +#define VMX_AR_P_MASK (1 << 7) +#define VMX_AR_L_MASK (1 << 13) +#define VMX_AR_DB_MASK (1 << 14) +#define VMX_AR_G_MASK (1 << 15) +#define VMX_AR_DPL_SHIFT 5 +#define VMX_AR_DPL(ar) (((ar) >> VMX_AR_DPL_SHIFT) & 3) + +#define VMX_AR_RESERVD_MASK 0xfffe0f00 #define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0) #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1) @@ -415,6 +419,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ diff --git a/kernel/arch/x86/include/asm/x86_init.h b/kernel/arch/x86/include/asm/x86_init.h index f58a9c7a3..cd0fc0cc7 100644 --- a/kernel/arch/x86/include/asm/x86_init.h +++ b/kernel/arch/x86/include/asm/x86_init.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H -#include #include struct mpc_bus; @@ -171,38 +170,17 @@ struct x86_platform_ops { }; struct pci_dev; -struct msi_msg; struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); - void (*compose_msi_msg)(struct pci_dev *dev, unsigned int irq, - unsigned int dest, struct msi_msg *msg, - u8 hpet_id); void (*teardown_msi_irq)(unsigned int irq); void (*teardown_msi_irqs)(struct pci_dev *dev); void (*restore_msi_irqs)(struct pci_dev *dev); - int (*setup_hpet_msi)(unsigned int irq, unsigned int id); }; -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_data; -struct cpumask; - struct x86_io_apic_ops { - void (*init) (void); unsigned int (*read) (unsigned int apic, unsigned int reg); - void (*write) (unsigned int apic, unsigned int reg, unsigned int value); - void (*modify) (unsigned int apic, unsigned int reg, unsigned int value); void (*disable)(void); - void (*print_entries)(unsigned int apic, unsigned int nr_entries); - int (*set_affinity)(struct irq_data *data, - const struct cpumask *mask, - bool force); - int (*setup_entry)(int irq, struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr); - void (*eoi_ioapic_pin)(int apic, int pin, int vector); }; extern struct x86_init_ops x86_init; diff --git a/kernel/arch/x86/include/asm/xcr.h b/kernel/arch/x86/include/asm/xcr.h deleted file mode 100644 index f2cba4e79..000000000 --- a/kernel/arch/x86/include/asm/xcr.h +++ /dev/null @@ -1,49 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2008 rPath, Inc. - All Rights Reserved - * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2 or (at your - * option) any later version; incorporated herein by reference. 
- * - * ----------------------------------------------------------------------- */ - -/* - * asm-x86/xcr.h - * - * Definitions for the eXtended Control Register instructions - */ - -#ifndef _ASM_X86_XCR_H -#define _ASM_X86_XCR_H - -#define XCR_XFEATURE_ENABLED_MASK 0x00000000 - -#ifdef __KERNEL__ -# ifndef __ASSEMBLY__ - -#include - -static inline u64 xgetbv(u32 index) -{ - u32 eax, edx; - - asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ - : "=a" (eax), "=d" (edx) - : "c" (index)); - return eax + ((u64)edx << 32); -} - -static inline void xsetbv(u32 index, u64 value) -{ - u32 eax = value; - u32 edx = value >> 32; - - asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ - : : "a" (eax), "d" (edx), "c" (index)); -} - -# endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_XCR_H */ diff --git a/kernel/arch/x86/include/asm/xen/events.h b/kernel/arch/x86/include/asm/xen/events.h index 608a79d5a..e6911caf5 100644 --- a/kernel/arch/x86/include/asm/xen/events.h +++ b/kernel/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ +static inline bool xen_support_evtchn_rebind(void) +{ + return (!xen_hvm_domain() || xen_have_vector_callback); +} + #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/kernel/arch/x86/include/asm/xen/hypercall.h b/kernel/arch/x86/include/asm/xen/hypercall.h index ca08a27b9..4c20dd333 100644 --- a/kernel/arch/x86/include/asm/xen/hypercall.h +++ b/kernel/arch/x86/include/asm/xen/hypercall.h @@ -336,10 +336,10 @@ HYPERVISOR_update_descriptor(u64 ma, u64 desc) return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); } -static inline int +static inline long HYPERVISOR_memory_op(unsigned int cmd, void *arg) { - return _hypercall2(int, memory_op, cmd, arg); + return _hypercall2(long, memory_op, cmd, arg); } static inline int @@ -465,6 +465,12 @@ HYPERVISOR_tmem_op( return _hypercall1(int, tmem_op, op); } +static inline int +HYPERVISOR_xenpmu_op(unsigned int op, void *arg) +{ + return _hypercall2(int, xenpmu_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/kernel/arch/x86/include/asm/xen/hypervisor.h b/kernel/arch/x86/include/asm/xen/hypervisor.h index d866959e5..8b2d4bea9 100644 --- a/kernel/arch/x86/include/asm/xen/hypervisor.h +++ b/kernel/arch/x86/include/asm/xen/hypervisor.h @@ -57,4 +57,9 @@ static inline bool xen_x2apic_para_available(void) } #endif +#ifdef CONFIG_HOTPLUG_CPU +void xen_arch_register_cpu(int num); +void xen_arch_unregister_cpu(int num); +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/kernel/arch/x86/include/asm/xen/interface.h b/kernel/arch/x86/include/asm/xen/interface.h index 3400dbaec..62ca03ef5 100644 --- a/kernel/arch/x86/include/asm/xen/interface.h +++ b/kernel/arch/x86/include/asm/xen/interface.h @@ -3,12 +3,38 @@ * * Guest OS interface to x86 Xen. 
* - * Copyright (c) 2004, K A Fraser + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser */ #ifndef _ASM_X86_XEN_INTERFACE_H #define _ASM_X86_XEN_INTERFACE_H +/* + * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field + * in a struct in memory. + * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an + * hypercall argument. + * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but + * they might not be on other architectures. + */ #ifdef __XEN__ #define __DEFINE_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name @@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t); * start of the GDT because some stupid OSes export hard-coded selector values * in their ABI. These hard-coded values are always near the start of the GDT, * so Xen places itself out of the way, at the far end of the GDT. + * + * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op */ #define FIRST_RESERVED_GDT_PAGE 14 #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) /* - * Send an array of these to HYPERVISOR_set_trap_table() + * Send an array of these to HYPERVISOR_set_trap_table(). + * Terminate the array with a sentinel entry, with traps[].address==0. * The privilege level specifies which modes may enter a trap via a software * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate * privilege levels as follows: @@ -118,10 +147,41 @@ struct trap_info { DEFINE_GUEST_HANDLE_STRUCT(trap_info); struct arch_shared_info { - unsigned long max_pfn; /* max pfn that appears in table */ - /* Frame containing list of mfns containing list of mfns containing p2m. */ - unsigned long pfn_to_mfn_frame_list_list; - unsigned long nmi_reason; + /* + * Number of valid entries in the p2m table(s) anchored at + * pfn_to_mfn_frame_list_list and/or p2m_vaddr. + */ + unsigned long max_pfn; + /* + * Frame containing list of mfns containing list of mfns containing p2m. + * A value of 0 indicates it has not yet been set up, ~0 indicates it + * has been set to invalid e.g. due to the p2m being too large for the + * 3-level p2m tree. In this case the linear mapper p2m list anchored + * at p2m_vaddr is to be used. + */ + xen_pfn_t pfn_to_mfn_frame_list_list; + unsigned long nmi_reason; + /* + * Following three fields are valid if p2m_cr3 contains a value + * different from 0. 
+ * p2m_cr3 is the root of the address space where p2m_vaddr is valid. + * p2m_cr3 is in the same format as a cr3 value in the vcpu register + * state and holds the folded machine frame number (via xen_pfn_to_cr3) + * of a L3 or L4 page table. + * p2m_vaddr holds the virtual address of the linear p2m list. All + * entries in the range [0...max_pfn[ are accessible via this pointer. + * p2m_generation will be incremented by the guest before and after each + * change of the mappings of the p2m list. p2m_generation starts at 0 + * and a value with the least significant bit set indicates that a + * mapping update is in progress. This allows guest external software + * (e.g. in Dom0) to verify that read mappings are consistent and + * whether they have changed since the last check. + * Modifying a p2m element in the linear p2m list is allowed via an + * atomic write only. + */ + unsigned long p2m_cr3; /* cr3 value of the p2m address space */ + unsigned long p2m_vaddr; /* virtual address of the p2m list */ + unsigned long p2m_generation; /* generation count of p2m mapping */ }; #endif /* !__ASSEMBLY__ */ @@ -137,13 +197,31 @@ struct arch_shared_info { /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + * + * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise + * for HVM and PVH guests, not all information in this structure is updated: + * + * - For HVM guests, the structures read include: fpu_ctxt (if + * VGCT_I387_VALID is set), flags, user_regs, debugreg[*] + * + * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to + * set cr3. All other fields not used should be set to 0. */ struct vcpu_guest_context { /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ -#define VGCF_I387_VALID (1<<0) -#define VGCF_HVM_GUEST (1<<1) -#define VGCF_IN_KERNEL (1<<2) +#define VGCF_I387_VALID (1<<0) +#define VGCF_IN_KERNEL (1<<2) +#define _VGCF_i387_valid 0 +#define VGCF_i387_valid (1<<_VGCF_i387_valid) +#define _VGCF_in_kernel 2 +#define VGCF_in_kernel (1<<_VGCF_in_kernel) +#define _VGCF_failsafe_disables_events 3 +#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events) +#define _VGCF_syscall_disables_events 4 +#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events) +#define _VGCF_online 5 +#define VGCF_online (1<<_VGCF_online) unsigned long flags; /* VGCF_* flags */ struct cpu_user_regs user_regs; /* User-level CPU registers */ struct trap_info trap_ctxt[256]; /* Virtual IDT */ @@ -172,6 +250,129 @@ struct vcpu_guest_context { #endif }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); + +/* AMD PMU registers and structures */ +struct xen_pmu_amd_ctxt { + /* + * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd). + * For PV(H) guests these fields are RO. + */ + uint32_t counters; + uint32_t ctrls; + + /* Counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Intel PMU registers and structures */ +struct xen_pmu_cntr_pair { + uint64_t counter; + uint64_t control; +}; + +struct xen_pmu_intel_ctxt { + /* + * Offsets to fixed and architectural counter MSRs (relative to + * xen_pmu_arch.c.intel). + * For PV(H) guests these fields are RO. 
+ */ + uint32_t fixed_counters; + uint32_t arch_counters; + + /* PMU registers */ + uint64_t global_ctrl; + uint64_t global_ovf_ctrl; + uint64_t global_status; + uint64_t fixed_ctrl; + uint64_t ds_area; + uint64_t pebs_enable; + uint64_t debugctl; + + /* Fixed and architectural counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Sampled domain's registers */ +struct xen_pmu_regs { + uint64_t ip; + uint64_t sp; + uint64_t flags; + uint16_t cs; + uint16_t ss; + uint8_t cpl; + uint8_t pad[3]; +}; + +/* PMU flags */ +#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */ +#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */ +#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */ +#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */ + +/* + * Architecture-specific information describing state of the processor at + * the time of PMU interrupt. + * Fields of this structure marked as RW for guest should only be written by + * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the + * hypervisor during PMU interrupt). Hypervisor will read updated data in + * XENPMU_flush hypercall and clear PMU_CACHED bit. + */ +struct xen_pmu_arch { + union { + /* + * Processor's registers at the time of interrupt. + * WO for hypervisor, RO for guests. + */ + struct xen_pmu_regs regs; + /* + * Padding for adding new registers to xen_pmu_regs in + * the future + */ +#define XENPMU_REGS_PAD_SZ 64 + uint8_t pad[XENPMU_REGS_PAD_SZ]; + } r; + + /* WO for hypervisor, RO for guest */ + uint64_t pmu_flags; + + /* + * APIC LVTPC register. + * RW for both hypervisor and guest. + * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware + * during XENPMU_flush or XENPMU_lvtpc_set. + */ + union { + uint32_t lapic_lvtpc; + uint64_t pad; + } l; + + /* + * Vendor-specific PMU registers. + * RW for both hypervisor and guest (see exceptions above). 
+ * Guest's updates to this field are verified and then loaded by the + * hypervisor into hardware during XENPMU_flush + */ + union { + struct xen_pmu_amd_ctxt amd; + struct xen_pmu_intel_ctxt intel; + + /* + * Padding for contexts (fixed parts only, does not include + * MSR banks that are specified by offsets) + */ +#define XENPMU_CTXT_PAD_SZ 128 + uint8_t pad[XENPMU_CTXT_PAD_SZ]; + } c; +}; + #endif /* !__ASSEMBLY__ */ /* diff --git a/kernel/arch/x86/include/asm/xen/page.h b/kernel/arch/x86/include/asm/xen/page.h index c44a5d53e..f5fb840b4 100644 --- a/kernel/arch/x86/include/asm/xen/page.h +++ b/kernel/arch/x86/include/asm/xen/page.h @@ -12,7 +12,7 @@ #include #include -#include +#include #include /* Xen machine address */ @@ -35,9 +35,7 @@ typedef struct xpaddr { #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) -/* Maximum amount of memory we can handle in a domain in pages */ -#define MAX_DOMAIN_PAGES \ - ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) extern unsigned long *machine_to_phys_mapping; extern unsigned long machine_to_phys_nr; @@ -45,11 +43,13 @@ extern unsigned long *xen_p2m_addr; extern unsigned long xen_p2m_size; extern unsigned long xen_max_p2m_pfn; +extern int xen_alloc_p2m_entry(unsigned long pfn); + extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -extern unsigned long set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); +extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, @@ -103,6 +103,11 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn) { unsigned long mfn; + /* + * Some x86 code are still using pfn_to_mfn instead of + * pfn_to_mfn. This will have to be removed when we figured + * out which call. + */ if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; @@ -149,6 +154,11 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) { unsigned long pfn; + /* + * Some x86 code are still using mfn_to_pfn instead of + * gfn_to_pfn. This will have to be removed when we figure + * out which call. + */ if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; @@ -178,6 +188,27 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); } +/* Pseudo-physical <-> Guest conversion */ +static inline unsigned long pfn_to_gfn(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + else + return pfn_to_mfn(pfn); +} + +static inline unsigned long gfn_to_pfn(unsigned long gfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return gfn; + else + return mfn_to_pfn(gfn); +} + +/* Pseudo-physical <-> Bus conversion */ +#define pfn_to_bfn(pfn) pfn_to_gfn(pfn) +#define bfn_to_pfn(bfn) gfn_to_pfn(bfn) + /* * We detect special mappings in one of two ways: * 1. If the MFN is an I/O page then Xen will set the m2p entry @@ -198,7 +229,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) * require. In all the cases we care about, the FOREIGN_FRAME bit is * masked (e.g., pfn_to_mfn()) so behaviour there is correct. 
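[Illustrative aside, not part of the patch: the pfn_to_gfn()/gfn_to_pfn() helpers added above (and the virt_to_gfn() wrappers a few lines further down) encode the rule that an auto-translated domain hands the hypervisor plain pfns while classic PV hands it machine frames. A hedged usage sketch; 'struct example_op' and example_fill_op() are invented names.]

/* Sketch only: filling a hypothetical hypercall argument with the new naming. */
struct example_op {
	unsigned long gfn;
};

static void example_fill_op(struct example_op *op, struct page *page)
{
	/* pfn on auto-translated (HVM/PVH) guests, mfn on classic PV */
	op->gfn = pfn_to_gfn(page_to_pfn(page));
}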
*/ -static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +static inline unsigned long bfn_to_local_pfn(unsigned long mfn) { unsigned long pfn; @@ -217,6 +248,10 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) +/* VIRT <-> GUEST conversion */ +#define virt_to_gfn(v) (pfn_to_gfn(virt_to_pfn(v))) +#define gfn_to_virt(g) (__va(gfn_to_pfn(g) << PAGE_SHIFT)) + static inline unsigned long pte_mfn(pte_t pte) { return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; @@ -263,8 +298,8 @@ void make_lowmem_page_readwrite(void *vaddr); #define xen_unmap(cookie) iounmap((cookie)) static inline bool xen_arch_need_swiotlb(struct device *dev, - unsigned long pfn, - unsigned long mfn) + phys_addr_t phys, + dma_addr_t dev_addr) { return false; } diff --git a/kernel/arch/x86/include/asm/xor.h b/kernel/arch/x86/include/asm/xor.h index d8829751b..1f5c5161e 100644 --- a/kernel/arch/x86/include/asm/xor.h +++ b/kernel/arch/x86/include/asm/xor.h @@ -36,7 +36,7 @@ * no advantages to be gotten from x86-64 here anyways. */ -#include +#include #ifdef CONFIG_X86_32 /* reduce register pressure */ diff --git a/kernel/arch/x86/include/asm/xor_32.h b/kernel/arch/x86/include/asm/xor_32.h index ce05722e3..5a08bc8bf 100644 --- a/kernel/arch/x86/include/asm/xor_32.h +++ b/kernel/arch/x86/include/asm/xor_32.h @@ -26,7 +26,7 @@ #define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" #define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" -#include +#include static void xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) diff --git a/kernel/arch/x86/include/asm/xor_avx.h b/kernel/arch/x86/include/asm/xor_avx.h index 492b29802..7c0a517ec 100644 --- a/kernel/arch/x86/include/asm/xor_avx.h +++ b/kernel/arch/x86/include/asm/xor_avx.h @@ -18,7 +18,7 @@ #ifdef CONFIG_AS_AVX #include -#include +#include #define BLOCK4(i) \ BLOCK(32 * i, 0) \ diff --git a/kernel/arch/x86/include/asm/xsave.h b/kernel/arch/x86/include/asm/xsave.h deleted file mode 100644 index c9a6d68b8..000000000 --- a/kernel/arch/x86/include/asm/xsave.h +++ /dev/null @@ -1,257 +0,0 @@ -#ifndef __ASM_X86_XSAVE_H -#define __ASM_X86_XSAVE_H - -#include -#include - -#define XSTATE_CPUID 0x0000000d - -#define XSTATE_FP 0x1 -#define XSTATE_SSE 0x2 -#define XSTATE_YMM 0x4 -#define XSTATE_BNDREGS 0x8 -#define XSTATE_BNDCSR 0x10 -#define XSTATE_OPMASK 0x20 -#define XSTATE_ZMM_Hi256 0x40 -#define XSTATE_Hi16_ZMM 0x80 - -#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) -#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) -/* Bit 63 of XCR0 is reserved for future expansion */ -#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63))) - -#define FXSAVE_SIZE 512 - -#define XSAVE_HDR_SIZE 64 -#define XSAVE_HDR_OFFSET FXSAVE_SIZE - -#define XSAVE_YMM_SIZE 256 -#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) - -/* Supported features which support lazy state saving */ -#define XSTATE_LAZY (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ - | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) - -/* Supported features which require eager state saving */ -#define XSTATE_EAGER (XSTATE_BNDREGS | XSTATE_BNDCSR) - -/* All currently supported features */ -#define XCNTXT_MASK (XSTATE_LAZY | XSTATE_EAGER) - -#ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " -#else -#define REX_PREFIX -#endif - -extern unsigned int xstate_size; -extern u64 pcntxt_mask; -extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern struct 
xsave_struct *init_xstate_buf; - -extern void xsave_init(void); -extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); -extern int init_fpu(struct task_struct *child); - -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" - -#define xstate_fault ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err) - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XSAVES"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XSAVE"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - asm volatile("1:"XRSTORS"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - else - asm volatile("1:"XRSTOR"\n\t" - "2:\n\t" - xstate_fault - : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - return err; -} - -/* - * Save processor xstate to xsave area. - */ -static inline int xsave_state(struct xsave_struct *fx, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err = 0; - - /* - * If xsaves is enabled, xsaves replaces xsaveopt because - * it supports compact format and supervisor states in addition to - * modified optimization in xsaveopt. - * - * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave - * because xsaveopt supports modified optimization which is not - * supported by xsave. - * - * If none of xsaves and xsaveopt is enabled, use xsave. - */ - alternative_input_2( - "1:"XSAVE, - XSAVEOPT, - X86_FEATURE_XSAVEOPT, - XSAVES, - X86_FEATURE_XSAVES, - [fx] "D" (fx), "a" (lmask), "d" (hmask) : - "memory"); - asm volatile("2:\n\t" - xstate_fault - : "0" (0) - : "memory"); - - return err; -} - -/* - * Restore processor xstate from xsave area. - */ -static inline int xrstor_state(struct xsave_struct *fx, u64 mask) -{ - int err = 0; - u32 lmask = mask; - u32 hmask = mask >> 32; - - /* - * Use xrstors to restore context if it is enabled. xrstors supports - * compacted format of xsave area which is not supported by xrstor. - */ - alternative_input( - "1: " XRSTOR, - XRSTORS, - X86_FEATURE_XSAVES, - "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) - : "memory"); - - asm volatile("2:\n" - xstate_fault - : "0" (0) - : "memory"); - - return err; -} - -/* - * Save xstate context for old process during context switch. 
- */ -static inline void fpu_xsave(struct fpu *fpu) -{ - xsave_state(&fpu->state->xsave, -1); -} - -/* - * Restore xstate context for new process during context switch. - */ -static inline int fpu_xrstor_checking(struct xsave_struct *fx) -{ - return xrstor_state(fx, -1); -} - -/* - * Save xstate to user space xsave area. - * - * We don't use modified optimization because xrstor/xrstors might track - * a different application. - * - * We don't use compacted format xsave area for - * backward compatibility for old applications which don't understand - * compacted format of xsave area. - */ -static inline int xsave_user(struct xsave_struct __user *buf) -{ - int err; - - /* - * Clear the xsave header first, so that reserved fields are - * initialized to zero. - */ - err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr)); - if (unlikely(err)) - return -EFAULT; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XSAVE"\n" - "2: " ASM_CLAC "\n" - xstate_fault - : "D" (buf), "a" (-1), "d" (-1), "0" (0) - : "memory"); - return err; -} - -/* - * Restore xstate from user space xsave area. - */ -static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) -{ - int err = 0; - struct xsave_struct *xstate = ((__force struct xsave_struct *)buf); - u32 lmask = mask; - u32 hmask = mask >> 32; - - __asm__ __volatile__(ASM_STAC "\n" - "1:"XRSTOR"\n" - "2: " ASM_CLAC "\n" - xstate_fault - : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) - : "memory"); /* memory required? */ - return err; -} - -void *get_xsave_addr(struct xsave_struct *xsave, int xstate); -void setup_xstate_comp(void); - -#endif -- cgit 1.2.3-korg
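The Intel PMU context added to asm/xen/interface.h above ends in a variable-length MSR bank: it is declared as a C99 flexible array member (regs[]) when the compiler advertises C99, and as a GNU zero-length array (regs[0]) otherwise, so the fixed part of the structure has the same size either way. The stand-alone sketch below shows only that allocation idiom; struct pmu_ctxt and its fields are simplified, hypothetical stand-ins, not the real xen_pmu_intel_ctxt layout, whose MSR banks are described by offsets, as the padding comment notes.

/* Sketch of the trailing flexible-array idiom; struct pmu_ctxt is a stand-in. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct pmu_ctxt {
	uint32_t fixed_counters;	/* number of fixed-function counters */
	uint32_t arch_counters;		/* number of architectural counters */
	uint64_t regs[];		/* C99 flexible array member; pre-C99
					 * GNU C spells this regs[0] */
};

int main(void)
{
	unsigned int nregs = 4;

	/* The variable-length bank is carved out right after the fixed part. */
	struct pmu_ctxt *ctxt = malloc(sizeof(*ctxt) + nregs * sizeof(uint64_t));

	if (!ctxt)
		return 1;

	ctxt->fixed_counters = 3;
	ctxt->arch_counters = nregs;
	for (unsigned int i = 0; i < nregs; i++)
		ctxt->regs[i] = 0;	/* MSR slots live in the trailing bank */

	printf("fixed part: %zu bytes, plus %u MSR slots\n",
	       sizeof(*ctxt), nregs);
	free(ctxt);
	return 0;
}

Both spellings declare a zero-size trailing member, which is why sizeof(*ctxt) covers only the fixed fields and the caller sizes the allocation explicitly.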
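The pfn_to_gfn()/gfn_to_pfn() helpers and the virt_to_gfn()/gfn_to_virt() macros added to asm/xen/page.h above pick between two behaviours: on auto-translated guests (XENFEAT_auto_translated_physmap) the guest frame number is simply the pseudo-physical frame number, while on classic PV guests they fall back to the existing pfn_to_mfn()/mfn_to_pfn() p2m lookups. Below is a minimal user-space sketch of that selection logic; the xen_feature() stub, its feature value, and the toy p2m[] table are hypothetical stand-ins for the real hypervisor feature query and machine-frame lookup.

/* Sketch of the pfn_to_gfn() selection logic; all stubs are illustrative only. */
#include <stdbool.h>
#include <stdio.h>

#define XENFEAT_auto_translated_physmap 2	/* stand-in; the real value comes
						 * from the Xen feature header */

/* Hypothetical stub: a real guest asks the hypervisor for this feature. */
static bool xen_feature(int feat)
{
	(void)feat;
	return false;		/* pretend to be a classic PV guest */
}

/* Toy p2m table standing in for the real pfn_to_mfn() lookup. */
static const unsigned long p2m[] = { 7, 42, 13, 99 };

static unsigned long pfn_to_mfn(unsigned long pfn)
{
	return p2m[pfn];
}

/* Mirrors the helper added above: gfn == pfn when auto-translated,
 * otherwise the guest frame is the machine frame. */
static unsigned long pfn_to_gfn(unsigned long pfn)
{
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return pfn;
	return pfn_to_mfn(pfn);
}

int main(void)
{
	for (unsigned long pfn = 0; pfn < 4; pfn++)
		printf("pfn %lu -> gfn %lu\n", pfn, pfn_to_gfn(pfn));
	return 0;
}

With the stub returning true the helper degenerates to the identity map, which is the auto-translated case; the patch also defines pfn_to_bfn()/bfn_to_pfn() on x86 as plain aliases of these gfn helpers.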