summaryrefslogtreecommitdiffstats
path: root/kernel/arch/x86/kvm/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/arch/x86/kvm/vmx.c')
-rw-r--r--kernel/arch/x86/kvm/vmx.c214
1 files changed, 209 insertions, 5 deletions
diff --git a/kernel/arch/x86/kvm/vmx.c b/kernel/arch/x86/kvm/vmx.c
index 0958fa2b7..a722f724c 100644
--- a/kernel/arch/x86/kvm/vmx.c
+++ b/kernel/arch/x86/kvm/vmx.c
@@ -109,6 +109,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
@@ -596,6 +603,9 @@ struct vcpu_vmx {
#define PML_ENTITY_NUM 512
struct page *pml_pg;
+ /* apic deadline value in host tsc */
+ u64 hv_deadline_tsc;
+
u64 current_tsc_ratio;
};
@@ -1043,6 +1053,58 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
}
+/*
+ * Comment's format: document - errata name - stepping - processor name.
+ * Refer from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86 - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+ u32 eax = cpuid_eax(0x00000001), i;
+
+ /* Clear the reserved bits */
+ eax &= ~(0x3U << 14 | 0xfU << 28);
+ for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++)
+ if (eax == vmx_preemption_cpu_tfms[i])
+ return true;
+
+ return false;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+ return vmcs_config.pin_based_exec_ctrl &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
static inline bool cpu_has_vmx_posted_intr(void)
{
return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@ -3209,11 +3271,15 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
return -EIO;
min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
- opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+ opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control) < 0)
return -EIO;
+ if (cpu_has_broken_vmx_preemption_timer())
+ _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
@@ -4683,6 +4749,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
if (!vmx_cpu_uses_apicv(&vmx->vcpu))
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+ /* Enable the preemption timer dynamically */
+ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
return pin_based_exec_ctrl;
}
@@ -4781,6 +4849,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
/* Control */
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+ vmx->hv_deadline_tsc = -1;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
@@ -6292,6 +6361,17 @@ static __init int hardware_setup(void)
kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
}
+ if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+ u64 vmx_msr;
+
+ rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+ cpu_preemption_timer_multi =
+ vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ } else {
+ kvm_x86_ops->set_hv_timer = NULL;
+ kvm_x86_ops->cancel_hv_timer = NULL;
+ }
+
kvm_set_posted_intr_wakeup_handler(wakeup_handler);
return alloc_kvm_area();
@@ -7460,6 +7540,12 @@ static int handle_pcommit(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ kvm_lapic_expired_hv_timer(vcpu);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7511,6 +7597,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_PCOMMIT] = handle_pcommit,
+ [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7814,6 +7901,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
case EXIT_REASON_PCOMMIT:
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return false;
default:
return true;
}
@@ -8510,6 +8599,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host);
}
+void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl;
+ u32 delta_tsc;
+
+ if (vmx->hv_deadline_tsc == -1)
+ return;
+
+ tscl = rdtsc();
+ if (vmx->hv_deadline_tsc > tscl)
+ /* sure to be 32 bit only because checked on set_hv_timer */
+ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+ cpu_preemption_timer_multi);
+ else
+ delta_tsc = 0;
+
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+}
+
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8556,6 +8665,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
+ vmx_arm_hv_timer(vcpu);
+
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
@@ -9520,9 +9631,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write64(VMCS_LINK_POINTER, -1ull);
exec_control = vmcs12->pin_based_vm_exec_control;
- exec_control |= vmcs_config.pin_based_exec_ctrl;
+
+ /* Preemption timer setting is only taken from vmcs01. */
exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ exec_control |= vmcs_config.pin_based_exec_ctrl;
+ if (vmx->hv_deadline_tsc == -1)
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ /* Posted interrupts setting is only taken from vmcs12. */
if (nested_cpu_has_posted_intr(vmcs12)) {
/*
* Note that we use L0's vector here and in
@@ -10451,8 +10567,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
load_vmcs12_host_state(vcpu, vmcs12);
- /* Update TSC_OFFSET if TSC was changed while L2 ran */
+ /* Update any VMCS fields that might have changed while L2 ran */
vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+ if (vmx->hv_deadline_tsc == -1)
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ else
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
/* This is needed for same reason as it was needed in prepare_vmcs02 */
vmx->host_rsp = 0;
@@ -10532,6 +10654,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
return X86EMUL_CONTINUE;
}
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+ u64 divisor, u64 *result)
+{
+ u64 low = a << shift, high = a >> (64 - shift);
+
+ /* To avoid the overflow on divq */
+ if (high >= divisor)
+ return 1;
+
+ /* Low hold the result, high hold rem which is discarded */
+ asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+ "rm" (divisor), "0" (low), "1" (high));
+ *result = low;
+
+ return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl = rdtsc();
+ u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+ u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+
+ /* Convert to host delta tsc if tsc scaling is enabled */
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+ u64_shl_div_u64(delta_tsc,
+ kvm_tsc_scaling_ratio_frac_bits,
+ vcpu->arch.tsc_scaling_ratio,
+ &delta_tsc))
+ return -ERANGE;
+
+ /*
+ * If the delta tsc can't fit in the 32 bit after the multi shift,
+ * we can't use the preemption timer.
+ * It's possible that it fits on later vmentries, but checking
+ * on every vmentry is costly so we just use an hrtimer.
+ */
+ if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+ return -ERANGE;
+
+ vmx->hv_deadline_tsc = tscl + delta_tsc;
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ return 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ vmx->hv_deadline_tsc = -1;
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+}
+#endif
+
static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
if (ple_gap)
@@ -10576,7 +10756,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
* this case, return 1, otherwise, return 0.
*
*/
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
+static int pi_pre_block(struct kvm_vcpu *vcpu)
{
unsigned long flags;
unsigned int dest;
@@ -10642,7 +10822,18 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
return 0;
}
-static void vmx_post_block(struct kvm_vcpu *vcpu)
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+ if (pi_pre_block(vcpu))
+ return 1;
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_switch_to_sw_timer(vcpu);
+
+ return 0;
+}
+
+static void pi_post_block(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
@@ -10683,6 +10874,14 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
}
}
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+ if (kvm_x86_ops->set_hv_timer)
+ kvm_lapic_switch_to_hv_timer(vcpu);
+
+ pi_post_block(vcpu);
+}
+
/*
* vmx_update_pi_irte - set IRTE for Posted-Interrupts
*
@@ -10878,6 +11077,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
.pmu_ops = &intel_pmu_ops,
.update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
};
static int __init vmx_init(void)