Diffstat (limited to 'kernel/arch/x86/kvm')
 -rw-r--r--  kernel/arch/x86/kvm/cpuid.c    |   1
 -rw-r--r--  kernel/arch/x86/kvm/emulate.c  | 287
 -rw-r--r--  kernel/arch/x86/kvm/i8254.c    |  12
 -rw-r--r--  kernel/arch/x86/kvm/ioapic.c   |   2
 -rw-r--r--  kernel/arch/x86/kvm/irq_comm.c |  13
 -rw-r--r--  kernel/arch/x86/kvm/lapic.c    |   6
 -rw-r--r--  kernel/arch/x86/kvm/lapic.h    |   1
 -rw-r--r--  kernel/arch/x86/kvm/mtrr.c     |   3
 -rw-r--r--  kernel/arch/x86/kvm/vmx.c      |  85
 -rw-r--r--  kernel/arch/x86/kvm/x86.c      |  66
 10 files changed, 367 insertions(+), 109 deletions(-)
diff --git a/kernel/arch/x86/kvm/cpuid.c b/kernel/arch/x86/kvm/cpuid.c
index 6525e926f..2e1fd586b 100644
--- a/kernel/arch/x86/kvm/cpuid.c
+++ b/kernel/arch/x86/kvm/cpuid.c
@@ -509,6 +509,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
do_cpuid_1_ent(&entry[i], function, idx);
if (idx == 1) {
entry[i].eax &= kvm_supported_word10_x86_features;
+ cpuid_mask(&entry[i].eax, 10);
entry[i].ebx = 0;
if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
entry[i].ebx =
diff --git a/kernel/arch/x86/kvm/emulate.c b/kernel/arch/x86/kvm/emulate.c
index b9b09fec1..1dcea2259 100644
--- a/kernel/arch/x86/kvm/emulate.c
+++ b/kernel/arch/x86/kvm/emulate.c
@@ -172,6 +172,7 @@
#define NearBranch ((u64)1 << 52) /* Near branches */
#define No16 ((u64)1 << 53) /* No 16 bit operand */
#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
+#define Aligned16 ((u64)1 << 55) /* Aligned to 16 byte boundary (e.g. FXSAVE) */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -434,6 +435,26 @@ FOP_END;
FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET
FOP_END;
+/*
+ * XXX: inoutclob user must know where the argument is being expanded.
+ * Relying on CC_HAVE_ASM_GOTO would allow us to remove _fault.
+ */
+#define asm_safe(insn, inoutclob...) \
+({ \
+ int _fault = 0; \
+ \
+ asm volatile("1:" insn "\n" \
+ "2:\n" \
+ ".pushsection .fixup, \"ax\"\n" \
+ "3: movl $1, %[_fault]\n" \
+ " jmp 2b\n" \
+ ".popsection\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : [_fault] "+qm"(_fault) inoutclob ); \
+ \
+ _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \
+})
+
static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
enum x86_intercept intercept,
enum x86_intercept_stage stage)
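
As a usage sketch: asm_safe() turns a faulting instruction into an error code instead of an oops, and callers only need to test the result. The hypothetical helper below (the name is ours) mirrors the flush_pending_x87_faults() conversion at the end of this file's diff:

    /* Sketch, not emulator code: run FWAIT and surface a pending
     * x87 fault as #MF via the usual exception path. */
    static int fwait_safe(struct x86_emulate_ctxt *ctxt)
    {
            int rc = asm_safe("fwait");  /* X86EMUL_UNHANDLEABLE on fault */

            if (unlikely(rc != X86EMUL_CONTINUE))
                    return emulate_exception(ctxt, MF_VECTOR, 0, false);
            return X86EMUL_CONTINUE;
    }
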
@@ -620,21 +641,24 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
* depending on whether they're AVX encoded or not.
*
* Also included is CMPXCHG16B which is not a vector instruction, yet it is
- * subject to the same check.
+ * subject to the same check. FXSAVE and FXRSTOR are checked here too as their
+ * 512 bytes of data must be aligned to a 16 byte boundary.
*/
-static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
+static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size)
{
if (likely(size < 16))
- return false;
+ return 1;
if (ctxt->d & Aligned)
- return true;
+ return size;
else if (ctxt->d & Unaligned)
- return false;
+ return 1;
else if (ctxt->d & Avx)
- return false;
+ return 1;
+ else if (ctxt->d & Aligned16)
+ return 16;
else
- return true;
+ return size;
}
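
Returning a width instead of a bool lets the caller below fold the check into a single mask test: an alignment of 1 yields mask 0 and can never fault, while Aligned16 faults on any of the low four address bits. A standalone sketch of the arithmetic (illustrative values, not emulator code):

    #include <stdbool.h>

    /* True iff linear address 'la' violates a power-of-two alignment;
     * alignment == 1 never faults. */
    static bool misaligned(unsigned long la, unsigned int alignment)
    {
            return (la & (alignment - 1)) != 0;
    }

    /* misaligned(0x1008, 16) == true  -> #GP(0), e.g. for FXSAVE
     * misaligned(0x1010, 16) == false -> access proceeds */
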
static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
@@ -692,7 +716,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
}
break;
}
- if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
+ if (la & (insn_alignment(ctxt, size) - 1))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
bad:
@@ -779,6 +803,20 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
}
+static int segmented_write_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned int size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception);
+}
+
/*
* Prefetch the remaining bytes of the instruction without crossing page
* boundary if they are not in fetch_cache yet.
@@ -1532,7 +1570,6 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
&ctxt->exception);
}
-/* Does not support long mode */
static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, int seg, u8 cpl,
enum x86_transfer_type transfer,
@@ -1569,20 +1606,34 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
rpl = selector & 3;
- /* NULL selector is not valid for TR, CS and SS (except for long mode) */
- if ((seg == VCPU_SREG_CS
- || (seg == VCPU_SREG_SS
- && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl))
- || seg == VCPU_SREG_TR)
- && null_selector)
- goto exception;
-
/* TR should be in GDT only */
if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
goto exception;
- if (null_selector) /* for NULL selector skip all following checks */
+ /* NULL selector is not valid for TR, CS and (except for long mode) SS */
+ if (null_selector) {
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR)
+ goto exception;
+
+ if (seg == VCPU_SREG_SS) {
+ if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)
+ goto exception;
+
+ /*
+ * ctxt->ops->set_segment expects the CPL to be in
+ * SS.DPL, so fake an expand-up 32-bit data segment.
+ */
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ seg_desc.dpl = cpl;
+ seg_desc.d = 1;
+ seg_desc.g = 1;
+ }
+
+ /* Skip all following checks */
goto load;
+ }
ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
if (ret != X86EMUL_CONTINUE)
@@ -1698,6 +1749,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, int seg)
{
u8 cpl = ctxt->ops->cpl(ctxt);
+
+ /*
+ * None of MOV, POP and LSS can load a NULL selector in CPL=3, but
+ * they can load it at CPL<3 (Intel's manual says only LSS can,
+ * but it's wrong).
+ *
+ * However, the Intel manual says that putting IST=1/DPL=3 in
+ * an interrupt gate will result in SS=3 (the AMD manual instead
+ * says it doesn't), so allow SS=3 in __load_segment_descriptor
+ * and only forbid it here.
+ */
+ if (seg == VCPU_SREG_SS && selector == 3 &&
+ ctxt->mode == X86EMUL_MODE_PROT64)
+ return emulate_exception(ctxt, GP_VECTOR, 0, true);
+
return __load_segment_descriptor(ctxt, selector, seg, cpl,
X86_TRANSFER_NONE, NULL);
}
@@ -2093,16 +2159,10 @@ static int em_iret(struct x86_emulate_ctxt *ctxt)
static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned short sel, old_sel;
- struct desc_struct old_desc, new_desc;
- const struct x86_emulate_ops *ops = ctxt->ops;
+ unsigned short sel;
+ struct desc_struct new_desc;
u8 cpl = ctxt->ops->cpl(ctxt);
- /* Assignment of RIP may only fail in 64-bit mode */
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- ops->get_segment(ctxt, &old_sel, &old_desc, NULL,
- VCPU_SREG_CS);
-
memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
@@ -2112,12 +2172,10 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
return rc;
rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
- if (rc != X86EMUL_CONTINUE) {
- WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64);
- /* assigning eip failed; restore the old cs */
- ops->set_segment(ctxt, old_sel, &old_desc, 0, VCPU_SREG_CS);
- return rc;
- }
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
return rc;
}
@@ -2177,14 +2235,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
unsigned long eip, cs;
- u16 old_cs;
int cpl = ctxt->ops->cpl(ctxt);
- struct desc_struct old_desc, new_desc;
- const struct x86_emulate_ops *ops = ctxt->ops;
-
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- ops->get_segment(ctxt, &old_cs, &old_desc, NULL,
- VCPU_SREG_CS);
+ struct desc_struct new_desc;
rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -2201,10 +2253,10 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
rc = assign_eip_far(ctxt, eip, &new_desc);
- if (rc != X86EMUL_CONTINUE) {
- WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64);
- ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
- }
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
return rc;
}
@@ -3660,8 +3712,8 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
}
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
- return segmented_write(ctxt, ctxt->dst.addr.mem,
- &desc_ptr, 2 + ctxt->op_bytes);
+ return segmented_write_std(ctxt, ctxt->dst.addr.mem,
+ &desc_ptr, 2 + ctxt->op_bytes);
}
static int em_sgdt(struct x86_emulate_ctxt *ctxt)
@@ -3844,6 +3896,131 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int check_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax = 1, ebx, ecx = 0, edx;
+
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ if (!(edx & FFL(FXSR)))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ /*
+ * Don't emulate a case that should never be hit, instead of working
+ * around a lack of fxsave64/fxrstor64 on old compilers.
+ */
+ if (ctxt->mode >= X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ return X86EMUL_CONTINUE;
+}
+
+/*
+ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
+ * 1) 16 bit mode
+ * 2) 32 bit mode
+ * - like (1), but FIP and FDP are only 16 bit. At least Intel CPUs
+ * preserve whole 32 bit values, though, so (1) and (2) are the same wrt.
+ * save and restore
+ * 3) 64-bit mode without REX.W prefix
+ * - like (2), but XMM 8-15 are being saved and restored
+ * 4) 64-bit mode with REX.W prefix
+ * - like (3), but FIP and FDP are 64 bit
+ *
+ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the
+ * desired result. (4) is not emulated.
+ *
+ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS
+ * and FPU DS) should match.
+ */
+static int em_fxsave(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ size_t size;
+ int rc;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->ops->get_fpu(ctxt);
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+
+ ctxt->ops->put_fpu(ctxt);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)
+ size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]);
+ else
+ size = offsetof(struct fxregs_state, xmm_space[0]);
+
+ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+}
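
A note on the offsetof() arithmetic above: xmm_space is an array of u32, so index 8 * 16/4 (= 32) is the first u32 past XMM0-XMM7. A freestanding sketch with a simplified stand-in for the 512-byte FXSAVE layout (assumed field sizes, not the kernel's struct definition):

    #include <stddef.h>
    #include <stdint.h>

    struct fxregs_sketch {
            uint8_t  env[32];        /* FCW, FSW, ..., MXCSR, MXCSR_MASK */
            uint32_t st_space[32];   /* 8 x87/MMX registers, 16 bytes each */
            uint32_t xmm_space[64];  /* 16 XMM registers, 16 bytes each */
            uint8_t  rsvd[96];       /* pads the area to 512 bytes */
    };

    /* With CR4.OSFXSR set: env + ST space + XMM0-7 = 288 bytes. */
    _Static_assert(offsetof(struct fxregs_sketch, xmm_space[8 * 16 / 4]) == 288, "");
    /* Without OSFXSR: stop right before the XMM save area = 160 bytes. */
    _Static_assert(offsetof(struct fxregs_sketch, xmm_space[0]) == 160, "");
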
+
+static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
+ struct fxregs_state *new)
+{
+ int rc = X86EMUL_CONTINUE;
+ struct fxregs_state old;
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old));
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ /*
+ * 64 bit host will restore XMM 8-15, which is not correct on non-64
+ * bit guests. Load the current values in order to preserve 64 bit
+ * XMMs after fxrstor.
+ */
+#ifdef CONFIG_X86_64
+ /* XXX: accessing XMM 8-15 very awkwardly */
+ memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16);
+#endif
+
+ /*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but
+ * does save and restore MXCSR.
+ */
+ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))
+ memcpy(new->xmm_space, old.xmm_space, 8 * 16);
+
+ return rc;
+}
+
+static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (fx_state.mxcsr >> 16)
+ return emulate_gp(ctxt, 0);
+
+ ctxt->ops->get_fpu(ctxt);
+
+ if (ctxt->mode < X86EMUL_MODE_PROT64)
+ rc = fxrstor_fixup(ctxt, &fx_state);
+
+ if (rc == X86EMUL_CONTINUE)
+ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+
+ ctxt->ops->put_fpu(ctxt);
+
+ return rc;
+}
+
static bool valid_cr(int nr)
{
switch (nr) {
@@ -4196,7 +4373,9 @@ static const struct gprefix pfx_0f_ae_7 = {
};
static const struct group_dual group15 = { {
- N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7),
+ I(ModRM | Aligned16, em_fxsave),
+ I(ModRM | Aligned16, em_fxrstor),
+ N, N, N, N, N, GP(0, &pfx_0f_ae_7),
}, {
N, N, N, N, N, N, N, N,
} };
@@ -5033,7 +5212,7 @@ done_prefixes:
/* Decode and fetch the destination operand: register or memory. */
rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
- if (ctxt->rip_relative)
+ if (ctxt->rip_relative && likely(ctxt->memopp))
ctxt->memopp->addr.mem.ea = address_mask(ctxt,
ctxt->memopp->addr.mem.ea + ctxt->_eip);
@@ -5068,21 +5247,13 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
{
- bool fault = false;
+ int rc;
ctxt->ops->get_fpu(ctxt);
- asm volatile("1: fwait \n\t"
- "2: \n\t"
- ".pushsection .fixup,\"ax\" \n\t"
- "3: \n\t"
- "movb $1, %[fault] \n\t"
- "jmp 2b \n\t"
- ".popsection \n\t"
- _ASM_EXTABLE(1b, 3b)
- : [fault]"+qm"(fault));
+ rc = asm_safe("fwait");
ctxt->ops->put_fpu(ctxt);
- if (unlikely(fault))
+ if (unlikely(rc != X86EMUL_CONTINUE))
return emulate_exception(ctxt, MF_VECTOR, 0, false);
return X86EMUL_CONTINUE;
diff --git a/kernel/arch/x86/kvm/i8254.c b/kernel/arch/x86/kvm/i8254.c
index b0ea42b78..ab5318727 100644
--- a/kernel/arch/x86/kvm/i8254.c
+++ b/kernel/arch/x86/kvm/i8254.c
@@ -245,7 +245,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
* PIC is being reset. Handle it gracefully here
*/
atomic_inc(&ps->pending);
- else if (value > 0)
+ else if (value > 0 && ps->reinject)
/* in this case, we had multiple outstanding pit interrupts
* that we needed to inject. Reinject
*/
@@ -288,7 +288,9 @@ static void pit_do_work(struct kthread_work *work)
* last one has been acked.
*/
spin_lock(&ps->inject_lock);
- if (ps->irq_ack) {
+ if (!ps->reinject)
+ inject = 1;
+ else if (ps->irq_ack) {
ps->irq_ack = 0;
inject = 1;
}
@@ -317,10 +319,10 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
struct kvm_pit *pt = ps->kvm->arch.vpit;
- if (ps->reinject || !atomic_read(&ps->pending)) {
+ if (ps->reinject)
atomic_inc(&ps->pending);
- queue_kthread_work(&pt->worker, &pt->expired);
- }
+
+ queue_kthread_work(&pt->worker, &pt->expired);
if (ps->is_periodic) {
hrtimer_add_expires_ns(&ps->timer, ps->period);
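
Taken together, the three i8254 hunks make ps->reinject a single switch between two delivery policies: with reinject on, a timer tick becomes an injection only after the previous interrupt was acked (ps->pending counts the backlog); with reinject off, every expiry injects directly and the ack bookkeeping is bypassed. A condensed sketch of the resulting gate (hypothetical helper; the real code evaluates this under ps->inject_lock):

    static bool pit_should_inject(struct kvm_kpit_state *ps)
    {
            if (!ps->reinject)
                    return true;        /* tick-driven: inject on every expiry */
            if (ps->irq_ack) {
                    ps->irq_ack = 0;    /* ack-driven: one in flight at a time */
                    return true;
            }
            return false;
    }
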
diff --git a/kernel/arch/x86/kvm/ioapic.c b/kernel/arch/x86/kvm/ioapic.c
index 88d0a92d3..3aab53f8c 100644
--- a/kernel/arch/x86/kvm/ioapic.c
+++ b/kernel/arch/x86/kvm/ioapic.c
@@ -580,7 +580,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
ioapic->irr = 0;
ioapic->irr_delivered = 0;
ioapic->id = 0;
- memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
+ memset(ioapic->irq_eoi, 0x00, sizeof(ioapic->irq_eoi));
rtc_irq_eoi_tracking_reset(ioapic);
}
diff --git a/kernel/arch/x86/kvm/irq_comm.c b/kernel/arch/x86/kvm/irq_comm.c
index 84b96d319..d09544e82 100644
--- a/kernel/arch/x86/kvm/irq_comm.c
+++ b/kernel/arch/x86/kvm/irq_comm.c
@@ -38,6 +38,15 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
bool line_status)
{
struct kvm_pic *pic = pic_irqchip(kvm);
+
+ /*
+ * XXX: rejecting pic routes when pic isn't in use would be better,
+ * but the default routing table is installed while kvm->arch.vpic is
+ * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE.
+ */
+ if (!pic)
+ return -1;
+
return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
}
@@ -46,6 +55,10 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
bool line_status)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ if (!ioapic)
+ return -1;
+
return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
line_status);
}
diff --git a/kernel/arch/x86/kvm/lapic.c b/kernel/arch/x86/kvm/lapic.c
index 3d1b17068..6cd4f4b32 100644
--- a/kernel/arch/x86/kvm/lapic.c
+++ b/kernel/arch/x86/kvm/lapic.c
@@ -2266,3 +2266,9 @@ void kvm_lapic_init(void)
jump_label_rate_limit(&apic_hw_disabled, HZ);
jump_label_rate_limit(&apic_sw_disabled, HZ);
}
+
+void kvm_lapic_exit(void)
+{
+ static_key_deferred_flush(&apic_hw_disabled);
+ static_key_deferred_flush(&apic_sw_disabled);
+}
diff --git a/kernel/arch/x86/kvm/lapic.h b/kernel/arch/x86/kvm/lapic.h
index 640ad275b..04141a598 100644
--- a/kernel/arch/x86/kvm/lapic.h
+++ b/kernel/arch/x86/kvm/lapic.h
@@ -96,6 +96,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
void kvm_lapic_init(void);
+void kvm_lapic_exit(void);
static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
{
diff --git a/kernel/arch/x86/kvm/mtrr.c b/kernel/arch/x86/kvm/mtrr.c
index 3f8c73211..0149ac59c 100644
--- a/kernel/arch/x86/kvm/mtrr.c
+++ b/kernel/arch/x86/kvm/mtrr.c
@@ -44,8 +44,6 @@ static bool msr_mtrr_valid(unsigned msr)
case MSR_MTRRdefType:
case MSR_IA32_CR_PAT:
return true;
- case 0x2f8:
- return true;
}
return false;
}
@@ -541,6 +539,7 @@ static void mtrr_lookup_var_start(struct mtrr_iter *iter)
iter->fixed = false;
iter->start_max = iter->start;
+ iter->range = NULL;
iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
__mtrr_lookup_var_next(iter);
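
The added iter->range = NULL is what makes the lookup start from the front of the list: list_prepare_entry(pos, head, member) evaluates to pos when pos is non-NULL and to the list head otherwise, and list_for_each_entry_continue() resumes from whatever pos holds, so a stale pointer would silently resume mid-list. The idiom, sketched with an assumed range type:

    struct range { struct list_head node; /* ... */ };

    static void walk_from_start(struct list_head *head)
    {
            struct range *pos = NULL;   /* a stale value would resume mid-list */

            pos = list_prepare_entry(pos, head, node);
            list_for_each_entry_continue(pos, head, node) {
                    /* visits every entry, front to back */
            }
    }
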
diff --git a/kernel/arch/x86/kvm/vmx.c b/kernel/arch/x86/kvm/vmx.c
index a722f724c..8d744a82f 100644
--- a/kernel/arch/x86/kvm/vmx.c
+++ b/kernel/arch/x86/kvm/vmx.c
@@ -415,6 +415,7 @@ struct nested_vmx {
struct list_head vmcs02_pool;
int vmcs02_num;
u64 vmcs01_tsc_offset;
+ bool change_vmcs01_virtual_x2apic_mode;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
/*
@@ -1308,10 +1309,10 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
}
-static inline bool is_exception(u32 intr_info)
+static inline bool is_nmi(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+ == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
}
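
is_nmi() folds what used to be two separate tests (valid bit set, type equals NMI) into one masked compare. A self-contained sketch using the architectural VM-exit interruption-info encoding (valid = bit 31, type = bits 10:8, NMI type = 2):

    #include <stdbool.h>
    #include <stdint.h>

    #define VALID_MASK  (UINT32_C(1) << 31)   /* INTR_INFO_VALID_MASK */
    #define TYPE_MASK   (UINT32_C(7) << 8)    /* INTR_INFO_INTR_TYPE_MASK */
    #define TYPE_NMI    (UINT32_C(2) << 8)    /* INTR_TYPE_NMI_INTR */

    static bool is_nmi_sketch(uint32_t intr_info)
    {
            return (intr_info & (TYPE_MASK | VALID_MASK))
                    == (TYPE_NMI | VALID_MASK);
    }
    /* is_nmi_sketch(VALID_MASK | TYPE_NMI | 2) == true (vector-2 NMI) */
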
static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
@@ -2699,8 +2700,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
} else
vmx->nested.nested_vmx_ept_caps = 0;
+ /*
+ * Old versions of KVM use the single-context version without
+ * checking for support, so declare that it is supported even
+ * though it is treated as global context. The alternative is
+ * not failing the single-context invvpid, and it is worse.
+ */
if (enable_vpid)
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |
VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
else
vmx->nested.nested_vmx_vpid_caps = 0;
@@ -5016,8 +5024,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx->vcpu.arch.cr0 = cr0;
+ vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx_set_cr4(vcpu, 0);
vmx_set_efer(vcpu, 0);
vmx_fpu_activate(vcpu);
@@ -5295,7 +5303,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (is_machine_check(intr_info))
return handle_machine_check(vcpu);
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
+ if (is_nmi(intr_info))
return 1; /* already handled by vmx_vcpu_run() */
if (is_no_device(intr_info)) {
@@ -6652,7 +6660,13 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
/* Checks for #GP/#SS exceptions. */
exn = false;
- if (is_protmode(vcpu)) {
+ if (is_long_mode(vcpu)) {
+ /* Long mode: #GP(0)/#SS(0) if the memory address is in a
+ * non-canonical form. This is the only check on the memory
+ * destination for long mode!
+ */
+ exn = is_noncanonical_address(*ret);
+ } else if (is_protmode(vcpu)) {
/* Protected mode: apply checks for segment validity in the
* following order:
* - segment type check (#GP(0) may be thrown)
@@ -6669,17 +6683,10 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
* execute-only code segment
*/
exn = ((s.type & 0xa) == 8);
- }
- if (exn) {
- kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
- return 1;
- }
- if (is_long_mode(vcpu)) {
- /* Long mode: #GP(0)/#SS(0) if the memory address is in a
- * non-canonical form. This is an only check for long mode.
- */
- exn = is_noncanonical_address(*ret);
- } else if (is_protmode(vcpu)) {
+ if (exn) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
*/
exn = (s.unusable != 0);
@@ -7420,6 +7427,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7478,6 +7486,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7494,12 +7503,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
}
switch (type) {
+ case VMX_VPID_EXTENT_SINGLE_CONTEXT:
+ /*
+ * Old versions of KVM use the single-context version so we
+ * have to support it; just treat it the same as all-context.
+ */
case VMX_VPID_EXTENT_ALL_CONTEXT:
__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
nested_vmx_succeed(vcpu);
break;
default:
- /* Trap single context invalidation invvpid calls */
+ /* Trap individual address invalidation invvpid calls */
BUG_ON(1);
break;
}
@@ -7795,7 +7809,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
switch (exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
- if (!is_exception(intr_info))
+ if (is_nmi(intr_info))
return false;
else if (is_page_fault(intr_info))
return enable_ept;
@@ -8200,6 +8214,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
exit_reason != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason != EXIT_REASON_PML_FULL &&
exit_reason != EXIT_REASON_TASK_SWITCH)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
@@ -8259,6 +8274,12 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
{
u32 sec_exec_control;
+ /* Postpone execution until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true;
+ return;
+ }
+
/*
* There is no point in enabling virtualize x2apic without enabling
* apicv
@@ -8397,8 +8418,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
- if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK)) {
+ if (is_nmi(exit_intr_info)) {
kvm_before_handle_nmi(&vmx->vcpu);
asm("int $2");
kvm_after_handle_nmi(&vmx->vcpu);
@@ -8834,6 +8854,22 @@ static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
put_cpu();
}
+/*
+ * Ensure that the current vmcs of the logical processor is the
+ * vmcs01 of the vcpu before calling free_nested().
+ */
+static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int r;
+
+ r = vcpu_load(vcpu);
+ BUG_ON(r);
+ vmx_load_vmcs01(vcpu);
+ free_nested(vmx);
+ vcpu_put(vcpu);
+}
+
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8842,8 +8878,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
vmx_destroy_pml_buffer(vmx);
free_vpid(vmx->vpid);
leave_guest_mode(vcpu);
- vmx_load_vmcs01(vcpu);
- free_nested(vmx);
+ vmx_free_vcpu_nested(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
@@ -10576,6 +10611,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
PIN_BASED_VMX_PREEMPTION_TIMER);
+ if (vmx->nested.change_vmcs01_virtual_x2apic_mode) {
+ vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
+ vmx_set_virtual_x2apic_mode(vcpu,
+ vcpu->arch.apic_base & X2APIC_ENABLE);
+ }
+
/* This is needed for same reason as it was needed in prepare_vmcs02 */
vmx->host_rsp = 0;
diff --git a/kernel/arch/x86/kvm/x86.c b/kernel/arch/x86/kvm/x86.c
index c7695cea4..c265bb0c2 100644
--- a/kernel/arch/x86/kvm/x86.c
+++ b/kernel/arch/x86/kvm/x86.c
@@ -200,7 +200,18 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
struct kvm_shared_msrs *locals
= container_of(urn, struct kvm_shared_msrs, urn);
struct kvm_shared_msr_values *values;
+ unsigned long flags;
+ /*
+ * Disabling irqs at this point since the following code could be
+ * interrupted and executed through kvm_arch_hardware_disable()
+ */
+ local_irq_save(flags);
+ if (locals->registered) {
+ locals->registered = false;
+ user_return_notifier_unregister(urn);
+ }
+ local_irq_restore(flags);
for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
values = &locals->values[slot];
if (values->host != values->curr) {
@@ -208,8 +219,6 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
values->curr = values->host;
}
}
- locals->registered = false;
- user_return_notifier_unregister(urn);
}
static void shared_msr_update(unsigned slot, u32 msr)
@@ -698,7 +707,6 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
return 1;
}
- kvm_put_guest_xcr0(vcpu);
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
@@ -2947,6 +2955,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
memset(&events->reserved, 0, sizeof(events->reserved));
}
+static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
+
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
@@ -2979,10 +2989,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+ u32 hflags = vcpu->arch.hflags;
if (events->smi.smm)
- vcpu->arch.hflags |= HF_SMM_MASK;
+ hflags |= HF_SMM_MASK;
else
- vcpu->arch.hflags &= ~HF_SMM_MASK;
+ hflags &= ~HF_SMM_MASK;
+ kvm_set_hflags(vcpu, hflags);
+
vcpu->arch.smi_pending = events->smi.pending;
if (events->smi.smm_inside_nmi)
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
@@ -3020,6 +3033,11 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (dbgregs->flags)
return -EINVAL;
+ if (dbgregs->dr6 & ~0xffffffffull)
+ return -EINVAL;
+ if (dbgregs->dr7 & ~0xffffffffull)
+ return -EINVAL;
+
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = dbgregs->dr6;
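
The two new checks mirror hardware behaviour: DR6 and DR7 are 32-bit registers zero-extended to 64 bits, and loading either with any of bits 63:32 set raises #GP, so such values are rejected at the ioctl boundary before they reach vcpu state. The predicate, as a standalone sketch (hypothetical helper name):

    #include <stdbool.h>
    #include <stdint.h>

    /* True iff 'val' could legally be loaded into DR6/DR7:
     * bits 63:32 must be clear. */
    static bool dr_value_valid(uint64_t val)
    {
            return (val & ~UINT64_C(0xffffffff)) == 0;
    }
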
@@ -3045,6 +3063,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
memcpy(dest, xsave, XSAVE_HDR_OFFSET);
/* Set XSTATE_BV */
+ xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
*(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
/*
@@ -3319,6 +3338,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
};
case KVM_SET_VAPIC_ADDR: {
struct kvm_vapic_addr va;
+ int idx;
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
@@ -3326,7 +3346,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&va, argp, sizeof va))
goto out;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
case KVM_X86_SETUP_MCE: {
@@ -5834,6 +5856,7 @@ out:
void kvm_arch_exit(void)
{
+ kvm_lapic_exit();
perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
@@ -6036,12 +6059,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
/* try to inject new event if pending */
- if (vcpu->arch.nmi_pending) {
- if (kvm_x86_ops->nmi_allowed(vcpu)) {
- --vcpu->arch.nmi_pending;
- vcpu->arch.nmi_injected = true;
- kvm_x86_ops->set_nmi(vcpu);
- }
+ if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+ --vcpu->arch.nmi_pending;
+ vcpu->arch.nmi_injected = true;
+ kvm_x86_ops->set_nmi(vcpu);
} else if (kvm_cpu_has_injectable_intr(vcpu)) {
/*
* Because interrupts can be injected asynchronously, we are
@@ -6486,10 +6507,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
/* enable NMI/IRQ window open exits if needed */
- else if (vcpu->arch.nmi_pending)
- kvm_x86_ops->enable_nmi_window(vcpu);
- else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
- kvm_x86_ops->enable_irq_window(vcpu);
+ else {
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
+ }
if (kvm_lapic_enabled(vcpu)) {
update_cr8_intercept(vcpu);
@@ -6507,8 +6530,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->prepare_guest_switch(vcpu);
if (vcpu->fpu_active)
kvm_load_guest_fpu(vcpu);
- kvm_load_guest_xcr0(vcpu);
-
vcpu->mode = IN_GUEST_MODE;
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -6531,6 +6552,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
+ kvm_load_guest_xcr0(vcpu);
+
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
@@ -6580,6 +6603,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
+ kvm_put_guest_xcr0(vcpu);
+
/* Interrupt is enabled by handle_external_intr() */
kvm_x86_ops->handle_external_intr(vcpu);
@@ -7227,7 +7252,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
* and assume host would use all available bits.
* Guest xcr0 would be loaded later.
*/
- kvm_put_guest_xcr0(vcpu);
vcpu->guest_fpu_loaded = 1;
__kernel_fpu_begin();
__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
@@ -7236,8 +7260,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- kvm_put_guest_xcr0(vcpu);
-
if (!vcpu->guest_fpu_loaded) {
vcpu->fpu_counter = 0;
return;
@@ -7262,10 +7284,12 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
+ void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
+
kvmclock_reset(vcpu);
- free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
kvm_x86_ops->vcpu_free(vcpu);
+ free_cpumask_var(wbinvd_dirty_mask);
}
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,